Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions src/service/core/data/data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import base64
import datetime
import json
import mimetypes
import shlex
from typing import Any, Dict, List, Sequence
import uuid
Expand Down Expand Up @@ -988,6 +989,65 @@ def get_info(
versions=rows)


@router.get('/{bucket}/dataset/{name}/manifest')
def get_manifest(
    bucket: objects.DatasetPattern,
    name: objects.DatasetPattern,
    version: str = fastapi.Query(...),
) -> List:
    """ This api returns the manifest for a dataset version. """
    postgres = connectors.PostgresConnector.get_instance()
    dataset_info = get_dataset(postgres, bucket=bucket, name=name)

    # Look up the storage location of the requested version; only READY
    # versions have a usable manifest.
    fetch_command = '''
    SELECT location FROM dataset_version
    WHERE dataset_id = %s AND version_id = %s AND status = %s;
    '''
    version_rows = postgres.execute_fetch_command(
        fetch_command,
        (dataset_info.id, version, objects.DatasetStatus.READY.name),
    )
    if not version_rows:
        raise osmo_errors.OSMODatabaseError(
            f'Version {version} not found for dataset {name} in bucket {bucket}.')

    # Read the manifest object from storage using the bucket's default
    # credential and parse it as JSON.
    bucket_config = postgres.get_dataset_configs().get_bucket_config(bucket)
    storage_client = storage.Client.create(
        storage_uri=version_rows[0].location,
        data_credential=bucket_config.default_credential,
    )
    return json.loads(storage_client.get_object_stream(as_io=True).read())


@router.api_route('/{bucket}/dataset/{name}/file-content', methods=['GET', 'HEAD'])
def get_file_content(
    bucket: objects.DatasetPattern,
    name: objects.DatasetPattern,
    storage_path: str = fastapi.Query(...),
) -> fastapi.responses.StreamingResponse:
    """ This api streams file content from storage for the dataset file preview.

    Raises:
        OSMOUserError: if storage_path does not live under this dataset's
            storage prefix (prevents reading arbitrary objects).
    """
    postgres = connectors.PostgresConnector.get_instance()
    dataset_info = get_dataset(postgres, bucket=bucket, name=name)

    # Validate that the storage path belongs to this dataset's storage prefix.
    # Container equality alone would let a caller read ANY object in the
    # shared container, so additionally require the path to sit under the
    # dataset's hash_location prefix. The trailing '/' on the prefix check
    # prevents sibling-prefix matches (e.g. '/data/abc-evil' vs '/data/abc').
    requested_backend = storage.construct_storage_backend(storage_path)
    dataset_backend = storage.construct_storage_backend(dataset_info.hash_location)
    dataset_prefix = dataset_info.hash_location.rstrip('/')
    if requested_backend.container != dataset_backend.container or \
            not (storage_path == dataset_prefix or
                 storage_path.startswith(dataset_prefix + '/')):
        raise osmo_errors.OSMOUserError(
            'Storage path does not belong to this dataset.')

    bucket_config = postgres.get_dataset_configs().get_bucket_config(bucket)
    client = storage.Client.create(
        storage_uri=storage_path,
        data_credential=bucket_config.default_credential,
    )

    # Guess the content type from the file extension so the browser can
    # preview known types; fall back to a generic binary type otherwise.
    content_type = mimetypes.guess_type(storage_path)[0] or 'application/octet-stream'
    return fastapi.responses.StreamingResponse(
        client.get_object_stream(),
        media_type=content_type,
    )


@router.get('/list_dataset')
def list_dataset_from_bucket(name: objects.DatasetPattern | None = None,
user: List[str] | None = fastapi.Query(default = None),
Expand Down
3 changes: 0 additions & 3 deletions src/ui/next.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,6 @@ const nextConfig: NextConfig = {
// This allows Turbopack aliasing to work (aliases work for imports, not file discovery)
"@/app/api/[...path]/route.impl": "@/app/api/[...path]/route.impl.production",

// Dataset manifest server action - alias to production version (zero mock code)
"@/lib/api/server/dataset-actions": "@/lib/api/server/dataset-actions.production",

// Dataset file proxy route - alias to production version (zero mock code)
"@/app/proxy/dataset/file/route.impl":
"@/app/proxy/dataset/file/route.impl.production",
Expand Down
95 changes: 68 additions & 27 deletions src/ui/src/app/proxy/dataset/file/route.impl.production.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,55 +18,96 @@
* Dataset File Proxy — Production Implementation
*
* Server-side proxy for fetching dataset files from storage URLs.
* Routes requests through the server to avoid CSP restrictions.
*
* GET /proxy/dataset/file?url={encodedFileUrl} → streams file content
* HEAD /proxy/dataset/file?url={encodedFileUrl} → returns headers only
* When bucket/name/storagePath params are present, routes through the backend
* service's file-content endpoint (handles private buckets with credentials).
* Falls back to direct fetch for legacy callers that only provide a url param.
*
* GET /proxy/dataset/file?bucket=...&name=...&storagePath=... → service proxy
* GET /proxy/dataset/file?url={encodedFileUrl} → direct fetch (legacy)
* HEAD variants of the above → headers only
*/

import type { NextRequest } from "next/server";
import { getServerApiBaseUrl } from "@/lib/api/server/config";
import { forwardAuthHeaders } from "@/lib/api/server/proxy-headers";

const FORWARDED_HEADERS = ["content-type", "content-length", "last-modified", "etag", "cache-control"] as const;

function parseAndValidateUrl(request: Request): { url: string } | Response {
/** Copy the allow-listed upstream response headers into a fresh Headers object. */
function forwardResponseHeaders(upstream: Response): Headers {
  const forwarded = new Headers();
  for (const name of FORWARDED_HEADERS) {
    const value = upstream.headers.get(name);
    if (value) {
      forwarded.set(name, value);
    }
  }
  return forwarded;
}

// Parameters for routing through the backend service's file-content
// endpoint (supports private buckets via server-side credentials).
interface ServiceParams {
  bucket: string;
  name: string;
  storagePath: string;
}

// Legacy parameters: a direct, publicly fetchable file URL.
interface LegacyParams {
  url: string;
}

function parseParams(request: Request): ServiceParams | LegacyParams | Response {
const { searchParams } = new URL(request.url);
const url = searchParams.get("url");

if (!url) {
return Response.json({ error: "url parameter is required" }, { status: 400 });
const bucket = searchParams.get("bucket");
const name = searchParams.get("name");
const storagePath = searchParams.get("storagePath");

if (bucket && name && storagePath) {
return { bucket, name, storagePath };
}

// Legacy fallback: direct URL fetch (works for public buckets only)
const url = searchParams.get("url");
if (!url) {
return Response.json({ error: "bucket/name/storagePath or url parameter is required" }, { status: 400 });
}
if (!url.startsWith("http://") && !url.startsWith("https://")) {
return Response.json({ error: "Only http/https URLs are supported" }, { status: 400 });
}

return { url };
}

export async function GET(request: Request) {
const result = parseAndValidateUrl(request);
if (result instanceof Response) return result;

const upstream = await fetch(result.url);
/** Type guard: service-proxy params carry a storagePath; legacy params do not. */
function isServiceParams(params: ServiceParams | LegacyParams): params is ServiceParams {
  return Object.prototype.hasOwnProperty.call(params, "storagePath");
}

const headers = new Headers();
for (const header of FORWARDED_HEADERS) {
const value = upstream.headers.get(header);
if (value) headers.set(header, value);
/**
 * Fetch the file from upstream. Service params route through the backend's
 * file-content endpoint with forwarded auth headers (handles private
 * buckets); legacy params are fetched directly.
 */
async function fetchUpstream(
  request: NextRequest,
  params: ServiceParams | LegacyParams,
  method: string,
): Promise<Response> {
  if (!isServiceParams(params)) {
    return fetch(params.url, { method });
  }

  const backendUrl = getServerApiBaseUrl();
  const query = new URLSearchParams({ storage_path: params.storagePath });
  const serviceUrl =
    `${backendUrl}/api/bucket/${encodeURIComponent(params.bucket)}` +
    `/dataset/${encodeURIComponent(params.name)}/file-content?${query}`;
  return fetch(serviceUrl, { method, headers: forwardAuthHeaders(request) });
}

export async function HEAD(request: Request) {
const result = parseAndValidateUrl(request);
export async function GET(request: NextRequest) {
const result = parseParams(request);
if (result instanceof Response) return result;

const upstream = await fetch(result.url, { method: "HEAD" });
const upstream = await fetchUpstream(request, result, "GET");
const headers = forwardResponseHeaders(upstream);
return new Response(upstream.body, { status: upstream.status, headers });
}

/** HEAD: return upstream status and forwarded headers with no body. */
export async function HEAD(request: NextRequest) {
  const parsed = parseParams(request);
  if (parsed instanceof Response) return parsed;

  const upstream = await fetchUpstream(request, parsed, "HEAD");
  return new Response(null, {
    status: upstream.status,
    headers: forwardResponseHeaders(upstream),
  });
}
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ export function DatasetDetailContent({ bucket, name }: Props) {
const {
versions,
location,
resolvedName,
resolvedVersion,
files: virtualFiles,
memberSubPath,
segmentLabels,
Expand All @@ -184,6 +186,8 @@ export function DatasetDetailContent({ bucket, name }: Props) {
return {
versions: [],
location: null as string | null,
resolvedName: name,
resolvedVersion: "",
files: null as DatasetFile[] | null,
memberSubPath: "",
segmentLabels: {} as Record<string, string>,
Expand All @@ -197,6 +201,8 @@ export function DatasetDetailContent({ bucket, name }: Props) {
return {
versions: detail.versions,
location: currentVersionData?.location ?? null,
resolvedName: name,
resolvedVersion: currentVersionData?.version ?? "",
files: null,
memberSubPath: path,
segmentLabels: {},
Expand All @@ -221,6 +227,8 @@ export function DatasetDetailContent({ bucket, name }: Props) {
return {
versions: [],
location: null,
resolvedName: name,
resolvedVersion: "",
files: memberEntries,
memberSubPath: "",
segmentLabels: labels,
Expand All @@ -234,11 +242,13 @@ export function DatasetDetailContent({ bucket, name }: Props) {
return {
versions: [],
location: member?.location ?? null,
resolvedName: member?.name ?? name,
resolvedVersion: member?.version ?? "",
files: null,
memberSubPath: subPath,
segmentLabels: labels,
};
}, [detail, version, path]);
}, [detail, version, path, name]);

// ==========================================================================
// File listing — fetch manifest for selected version/member
Expand All @@ -249,7 +259,7 @@ export function DatasetDetailContent({ bucket, name }: Props) {
isLoading: isFilesLoading,
error: filesError,
refetch: refetchFiles,
} = useDatasetFiles(location);
} = useDatasetFiles(bucket, resolvedName, resolvedVersion, location);

// Normal (unfiltered) directory listing — used for FilterBar suggestions and as base view
const normalFiles = useMemo(
Expand Down Expand Up @@ -573,6 +583,8 @@ export function DatasetDetailContent({ bucket, name }: Props) {
<FilePreviewPanel
file={panelFileData}
path={fileDirPath}
bucket={bucket}
datasetName={resolvedName}
onClose={handleClosePanel}
/>
)}
Expand Down
Loading