Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions helper-apps/cortex-file-handler/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion helper-apps/cortex-file-handler/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@aj-archipelago/cortex-file-handler",
"version": "2.7.0",
"version": "2.8.0",
"description": "File handling service for Cortex - handles file uploads, media chunking, and document processing",
"type": "module",
"main": "src/index.js",
Expand Down
92 changes: 92 additions & 0 deletions helper-apps/cortex-file-handler/src/blobHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,64 @@ async function downloadFromGCS(gcsUrl, destinationPath) {
}
}

/**
 * Extracts MIME type from a URL based on file extension.
 *
 * Absolute URLs are parsed with `URL` so only the path component (never the
 * query string or fragment) contributes the extension. Input that cannot be
 * parsed as an absolute URL falls back to a pattern match on the raw text.
 *
 * @param {string} url - The URL to extract MIME type from
 * @returns {string} The MIME type or 'application/octet-stream' as fallback
 */
function getMimeTypeFromUrl(url) {
  const defaultMimeType = 'application/octet-stream';
  if (!url) return defaultMimeType;

  // Coerce once up front: the fallback below calls .match(), which would throw
  // a TypeError from inside the catch block for truthy non-string input.
  const urlText = String(url);

  try {
    const { pathname } = new URL(urlText);
    const extension = path.extname(pathname);
    return mime.lookup(extension) || defaultMimeType;
  } catch (e) {
    // URL parsing failed (e.g. relative path): take the first extension that
    // is followed by a query string, a fragment, or the end of the text.
    const urlMatch = urlText.match(/\.([a-zA-Z0-9]+)(?:[?#]|$)/);
    if (urlMatch) {
      return mime.lookup(urlMatch[1]) || defaultMimeType;
    }
    return defaultMimeType;
  }
}

/**
 * Generates a short-lived SAS URL for a converted file.
 *
 * This is best-effort: if the provider lacks the required methods, no blob
 * name can be extracted, or anything throws, the original `convertedUrl` is
 * returned unchanged so callers always receive a usable URL.
 *
 * @param {object} context - The request context for logging
 * @param {string} convertedUrl - The URL of the converted file
 * @param {string} [logSuffix=''] - Optional suffix for log messages
 * @param {number} [expiresInMinutes=5] - Value passed to the provider's
 *   `generateShortLivedSASToken` as its third argument
 * @returns {Promise<string>} The short-lived URL or the original URL as fallback
 */
async function generateShortLivedUrlForConvertedFile(
  context,
  convertedUrl,
  logSuffix = '',
  expiresInMinutes = 5,
) {
  try {
    const storageFactory = StorageFactory.getInstance();
    const primaryProvider = await storageFactory.getAzureProvider();

    // Providers without SAS support fall back to the regular URL.
    if (!primaryProvider.generateShortLivedSASToken || !primaryProvider.extractBlobNameFromUrl) {
      return convertedUrl;
    }

    const blobName = primaryProvider.extractBlobNameFromUrl(convertedUrl);
    if (!blobName) {
      return convertedUrl;
    }

    const { containerClient } = await primaryProvider.getBlobClient();
    const sasToken = primaryProvider.generateShortLivedSASToken(
      containerClient,
      blobName,
      expiresInMinutes,
    );

    // Rebuild from protocol/host/pathname so any existing query string
    // on convertedUrl is dropped before the new token is appended.
    const { protocol, host, pathname } = new URL(convertedUrl);
    const shortLivedUrl = `${protocol}//${host}${pathname}?${sasToken}`;
    context.log(`Generated shortLivedUrl for converted file${logSuffix}`);
    return shortLivedUrl;
  } catch (error) {
    // Include the suffix so failures can be attributed to the calling path,
    // matching the success message above.
    context.log(
      `Warning: Could not generate shortLivedUrl for converted file${logSuffix}: ${error.message}`,
    );
    return convertedUrl;
  }
}

export const getBlobClient = async () => {
const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
// Always use default container from env var
Expand Down Expand Up @@ -598,11 +656,28 @@ function uploadBlob(
);
}

// Generate shortLivedUrl for converted file
const convertedShortLivedUrl = await generateShortLivedUrlForConvertedFile(
context,
convertedSaveResult.url,
' (busboy)'
);

// Determine MIME type of converted file from its URL
const convertedMimeType = getMimeTypeFromUrl(convertedSaveResult.url);

// Attach to response body
result.converted = {
url: convertedSaveResult.url,
shortLivedUrl: convertedShortLivedUrl,
gcs: convertedGcsUrl,
mimeType: convertedMimeType,
};

// Note: result.shortLivedUrl remains pointing to the original file
// result.converted.shortLivedUrl points to the converted file
// Both are available for different use cases

context.log(
"Conversion process (busboy) completed successfully",
);
Expand Down Expand Up @@ -895,11 +970,27 @@ async function uploadFile(
context.log("Converted file saved to GCS");
}

// Generate shortLivedUrl for converted file
const convertedShortLivedUrl = await generateShortLivedUrlForConvertedFile(
context,
convertedSaveResult.url
);

// Determine MIME type of converted file from its URL
const convertedMimeType = getMimeTypeFromUrl(convertedSaveResult.url);

// Add converted file info to result
result.converted = {
url: convertedSaveResult.url,
shortLivedUrl: convertedShortLivedUrl,
gcs: convertedGcsUrl,
mimeType: convertedMimeType,
};

// Note: result.shortLivedUrl remains pointing to the original file
// result.converted.shortLivedUrl points to the converted file
// Both are available for different use cases

context.log("Conversion process completed successfully");
}
} catch (error) {
Expand Down Expand Up @@ -1188,6 +1279,7 @@ export {
gcs,
uploadChunkToGCS,
downloadFromGCS,
getMimeTypeFromUrl,
// Re-export container constants
getDefaultContainerName,
GCS_BUCKETNAME,
Expand Down
164 changes: 88 additions & 76 deletions helper-apps/cortex-file-handler/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import fs from "fs";
import os from "os";
import path from "path";
import { v4 as uuidv4 } from "uuid";
import mime from "mime-types";

import { DOC_EXTENSIONS, AZURITE_ACCOUNT_NAME } from "./constants.js";
import { easyChunker } from "./docHelper.js";
Expand All @@ -17,7 +18,7 @@ import {
} from "./redis.js";
import { FileConversionService } from "./services/FileConversionService.js";
import { StorageService } from "./services/storage/StorageService.js";
import { uploadBlob } from "./blobHandler.js";
import { uploadBlob, getMimeTypeFromUrl } from "./blobHandler.js";
import { generateShortId } from "./utils/filenameUtils.js";
import { redactContextId, redactSasToken, sanitizeForLogging } from "./utils/logSecurity.js";

Expand Down Expand Up @@ -526,93 +527,104 @@ async function CortexFileHandler(context, req) {
context.log(`Error ensuring converted version: ${error}`);
}

// Attach converted info to response if present
if (hashResult.converted) {
response.converted = {
url: hashResult.converted.url,
gcs: hashResult.converted.gcs,
};
// Add mimeType to converted block if it exists but doesn't have mimeType yet
if (hashResult.converted && !hashResult.converted.mimeType) {
hashResult.converted.mimeType = getMimeTypeFromUrl(hashResult.converted.url);
}

// Always generate short-lived URL for checkHash operations
// Use converted URL if available, otherwise use original URL
const urlForShortLived = hashResult.converted?.url || hashResult.url;
try {
// Extract blob name from the URL to generate new SAS token
// Container parameter is ignored - always uses default container from env var
let blobName;
// Generate short-lived URLs for both original and converted files (if converted exists)
// Helper function to generate short-lived URL for a given URL
const generateShortLivedUrlForUrl = async (urlToProcess) => {
if (!urlToProcess) return null;

try {
const url = new URL(urlForShortLived);
// Extract blob name from the URL path (remove leading slash)
let path = url.pathname.substring(1);

// For Azurite URLs, the path includes account name: devstoreaccount1/container/blob
// For real Azure URLs, the path is: container/blob
// Check if this is an Azurite URL (contains devstoreaccount1)
if (path.startsWith(`${AZURITE_ACCOUNT_NAME}/`)) {
path = path.substring(`${AZURITE_ACCOUNT_NAME}/`.length); // Remove account prefix
}

// Extract blob name from path (skip container name, always use default container)
const pathSegments = path.split('/').filter(segment => segment.length > 0);
if (pathSegments.length >= 2) {
// Skip container name (first segment), get blob name (remaining segments)
blobName = pathSegments.slice(1).join('/');
} else if (pathSegments.length === 1) {
// Fallback: assume it's just the blob name in default container
blobName = pathSegments[0];
}

} catch (urlError) {
context.log(`Error parsing URL for short-lived generation: ${urlError}`);
}

// Generate short-lived SAS token
// Container parameter is ignored - always uses default container from env var
if (blobName) {
const provider = storageService.primaryProvider;

if (provider && provider.generateShortLivedSASToken) {
const blobClientResult = await provider.getBlobClient();
const containerClient = blobClientResult.containerClient;

const sasToken = provider.generateShortLivedSASToken(
containerClient,
blobName,
shortLivedDuration
);
// Extract blob name from the URL to generate new SAS token
let blobName;
try {
const url = new URL(urlToProcess);
let path = url.pathname.substring(1);

// Construct new URL with short-lived SAS token
const baseUrl = urlForShortLived.split('?')[0]; // Remove existing SAS token
const shortLivedUrl = `${baseUrl}?${sasToken}`;
// For Azurite URLs, the path includes account name: devstoreaccount1/container/blob
// For real Azure URLs, the path is: container/blob
if (path.startsWith(`${AZURITE_ACCOUNT_NAME}/`)) {
path = path.substring(`${AZURITE_ACCOUNT_NAME}/`.length);
}

// Add short-lived URL to response
response.shortLivedUrl = shortLivedUrl;
response.expiresInMinutes = shortLivedDuration;
const pathSegments = path.split('/').filter(segment => segment.length > 0);
if (pathSegments.length >= 2) {
blobName = pathSegments.slice(1).join('/');
} else if (pathSegments.length === 1) {
blobName = pathSegments[0];
}
} catch (urlError) {
context.log(`Error parsing URL for short-lived generation: ${urlError}`);
return null;
}

if (blobName) {
const provider = storageService.primaryProvider;

const urlType = hashResult.converted?.url ? 'converted' : 'original';
context.log(`Generated short-lived URL for hash: ${hash} using ${urlType} URL (expires in ${shortLivedDuration} minutes)`);
} else {
// Fallback for storage providers that don't support short-lived tokens
response.shortLivedUrl = urlForShortLived;
response.expiresInMinutes = shortLivedDuration;
const urlType = hashResult.converted?.url ? 'converted' : 'original';
context.log(`Storage provider doesn't support short-lived tokens, using ${urlType} URL`);
if (provider && provider.generateShortLivedSASToken) {
const blobClientResult = await provider.getBlobClient();
const containerClient = blobClientResult.containerClient;

const sasToken = provider.generateShortLivedSASToken(
containerClient,
blobName,
shortLivedDuration
);

const baseUrl = urlToProcess.split('?')[0];
return `${baseUrl}?${sasToken}`;
}
}
} else {
// If we couldn't extract blob name, use original URL
response.shortLivedUrl = urlForShortLived;
response.expiresInMinutes = shortLivedDuration;
context.log(`Could not extract blob name from URL, using original URL for short-lived`);
} catch (error) {
context.log(`Error generating short-lived URL: ${error}`);
}
} catch (error) {
context.log(`Error generating short-lived URL: ${error}`);
// Provide fallback even on error

return null;
};

// Generate short-lived URLs for response (not stored in Redis)
// Generate short-lived URL for converted file if it exists
let convertedShortLivedUrl = null;
if (hashResult.converted?.url) {
convertedShortLivedUrl = await generateShortLivedUrlForUrl(hashResult.converted.url);
if (!convertedShortLivedUrl) {
// Fallback to regular URL
convertedShortLivedUrl = hashResult.converted.url;
}
context.log(`Generated shortLivedUrl for converted file`);
}

// Generate short-lived URL for the main response (prefers the converted file's URL, falls back to the original URL)
const urlForShortLived = hashResult.converted?.url || hashResult.url;
const mainShortLivedUrl = await generateShortLivedUrlForUrl(urlForShortLived);
if (mainShortLivedUrl) {
response.shortLivedUrl = mainShortLivedUrl;
response.expiresInMinutes = shortLivedDuration;
const urlType = hashResult.converted?.url ? 'converted' : 'original';
context.log(`Generated short-lived URL for hash: ${hash} using ${urlType} URL (expires in ${shortLivedDuration} minutes)`);
} else {
// Fallback for storage providers that don't support short-lived tokens
response.shortLivedUrl = urlForShortLived;
response.expiresInMinutes = shortLivedDuration;
const urlType = hashResult.converted?.url ? 'converted' : 'original';
context.log(`Storage provider doesn't support short-lived tokens, using ${urlType} URL`);
}

//update redis timestamp with current time
// Attach converted info to response if present (include shortLivedUrl in response only)
if (hashResult.converted) {
response.converted = {
url: hashResult.converted.url,
shortLivedUrl: convertedShortLivedUrl || hashResult.converted.url,
gcs: hashResult.converted.gcs,
mimeType: hashResult.converted.mimeType || null,
};
}

// Update redis timestamp with current time
// Note: setFileStoreMap will remove shortLivedUrl fields before storing
await setFileStoreMap(hash, hashResult, resolvedContextId);

context.res = {
Expand Down
9 changes: 9 additions & 0 deletions helper-apps/cortex-file-handler/src/redis.js
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,15 @@ const setFileStoreMap = async (hash, value, contextId = null) => {
// Remove 'message' field - it's only for the upload response, not for persistence
delete valueToStore.message;

// Remove shortLivedUrl fields - they're only for responses, not for persistence
// Store only permanent URLs (url, gcs, converted.url, converted.gcs)
delete valueToStore.shortLivedUrl;
if (valueToStore.converted) {
const convertedCopy = { ...valueToStore.converted };
delete convertedCopy.shortLivedUrl;
valueToStore.converted = convertedCopy;
}

// Only set timestamp if one doesn't already exist
if (!valueToStore.timestamp) {
valueToStore.timestamp = new Date().toISOString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,8 @@ export class StorageService {
convertedResult = {
url: hashResult.converted.url,
shortLivedUrl: convertedShortLivedUrl,
gcs: hashResult.converted.gcs
gcs: hashResult.converted.gcs,
mimeType: hashResult.converted.mimeType || null
};
} catch (error) {
context.log?.(`Warning: Failed to update converted file tag: ${error.message}`);
Expand Down
6 changes: 4 additions & 2 deletions helper-apps/cortex-file-handler/tests/setRetention.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,8 @@ test.serial("should update Redis map with retention information", async (t) => {
const newEntry = await getFileStoreMap(testHash);
t.truthy(newEntry, "Redis entry should still exist after setting retention");
t.is(newEntry.url, retentionResponse.data.url, "Entry should have correct URL");
t.truthy(newEntry.shortLivedUrl, "Entry should have shortLivedUrl");
// Note: shortLivedUrl is intentionally NOT stored in Redis (stripped before persistence)
// It's only returned in the response, which is checked above
t.is(newEntry.permanent, true, "Entry should have permanent=true in Redis (matches file collection logic)");

} finally {
Expand Down Expand Up @@ -551,7 +552,8 @@ test.serial("should set retention for context-scoped file", async (t) => {
// Verify Redis entry was updated with context-scoped key
const updatedEntry = await getFileStoreMap(testHash, false, contextId);
t.truthy(updatedEntry, "Should have updated entry in Redis");
t.truthy(updatedEntry.shortLivedUrl, "Should have shortLivedUrl in Redis entry");
// Note: shortLivedUrl is intentionally NOT stored in Redis (stripped before persistence)
// It's only returned in the response, which is checked above
t.is(updatedEntry.permanent, true, "Entry should have permanent=true in Redis (matches file collection logic)");

// Wait for operations to complete
Expand Down
Loading