From 9037dba705b0d728a1b31e6a8d76c9d6b1bd7485 Mon Sep 17 00:00:00 2001 From: Jason McCartney Date: Sat, 20 Dec 2025 20:26:45 -0700 Subject: [PATCH 1/5] feat: add short-lived URL generation for converted files - Implemented functionality to generate short-lived URLs for converted files during the upload process. - Updated response structure to include `shortLivedUrl` for both original and converted files, allowing for flexible use cases. - Enhanced error handling to log warnings if short-lived URL generation fails, ensuring fallback to regular URLs. - Added comprehensive tests to verify the presence and correctness of `shortLivedUrl` in upload responses, ensuring robust functionality for file handling. --- .../cortex-file-handler/src/blobHandler.js | 62 +++++++ .../tests/shortLivedUrlConversion.test.js | 93 ++++++++++ lib/fileUtils.js | 168 ++++++++++++------ .../system/entity/tools/sys_tool_editfile.js | 9 +- .../system/entity/tools/sys_tool_readfile.js | 83 ++++++--- .../features/tools/fileCollection.test.js | 154 ++++++++++++++++ tests/unit/core/fileCollection.test.js | 128 +++++++++++++ 7 files changed, 613 insertions(+), 84 deletions(-) diff --git a/helper-apps/cortex-file-handler/src/blobHandler.js b/helper-apps/cortex-file-handler/src/blobHandler.js index eeae97a4..5fa04fe3 100644 --- a/helper-apps/cortex-file-handler/src/blobHandler.js +++ b/helper-apps/cortex-file-handler/src/blobHandler.js @@ -598,11 +598,42 @@ function uploadBlob( ); } + // Generate shortLivedUrl for converted file + let convertedShortLivedUrl = convertedSaveResult.url; // Fallback to regular URL + try { + const storageFactory = StorageFactory.getInstance(); + const primaryProvider = await storageFactory.getAzureProvider(); + if (primaryProvider.generateShortLivedSASToken && primaryProvider.extractBlobNameFromUrl) { + const convertedBlobName = primaryProvider.extractBlobNameFromUrl(convertedSaveResult.url); + if (convertedBlobName) { + const { containerClient } = await primaryProvider.getBlobClient(); + const convertedShortLivedSasToken = primaryProvider.generateShortLivedSASToken( + containerClient, + convertedBlobName, + 5 + ); + const convertedUrlObj = new URL(convertedSaveResult.url); + const convertedBaseUrl = `${convertedUrlObj.protocol}//${convertedUrlObj.host}${convertedUrlObj.pathname}`; + convertedShortLivedUrl = `${convertedBaseUrl}?${convertedShortLivedSasToken}`; + context.log("Generated shortLivedUrl for converted file (busboy)"); + } + } + } catch (error) { + context.log(`Warning: Could not generate shortLivedUrl for converted file: ${error.message}`); + // Fallback to regular URL + } + // Attach to response body result.converted = { url: convertedSaveResult.url, + shortLivedUrl: convertedShortLivedUrl, gcs: convertedGcsUrl, }; + + // Note: result.shortLivedUrl remains pointing to the original file + // result.converted.shortLivedUrl points to the converted file + // Both are available for different use cases + context.log( "Conversion process (busboy) completed successfully", ); @@ -895,11 +926,42 @@ async function uploadFile( context.log("Converted file saved to GCS"); } + // Generate shortLivedUrl for converted file + let convertedShortLivedUrl = convertedSaveResult.url; // Fallback to regular URL + try { + const storageFactory = StorageFactory.getInstance(); + const primaryProvider = await storageFactory.getAzureProvider(); + if (primaryProvider.generateShortLivedSASToken && primaryProvider.extractBlobNameFromUrl) { + const convertedBlobName = primaryProvider.extractBlobNameFromUrl(convertedSaveResult.url); + if (convertedBlobName) { + const { containerClient } = await primaryProvider.getBlobClient(); + const convertedShortLivedSasToken = primaryProvider.generateShortLivedSASToken( + containerClient, + convertedBlobName, + 5 + ); + const convertedUrlObj = new URL(convertedSaveResult.url); + const convertedBaseUrl = `${convertedUrlObj.protocol}//${convertedUrlObj.host}${convertedUrlObj.pathname}`; + convertedShortLivedUrl = `${convertedBaseUrl}?${convertedShortLivedSasToken}`; + context.log("Generated shortLivedUrl for converted file"); + } + } + } catch (error) { + context.log(`Warning: Could not generate shortLivedUrl for converted file: ${error.message}`); + // Fallback to regular URL + } + // Add converted file info to result result.converted = { url: convertedSaveResult.url, + shortLivedUrl: convertedShortLivedUrl, gcs: convertedGcsUrl, }; + + // Note: result.shortLivedUrl remains pointing to the original file + // result.converted.shortLivedUrl points to the converted file + // Both are available for different use cases + context.log("Conversion process completed successfully"); } } catch (error) { diff --git a/helper-apps/cortex-file-handler/tests/shortLivedUrlConversion.test.js b/helper-apps/cortex-file-handler/tests/shortLivedUrlConversion.test.js index 1e8c2cb2..24dbb890 100644 --- a/helper-apps/cortex-file-handler/tests/shortLivedUrlConversion.test.js +++ b/helper-apps/cortex-file-handler/tests/shortLivedUrlConversion.test.js @@ -162,6 +162,99 @@ test.serial( }, ); +/** + * Test that initial upload response includes shortLivedUrl for converted files + */ +test.serial( + "upload should generate shortLivedUrl for converted file in initial response", + async (t) => { + // Create a minimal XLSX workbook in-memory + const workbook = XLSX.utils.book_new(); + const worksheet = XLSX.utils.aoa_to_sheet([ + ["Product", "Price"], + ["Apple", 1.50], + ["Banana", 0.75], + ]); + XLSX.utils.book_append_sheet(workbook, worksheet, "Products"); + + // Write it to a temp file + const filePath = path.join(t.context.testDir, `${uuidv4()}.xlsx`); + XLSX.writeFile(workbook, filePath); + + const hash = `test-upload-converted-${uuidv4()}`; + let uploadedUrl; + let convertedUrl; + + try { + // Upload the XLSX file (conversion should run automatically) + const uploadResponse = await uploadFile(filePath, null, hash); + t.is(uploadResponse.status, 200, "Upload should succeed"); + t.truthy(uploadResponse.data.converted, "Upload response must contain converted info"); + t.truthy(uploadResponse.data.converted.url, "Converted URL should be present"); + + uploadedUrl = uploadResponse.data.url; + convertedUrl = uploadResponse.data.converted.url; + + // Verify the original file is .xlsx and converted file is .csv + t.is(getFileExtensionFromUrl(uploadedUrl), '.xlsx', "Original URL should point to .xlsx file"); + t.is(getFileExtensionFromUrl(convertedUrl), '.csv', "Converted URL should point to .csv file"); + + // CRITICAL TEST: Verify converted.shortLivedUrl is present in upload response + t.truthy( + uploadResponse.data.converted.shortLivedUrl, + "Upload response should include converted.shortLivedUrl" + ); + + // Verify converted.shortLivedUrl points to the converted file (.csv) + const convertedShortLivedBase = uploadResponse.data.converted.shortLivedUrl.split('?')[0]; + const convertedUrlBase = convertedUrl.split('?')[0]; + t.is( + convertedShortLivedBase, + convertedUrlBase, + "converted.shortLivedUrl should be based on converted file URL (.csv)" + ); + + // Verify converted.shortLivedUrl points to CSV, not XLSX + t.true( + uploadResponse.data.converted.shortLivedUrl.includes('.csv'), + "converted.shortLivedUrl should point to .csv file" + ); + t.false( + uploadResponse.data.converted.shortLivedUrl.includes('.xlsx'), + "converted.shortLivedUrl should NOT contain .xlsx extension" + ); + + // Verify main shortLivedUrl points to original file (for reference) + // The converted file's shortLivedUrl is in result.converted.shortLivedUrl + if (uploadResponse.data.shortLivedUrl) { + const mainShortLivedBase = uploadResponse.data.shortLivedUrl.split('?')[0]; + const originalUrlBase = uploadedUrl.split('?')[0]; + t.is( + mainShortLivedBase, + originalUrlBase, + "Main shortLivedUrl should point to original file (.xlsx)" + ); + t.true( + uploadResponse.data.shortLivedUrl.includes('.xlsx'), + "Main shortLivedUrl should point to .xlsx file (original)" + ); + t.false( + uploadResponse.data.shortLivedUrl.includes('.csv'), + "Main shortLivedUrl should NOT point to .csv file (that's in converted.shortLivedUrl)" + ); + } + + } finally { + // Clean up + fs.unlinkSync(filePath); + await cleanupHashAndFile(hash, uploadedUrl, baseUrl); + if (convertedUrl) { + await cleanupHashAndFile(null, convertedUrl, baseUrl); + } + } + }, +); + /** * Test that short-lived URLs fallback to original files when no conversion exists */ diff --git a/lib/fileUtils.js b/lib/fileUtils.js index b3657263..f4ee8892 100644 --- a/lib/fileUtils.js +++ b/lib/fileUtils.js @@ -13,6 +13,7 @@ import path from 'path'; import FormData from 'form-data'; import xxhash from 'xxhash-wasm'; import mime from 'mime-types'; +import mimeDb from 'mime-db'; import { encrypt, decrypt } from './crypto.js'; const pipeline = promisify(stream.pipeline); @@ -1007,11 +1008,16 @@ async function addFileToCollection(contextId, contextKey, url, gcs, filename, ta throw new Error("url or fileUrl is required"); } - // Determine MIME type from URL (preferring converted URL if available) - const mimeType = determineMimeTypeFromUrl(finalUrl, finalGcs, filename); + // Determine MIME type from URL (the actual stored content, which may be converted) + // E.g., if user uploaded foo.docx but it was converted to foo.md, MIME type should be text/markdown + const mimeType = determineMimeTypeFromUrl(finalUrl, finalGcs, null); - // Ensure filename has correct extension based on MIME type - const correctedFilename = ensureFilenameExtension(filename, mimeType); + // IMPORTANT: Keep the original user-provided filename as displayFilename + // Do NOT "correct" the extension based on MIME type + // The user's original filename (e.g., "foo.docx") should be preserved even if the + // stored content is a converted format (e.g., "foo.md") + // This allows users to recognize their files by original name while tools + // use the actual URL to determine content type for operations // If no hash, generate one from URL for storage key (needed for Redis hash map) const storageHash = finalHash || await computeBufferHash(Buffer.from(finalUrl)); @@ -1021,8 +1027,8 @@ async function addFileToCollection(contextId, contextKey, url, gcs, filename, ta id: `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, url: finalUrl, gcs: finalGcs || null, - displayFilename: correctedFilename, // Store user-provided filename as displayFilename (filename is managed by CFH) - mimeType: mimeType, + displayFilename: filename, // Keep original user-provided filename as displayFilename (NOT corrected by MIME type) + mimeType: mimeType, // MIME type from actual URL content (may differ from displayFilename extension) tags: Array.isArray(tags) ? tags : [], notes: notes || '', hash: storageHash, // Use storageHash (actual hash or generated from URL) @@ -1058,10 +1064,11 @@ async function addFileToCollection(contextId, contextKey, url, gcs, filename, ta url: finalUrl, // Use new URL (guaranteed to be truthy at this point) gcs: finalGcs || existingData.gcs || null, // Use new GCS if provided, otherwise keep existing // Preserve CFH's filename (managed by CFH), store user-provided filename as displayFilename - displayFilename: correctedFilename, // Store user-provided filename as displayFilename + // Keep original user-provided filename (do NOT correct based on MIME type) + displayFilename: filename, tags: fileEntry.tags.length > 0 ? fileEntry.tags : (existingData.tags || []), // Merge tags if new ones provided notes: fileEntry.notes || existingData.notes || '', // Keep existing notes if new ones empty - mimeType: fileEntry.mimeType || existingData.mimeType || null, + mimeType: fileEntry.mimeType || existingData.mimeType || null, // MIME type from URL (actual content type) inCollection: ['*'], // Mark as global chat file (available to all chats) addedDate: existingData.addedDate || fileEntry.addedDate, // Keep earliest addedDate lastAccessed: new Date().toISOString(), // Always update lastAccessed @@ -1194,6 +1201,33 @@ function determineMimeTypeFromUrl(url, gcs = null, filename = null) { return 'application/octet-stream'; } +/** + * Get the actual content MIME type from a file object. + * This determines the MIME type from the actual stored content (URL), not the displayFilename. + * + * Use this for operations that need to know the actual content type (e.g., reading, editing), + * not for display purposes where displayFilename should be used. + * + * Example: A file with displayFilename="report.docx" but url="...report.md" + * will return "text/markdown" because that's what the actual content is. + * + * @param {Object} file - File object with url, gcs, and optionally mimeType fields + * @returns {string} MIME type of the actual content + */ +function getActualContentMimeType(file) { + if (!file) { + return 'application/octet-stream'; + } + + // If mimeType is already stored and valid, use it (it was computed from URL at add time) + if (file.mimeType && file.mimeType !== 'application/octet-stream') { + return file.mimeType; + } + + // Determine MIME type from URL (the actual stored content) + // Do NOT use displayFilename as it may have a different extension (e.g., docx for an md file) + return determineMimeTypeFromUrl(file.url, file.gcs, null); +} /** * Sync files from chat history to file collection @@ -1263,7 +1297,9 @@ async function syncFilesToCollection(chatHistory, contextId, contextKey = null) // CFH already has this file - merge CFH data with Cortex metadata // Only set Cortex-managed fields (tags, notes, id, dates), preserve all CFH data // Ensure mimeType is set (CFH doesn't store it, so we need to determine it) - const mimeType = existingData.mimeType || determineMimeTypeFromUrl(existingData.url, existingData.gcs, existingData.displayFilename); + // IMPORTANT: Determine MIME type from URL (actual content), not displayFilename + // displayFilename may have original extension (e.g., .docx) while URL points to converted content (e.g., .md) + const mimeType = existingData.mimeType || determineMimeTypeFromUrl(existingData.url, existingData.gcs, null); const fileData = { ...existingData, // Preserve all CFH data (url, gcs, filename, displayFilename, permanent, etc.) @@ -1827,14 +1863,14 @@ async function checkHashExists(hash, fileHandlerUrl, pathwayResolver = null, con // If file exists (200), return URLs with short-lived URL preferred if (checkResponse.status === 200 && checkResponse.data && checkResponse.data.url) { const data = checkResponse.data; - // shortLivedUrl automatically prefers converted URL if it exists - // Use shortLivedUrl if available, otherwise fall back to regular URL - // For GCS, always use the GCS URL from checkHash (no short-lived for GCS) - const url = data.shortLivedUrl || data.converted?.url || data.url; + // Prefer converted file's shortLivedUrl if available (that's what we want to send to LLMs) + // Otherwise use converted URL, then original shortLivedUrl, then original URL + // For GCS, always use the GCS URL from checkHash (prefers converted, no short-lived for GCS) + const url = data.converted?.shortLivedUrl || data.converted?.url || data.shortLivedUrl || data.url; const gcs = data.converted?.gcs || data.gcs || null; return { - url: url, // shortLivedUrl if available (prefers converted), otherwise regular URL + url: url, // Prefers converted.shortLivedUrl (for LLM processing), then converted.url, then original shortLivedUrl/url gcs: gcs, // GCS URL (prefers converted, no short-lived version for GCS) hash: data.hash || hash, filename: data.filename || null // Include filename from response @@ -1884,13 +1920,13 @@ async function ensureShortLivedUrl(fileObject, fileHandlerUrl, contextId = null, try { const resolved = await checkHashExists(fileObject.hash, fileHandlerUrl, null, contextId, shortLivedMinutes); if (resolved && resolved.url) { - // Return file object with url replaced by shortLivedUrl (or fallback to regular url) - // GCS URL comes from checkHash (no short-lived version for GCS) + // Return file object with url replaced by shortLivedUrl (prefers converted.shortLivedUrl if available) + // GCS URL comes from checkHash (prefers converted, no short-lived version for GCS) // Preserve filename from original, but use resolved filename if original doesn't have one return { ...fileObject, - url: resolved.url, // shortLivedUrl (or fallback) - gcs: resolved.gcs || fileObject.gcs || null, // GCS from checkHash + url: resolved.url, // Prefers converted.shortLivedUrl (for LLM processing), then converted.url, then original shortLivedUrl/url + gcs: resolved.gcs || fileObject.gcs || null, // GCS from checkHash (prefers converted) filename: fileObject.filename || resolved.filename || fileObject.filename // Preserve original, fallback to resolved }; } @@ -2097,11 +2133,18 @@ async function uploadFileToCloud(fileInput, mimeType = null, filename = null, pa }); if (uploadResponse.data && uploadResponse.data.url) { + const data = uploadResponse.data; + // Prefer converted file's shortLivedUrl if available (that's what we want to send to LLMs) + // Otherwise use converted URL, then original shortLivedUrl, then original URL + // For GCS, prefer converted GCS URL if available + const url = data.converted?.shortLivedUrl || data.converted?.url || data.shortLivedUrl || data.url; + const gcs = data.converted?.gcs || data.gcs || null; + // Return both url and gcs if available return { - url: uploadResponse.data.url, - gcs: uploadResponse.data.gcs || null, - hash: uploadResponse.data.hash || fileHash + url: url, // Prefers converted.shortLivedUrl (for LLM processing), then converted.url, then original shortLivedUrl/url + gcs: gcs, // GCS URL (prefers converted if available) + hash: data.hash || fileHash }; } else { throw new Error('No URL returned from file handler'); @@ -2179,12 +2222,12 @@ async function resolveFileHashesToContent(fileHashes, config, contextId = null) const existingFile = await checkHashExists(hash, fileHandlerUrl, null, contextId, 5); if (existingFile) { // checkHashExists already returns: - // - shortLivedUrl (prefers converted) in url field + // - converted.shortLivedUrl (preferred) or converted.url or original shortLivedUrl/url in url field // - GCS URL (prefers converted) in gcs field // - filename in filename field return JSON.stringify({ type: "image_url", - url: existingFile.url, // Already has shortLivedUrl (prefers converted) + url: existingFile.url, // Prefers converted.shortLivedUrl (for LLM processing) image_url: { url: existingFile.url }, gcs: existingFile.gcs || null, // GCS URL (prefers converted, no short-lived) hash: hash @@ -2259,54 +2302,66 @@ function isTextMimeType(mimeType) { // e.g., "application/json; charset=utf-8" -> "application/json" const baseMimeType = mimeType.split(';')[0].trim().toLowerCase(); - // All text/* types + // 1. All text/* types are text (text/plain, text/html, text/css, text/csv, text/markdown, etc.) if (baseMimeType.startsWith('text/')) { return true; } - // Text-based application types (consistent with sys_tool_writefile and sys_tool_editfile) - const textApplicationTypes = [ - 'application/json', - 'application/javascript', - 'application/x-javascript', - 'application/typescript', + // 2. Structured text formats with +json, +xml, +yaml suffix + // (application/ld+json, application/rss+xml, application/vnd.api+json, etc.) + if (baseMimeType.endsWith('+json') || baseMimeType.endsWith('+xml') || baseMimeType.endsWith('+yaml')) { + return true; + } + + // 3. Check mime-db for charset property - if a MIME type has a default charset, it's text + // This catches application/json, application/javascript, etc. + const dbEntry = mimeDb[baseMimeType]; + if (dbEntry && dbEntry.charset) { + return true; + } + + // 4. Well-known text-based application/* types not in mime-db with charset + // These are common formats that are definitely text but don't have charset in the database + const knownTextTypes = new Set([ 'application/xml', 'application/x-yaml', 'application/yaml', - 'application/x-python', + 'application/toml', + 'application/x-toml', 'application/x-sh', 'application/x-shellscript', - 'application/x-c', - 'application/x-c++', - 'application/x-cpp', - 'application/x-java', - 'application/x-go', - 'application/x-rust', - 'application/x-ruby', - 'application/x-php', + 'application/x-httpd-php', 'application/x-perl', - 'application/x-lua', + 'application/x-python', 'application/x-sql', - 'application/x-toml', - 'application/x-ini', - 'application/x-config', - ]; - - // Also check for common code file patterns in application types - if (baseMimeType.startsWith('application/x-') || baseMimeType.startsWith('application/vnd.')) { - // Check if it's a known text-based subtype - const knownTextSubtypes = [ - 'json', 'javascript', 'typescript', 'xml', 'yaml', 'python', 'sh', 'shellscript', - 'c', 'cpp', 'c++', 'java', 'go', 'rust', 'ruby', 'php', 'perl', 'lua', 'sql', - 'toml', 'ini', 'config', 'csv', 'tsv', 'plain', 'text', 'source', 'script' - ]; - const subtype = baseMimeType.split('/').pop().split('+')[0]; - if (knownTextSubtypes.some(known => subtype.includes(known))) { + 'application/sql', + 'application/graphql', + 'application/x-tex', + 'application/x-latex', + 'application/rtf', + ]); + + if (knownTextTypes.has(baseMimeType)) { + return true; + } + + // 5. Check for source code patterns in application/x-* types + if (baseMimeType.startsWith('application/x-')) { + const subtype = baseMimeType.substring('application/x-'.length); + // Common patterns indicating source code + if (subtype.includes('source') || subtype.includes('script') || + subtype.includes('src') || subtype.includes('code')) { return true; } } - return textApplicationTypes.includes(baseMimeType); + // 6. Check if charset parameter is present in original MIME string + // e.g., "application/octet-stream; charset=utf-8" is likely text + if (mimeType.toLowerCase().includes('charset=')) { + return true; + } + + return false; } export { @@ -2346,6 +2401,7 @@ export { isTextMimeType, isFileInCollection, writeFileDataToRedis, + getActualContentMimeType, // isYoutubeUrl is exported inline above // Exported for testing extractFilenameFromUrl, diff --git a/pathways/system/entity/tools/sys_tool_editfile.js b/pathways/system/entity/tools/sys_tool_editfile.js index ba171c18..82be8795 100644 --- a/pathways/system/entity/tools/sys_tool_editfile.js +++ b/pathways/system/entity/tools/sys_tool_editfile.js @@ -2,7 +2,7 @@ // Entity tool that modifies existing files by replacing line ranges or exact string matches import logger from '../../../../lib/logger.js'; import { axios } from '../../../../lib/requestExecutor.js'; -import { uploadFileToCloud, findFileInCollection, loadFileCollection, getMimeTypeFromFilename, resolveFileParameter, deleteFileByHash, isTextMimeType, updateFileMetadata, writeFileDataToRedis, invalidateFileCollectionCache } from '../../../../lib/fileUtils.js'; +import { uploadFileToCloud, findFileInCollection, loadFileCollection, getMimeTypeFromFilename, resolveFileParameter, deleteFileByHash, isTextMimeType, updateFileMetadata, writeFileDataToRedis, invalidateFileCollectionCache, getActualContentMimeType } from '../../../../lib/fileUtils.js'; // Maximum file size for editing (50MB) - prevents memory blowup on huge files const MAX_EDITABLE_FILE_SIZE = 50 * 1024 * 1024; @@ -398,10 +398,11 @@ export default { } } - // Determine MIME type from filename using utility function - // Use displayFilename (user-friendly) if available, otherwise fall back to filename (CFH-managed) + // Determine MIME type from actual stored content (URL), not displayFilename + // displayFilename may have a different extension than the actual content + // (e.g., displayFilename="report.docx" but content is markdown after conversion) const filename = currentFile.displayFilename || currentFile.filename || 'modified.txt'; - let mimeType = getMimeTypeFromFilename(filename, 'text/plain'); + let mimeType = getActualContentMimeType(currentFile) || getMimeTypeFromFilename(filename, 'text/plain'); // Add charset=utf-8 for text-based MIME types if (isTextMimeType(mimeType)) { diff --git a/pathways/system/entity/tools/sys_tool_readfile.js b/pathways/system/entity/tools/sys_tool_readfile.js index ed8cce58..64759afd 100644 --- a/pathways/system/entity/tools/sys_tool_readfile.js +++ b/pathways/system/entity/tools/sys_tool_readfile.js @@ -4,8 +4,53 @@ import logger from '../../../../lib/logger.js'; import { axios } from '../../../../lib/requestExecutor.js'; import { resolveFileParameter, getMimeTypeFromFilename, isTextMimeType } from '../../../../lib/fileUtils.js'; +// Code/text file extensions that mime-types doesn't recognize or misidentifies +// The mime-types library lacks coverage for many programming languages +// See: https://github.com/jshttp/mime-types (uses mime-db which is IANA-focused) +// +// We only list extensions where mime.lookup() returns false or a wrong type +// (e.g., .ts -> video/mp2t instead of TypeScript) +const KNOWN_TEXT_EXTENSIONS = new Set([ + // TypeScript (mime-types returns video/mp2t for .ts!) + 'ts', 'tsx', 'mts', 'cts', + // Languages not in mime-db + 'py', 'pyw', 'pyi', // Python + 'go', // Go + 'rs', // Rust + 'rb', 'rake', 'gemspec', // Ruby + 'swift', // Swift + 'kt', 'kts', // Kotlin + 'scala', 'sbt', // Scala + 'clj', 'cljs', 'cljc', 'edn',// Clojure + 'elm', // Elm + 'ex', 'exs', // Elixir + 'erl', 'hrl', // Erlang + 'hs', 'lhs', // Haskell + 'ml', 'mli', // OCaml + 'fs', 'fsi', 'fsx', // F# + 'r', 'rmd', // R + 'jl', // Julia + 'nim', // Nim + 'zig', // Zig + 'v', // V + 'cr', // Crystal + // Modern web frameworks + 'vue', 'svelte', 'astro', + // Shell variants (only bash/zsh/fish not recognized) + 'bash', 'zsh', 'fish', + // Config files (dotfiles without extension) + 'env', 'envrc', 'editorconfig', 'gitignore', 'dockerignore', + // Data/config formats not in mime-db + 'prisma', 'graphql', 'gql', + 'tf', 'tfvars', 'hcl', // Terraform + 'proto', // Protocol Buffers + 'diff', 'patch', +]); + /** * Check if a file is a text file type that can be read + * Uses MIME type detection via the mime-types library, with fallback for + * common code file extensions that the library doesn't recognize * @param {string} url - File URL or path * @returns {boolean} - Returns true if it's a text file, false otherwise */ @@ -16,39 +61,29 @@ function isTextFile(url) { // Extract filename from URL (remove query string and fragment) const urlPath = url.split('?')[0].split('#')[0]; + const extension = urlPath.split('.').pop()?.toLowerCase(); - // Use existing MIME utility to get MIME type + // Use MIME library to get MIME type from filename/extension const mimeType = getMimeTypeFromFilename(urlPath); - // If we have a valid MIME type (not unknown/octet-stream), check it + // If MIME library returns a valid type, check it (but watch for misidentifications) if (mimeType && mimeType !== 'application/octet-stream') { + // Handle known misidentifications from mime-types library + // .ts -> video/mp2t (wrong, it's TypeScript) + // .rs -> application/rls-services+xml (wrong, it's Rust) + if (extension && KNOWN_TEXT_EXTENSIONS.has(extension)) { + return true; // Trust our extension list over wrong MIME type + } return isTextMimeType(mimeType); } - // Fallback: Check file extension for common text/code file types - // This handles cases where MIME type detection fails - const extension = urlPath.split('.').pop()?.toLowerCase(); - if (extension) { - const textExtensions = [ - // Code files - 'json', 'js', 'jsx', 'ts', 'tsx', 'py', 'go', 'rs', 'java', 'c', 'cpp', 'cc', 'cxx', 'h', 'hpp', 'hxx', - 'rb', 'php', 'pl', 'lua', 'sql', 'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd', - // Config/data files - 'yaml', 'yml', 'toml', 'ini', 'cfg', 'conf', 'config', 'xml', 'csv', 'tsv', - // Markup/documentation - 'md', 'markdown', 'txt', 'text', 'log', 'diff', 'patch', - // Web - 'html', 'htm', 'css', 'scss', 'sass', 'less', 'vue', 'svelte', - // Other text formats - 'rtf', 'rtfd', 'tex', 'latex', 'bib', 'r', 'm', 'swift', 'kt', 'scala', 'clj', 'cljs', 'edn', - 'dart', 'elm', 'ex', 'exs', 'fs', 'fsx', 'ml', 'mli', 'hs', 'lhs', 'vim', 'vimrc' - ]; - if (textExtensions.includes(extension)) { - return true; - } + // Fallback: Check our list of known text/code extensions + // This handles cases where the MIME library doesn't recognize the extension + if (extension && KNOWN_TEXT_EXTENSIONS.has(extension)) { + return true; } - // Unknown type, reject to be safe + // For truly unknown extensions, we can't reliably determine if it's text return false; } diff --git a/tests/integration/features/tools/fileCollection.test.js b/tests/integration/features/tools/fileCollection.test.js index fe2bef03..6f2f2f54 100644 --- a/tests/integration/features/tools/fileCollection.test.js +++ b/tests/integration/features/tools/fileCollection.test.js @@ -2055,3 +2055,157 @@ test('Analyzer tool: Returns error JSON format when file not found', async t => await cleanup(contextId); } }); + +// ============================================ +// Converted Files Tests (displayFilename != URL extension) +// ============================================ + +test('Converted files: displayFilename .docx but URL .md - MIME type from URL', async t => { + const contextId = createTestContext(); + + try { + // Add a file where displayFilename is .docx but URL points to converted .md file + // This simulates the case where a docx file was converted to markdown + const addResult = await callPathway('sys_tool_file_collection', { + contextId, + url: 'https://example.com/converted-document.md', // Converted to markdown + gcs: 'gs://bucket/converted-document.md', + filename: 'original-document.docx', // Original filename preserved + userMessage: 'Add converted file' + }); + + const addParsed = JSON.parse(addResult); + t.is(addParsed.success, true); + + // Verify file was stored correctly + const collection = await loadFileCollection(contextId, null, false); + t.is(collection.length, 1); + + const file = collection[0]; + // displayFilename should preserve original user filename + t.is(file.displayFilename, 'original-document.docx', 'displayFilename should preserve original filename'); + // mimeType should be determined from URL (actual content), not displayFilename + t.is(file.mimeType, 'text/markdown', 'mimeType should be from URL (.md), not displayFilename (.docx)'); + t.is(file.url, 'https://example.com/converted-document.md', 'URL should point to converted file'); + } finally { + await cleanup(contextId); + } +}); + +test('Converted files: EditFile should use URL MIME type, not displayFilename', async t => { + const contextId = createTestContext(); + + try { + // Add a converted file: displayFilename is .docx but URL is .md + const addResult = await callPathway('sys_tool_file_collection', { + contextId, + url: 'https://example.com/report.md', // Converted markdown + gcs: 'gs://bucket/report.md', + filename: 'report.docx', // Original filename + userMessage: 'Add converted file' + }); + + const addParsed = JSON.parse(addResult); + t.is(addParsed.success, true); + + // Verify the file has correct MIME type from URL + const collection = await loadFileCollection(contextId, null, false); + const file = collection[0]; + t.is(file.mimeType, 'text/markdown', 'MIME type should be from URL'); + t.is(file.displayFilename, 'report.docx', 'displayFilename should be original'); + + // Note: We can't actually test EditFile here without a real file handler, + // but we can verify the file is set up correctly for editing + // The EditFile tool should use getActualContentMimeType() which uses URL + const { getActualContentMimeType } = await import('../../../../lib/fileUtils.js'); + const actualMimeType = getActualContentMimeType(file); + t.is(actualMimeType, 'text/markdown', 'getActualContentMimeType should return URL MIME type'); + } finally { + await cleanup(contextId); + } +}); + +test('Converted files: ReadFile should accept text files based on URL, not displayFilename', async t => { + const contextId = createTestContext(); + + try { + // Add a converted file: displayFilename is .docx but URL is .md + const addResult = await callPathway('sys_tool_file_collection', { + contextId, + url: 'https://example.com/document.md', // Converted markdown (text file) + gcs: 'gs://bucket/document.md', + filename: 'document.docx', // Original filename (would be binary if checked) + userMessage: 'Add converted text file' + }); + + const addParsed = JSON.parse(addResult); + t.is(addParsed.success, true); + + // Verify file setup + const collection = await loadFileCollection(contextId, null, false); + const file = collection[0]; + t.is(file.displayFilename, 'document.docx'); + t.is(file.mimeType, 'text/markdown'); + + // ReadFile should use resolveFileParameter which returns the URL + // The URL (.md) should be recognized as text, not the displayFilename (.docx) + const { resolveFileParameter } = await import('../../../../lib/fileUtils.js'); + const resolvedUrl = await resolveFileParameter('document.docx', contextId); + t.is(resolvedUrl, 'https://example.com/document.md', 'Should resolve to URL'); + + // The isTextFile function in ReadFile should check the URL, not displayFilename + // Since the URL is .md, it should be recognized as text + const { getMimeTypeFromFilename, isTextMimeType } = await import('../../../../lib/fileUtils.js'); + const urlMimeType = getMimeTypeFromFilename(resolvedUrl); + t.is(urlMimeType, 'text/markdown', 'URL should be recognized as markdown'); + t.true(isTextMimeType(urlMimeType), 'Markdown should be recognized as text type'); + } finally { + await cleanup(contextId); + } +}); + +test('Converted files: Multiple converted files with different extensions', async t => { + const contextId = createTestContext(); + + try { + // Add multiple converted files + await callPathway('sys_tool_file_collection', { + contextId, + url: 'https://example.com/doc1.md', // docx -> md + filename: 'document1.docx', + userMessage: 'Add docx->md' + }); + + await callPathway('sys_tool_file_collection', { + contextId, + url: 'https://example.com/doc2.txt', // xlsx -> txt (CSV) + filename: 'spreadsheet.xlsx', + userMessage: 'Add xlsx->txt' + }); + + await callPathway('sys_tool_file_collection', { + contextId, + url: 'https://example.com/doc3.json', // pptx -> json (structured data) + filename: 'presentation.pptx', + userMessage: 'Add pptx->json' + }); + + // Verify all files have correct MIME types from URLs + const collection = await loadFileCollection(contextId, null, false); + t.is(collection.length, 3); + + const doc1 = collection.find(f => f.displayFilename === 'document1.docx'); + t.truthy(doc1); + t.is(doc1.mimeType, 'text/markdown', 'docx->md should have markdown MIME type'); + + const doc2 = collection.find(f => f.displayFilename === 'spreadsheet.xlsx'); + t.truthy(doc2); + t.is(doc2.mimeType, 'text/plain', 'xlsx->txt should have text/plain MIME type'); + + const doc3 = collection.find(f => f.displayFilename === 'presentation.pptx'); + t.truthy(doc3); + t.is(doc3.mimeType, 'application/json', 'pptx->json should have JSON MIME type'); + } finally { + await cleanup(contextId); + } +}); diff --git a/tests/unit/core/fileCollection.test.js b/tests/unit/core/fileCollection.test.js index 19c2eefd..cd476df2 100644 --- a/tests/unit/core/fileCollection.test.js +++ b/tests/unit/core/fileCollection.test.js @@ -411,3 +411,131 @@ test('isTextMimeType should identify text MIME types', async t => { t.false(isTextMimeType('')); }); +// Test converted files: displayFilename has different MIME type than URL +test('determineMimeTypeFromUrl should use URL extension, not displayFilename', async t => { + const { determineMimeTypeFromUrl } = await import('../../../lib/fileUtils.js'); + + // Simulate converted file: displayFilename is .docx but URL is .md + const url = 'https://example.com/converted-file.md'; + const gcs = 'gs://bucket/converted-file.md'; + const displayFilename = 'original-document.docx'; + + // MIME type should be determined from URL (.md), not displayFilename (.docx) + const mimeType = determineMimeTypeFromUrl(url, gcs, null); + t.is(mimeType, 'text/markdown', 'Should use URL extension (.md) for MIME type'); + + // Even if displayFilename is provided, URL takes precedence + const mimeType2 = determineMimeTypeFromUrl(url, gcs, displayFilename); + t.is(mimeType2, 'text/markdown', 'Should still use URL extension even with displayFilename'); +}); + +test('getActualContentMimeType should use URL, not displayFilename', async t => { + const { getActualContentMimeType } = await import('../../../lib/fileUtils.js'); + + // Simulate converted file: displayFilename is .docx but URL is .md + const file = { + url: 'https://example.com/converted-file.md', + gcs: 'gs://bucket/converted-file.md', + displayFilename: 'original-document.docx', + mimeType: null // Not set yet + }; + + const mimeType = getActualContentMimeType(file); + t.is(mimeType, 'text/markdown', 'Should determine MIME type from URL, not displayFilename'); + + // If mimeType is already set (from URL), use it + const fileWithMimeType = { + ...file, + mimeType: 'text/markdown' + }; + const mimeType2 = getActualContentMimeType(fileWithMimeType); + t.is(mimeType2, 'text/markdown', 'Should use stored mimeType if available'); +}); + +test('addFileToCollection should preserve original displayFilename for converted files', async t => { + const { addFileToCollection } = await import('../../../lib/fileUtils.js'); + + // Simulate adding a file where URL points to converted content (.md) + // but user wants to keep original filename (.docx) + const contextId = `test-converted-${Date.now()}`; + const url = 'https://example.com/converted-file.md'; // Converted to markdown + const gcs = 'gs://bucket/converted-file.md'; + const originalFilename = 'original-document.docx'; // User's original filename + + try { + const fileEntry = await addFileToCollection( + contextId, + null, + url, + gcs, + originalFilename, // This should be preserved as displayFilename + [], + '', + null, + null, + null, + false + ); + + // displayFilename should be the original user-provided filename + t.is(fileEntry.displayFilename, 'original-document.docx', 'displayFilename should preserve original filename'); + + // mimeType should be determined from URL (actual content) + t.is(fileEntry.mimeType, 'text/markdown', 'mimeType should be from URL, not displayFilename'); + + // Verify it was saved correctly + const { loadFileCollection } = await import('../../../lib/fileUtils.js'); + const collection = await loadFileCollection(contextId, null, false); + t.is(collection.length, 1); + t.is(collection[0].displayFilename, 'original-document.docx'); + t.is(collection[0].mimeType, 'text/markdown'); + t.is(collection[0].url, url); + } finally { + // Cleanup + const { getRedisClient } = await import('../../../lib/fileUtils.js'); + const redisClient = await getRedisClient(); + if (redisClient) { + await redisClient.del(`FileStoreMap:ctx:${contextId}`); + } + } +}); + +test('syncFilesToCollection should determine MIME type from URL, not displayFilename', async t => { + const { syncFilesToCollection } = await import('../../../lib/fileUtils.js'); + + // Simulate chat history with converted file + const contextId = `test-sync-converted-${Date.now()}`; + const chatHistory = [ + { + role: 'user', + content: [ + { + type: 'file', + url: 'https://example.com/converted-report.md', // Converted to markdown + gcs: 'gs://bucket/converted-report.md', + hash: 'hash123' + } + ] + } + ]; + + try { + await syncFilesToCollection(chatHistory, contextId, null); + + const { loadFileCollection } = await import('../../../lib/fileUtils.js'); + const collection = await loadFileCollection(contextId, null, false); + t.is(collection.length, 1); + + // MIME type should be from URL (.md), not from any displayFilename + t.is(collection[0].mimeType, 'text/markdown', 'MIME type should be determined from URL'); + t.is(collection[0].url, 'https://example.com/converted-report.md'); + } finally { + // Cleanup + const { getRedisClient } = await import('../../../lib/fileUtils.js'); + const redisClient = await getRedisClient(); + if (redisClient) { + await redisClient.del(`FileStoreMap:ctx:${contextId}`); + } + } +}); + From ed4d656af4d435e3629bb3548fc95bdc82aa0e6f Mon Sep 17 00:00:00 2001 From: Jason McCartney Date: Sat, 20 Dec 2025 20:27:25 -0700 Subject: [PATCH 2/5] chore: update version to 2.8.0 in package.json and package-lock.json --- helper-apps/cortex-file-handler/package-lock.json | 4 ++-- helper-apps/cortex-file-handler/package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/helper-apps/cortex-file-handler/package-lock.json b/helper-apps/cortex-file-handler/package-lock.json index ac4a2558..646374d2 100644 --- a/helper-apps/cortex-file-handler/package-lock.json +++ b/helper-apps/cortex-file-handler/package-lock.json @@ -1,12 +1,12 @@ { "name": "@aj-archipelago/cortex-file-handler", - "version": "2.7.0", + "version": "2.8.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@aj-archipelago/cortex-file-handler", - "version": "2.7.0", + "version": "2.8.0", "dependencies": { "@azure/storage-blob": "^12.13.0", "@distube/ytdl-core": "^4.14.3", diff --git a/helper-apps/cortex-file-handler/package.json b/helper-apps/cortex-file-handler/package.json index 39ce23de..88a26399 100644 --- a/helper-apps/cortex-file-handler/package.json +++ b/helper-apps/cortex-file-handler/package.json @@ -1,6 +1,6 @@ { "name": "@aj-archipelago/cortex-file-handler", - "version": "2.7.0", + "version": "2.8.0", "description": "File handling service for Cortex - handles file uploads, media chunking, and document processing", "type": "module", "main": "src/index.js", From f075838ba934642e48b1b02d5cfc9a3f46d6d9e9 Mon Sep 17 00:00:00 2001 From: Jason McCartney Date: Sat, 20 Dec 2025 21:43:03 -0700 Subject: [PATCH 3/5] feat: enhance MIME type handling for converted files - Added functionality to determine and attach the MIME type of converted files based on their URLs during the upload process. - Updated response structure to include `mimeType` for both original and converted files, ensuring accurate content type representation. - Improved error handling for URL parsing to ensure robust MIME type extraction, enhancing overall file handling capabilities. --- .../cortex-file-handler/src/blobHandler.js | 32 ++++ helper-apps/cortex-file-handler/src/index.js | 173 ++++++++++-------- helper-apps/cortex-file-handler/src/redis.js | 9 + .../src/services/storage/StorageService.js | 3 +- 4 files changed, 142 insertions(+), 75 deletions(-) diff --git a/helper-apps/cortex-file-handler/src/blobHandler.js b/helper-apps/cortex-file-handler/src/blobHandler.js index 5fa04fe3..0d9f2032 100644 --- a/helper-apps/cortex-file-handler/src/blobHandler.js +++ b/helper-apps/cortex-file-handler/src/blobHandler.js @@ -623,11 +623,27 @@ function uploadBlob( // Fallback to regular URL } + // Determine MIME type of converted file from its URL + let convertedMimeType = 'application/octet-stream'; + try { + const convertedUrlObj = new URL(convertedSaveResult.url); + const convertedPathname = convertedUrlObj.pathname; + const convertedExtension = path.extname(convertedPathname); + convertedMimeType = mime.lookup(convertedExtension) || 'application/octet-stream'; + } catch (e) { + // If URL parsing fails, try to extract extension from URL string + const urlMatch = convertedSaveResult.url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/); + if (urlMatch) { + convertedMimeType = mime.lookup(urlMatch[1]) || 'application/octet-stream'; + } + } + // Attach to response body result.converted = { url: convertedSaveResult.url, shortLivedUrl: convertedShortLivedUrl, gcs: convertedGcsUrl, + mimeType: convertedMimeType, }; // Note: result.shortLivedUrl remains pointing to the original file @@ -951,11 +967,27 @@ async function uploadFile( // Fallback to regular URL } + // Determine MIME type of converted file from its URL + let convertedMimeType = 'application/octet-stream'; + try { + const convertedUrlObj = new URL(convertedSaveResult.url); + const convertedPathname = convertedUrlObj.pathname; + const convertedExtension = path.extname(convertedPathname); + convertedMimeType = mime.lookup(convertedExtension) || 'application/octet-stream'; + } catch (e) { + // If URL parsing fails, try to extract extension from URL string + const urlMatch = convertedSaveResult.url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/); + if (urlMatch) { + convertedMimeType = mime.lookup(urlMatch[1]) || 'application/octet-stream'; + } + } + // Add converted file info to result result.converted = { url: convertedSaveResult.url, shortLivedUrl: convertedShortLivedUrl, gcs: convertedGcsUrl, + mimeType: convertedMimeType, }; // Note: result.shortLivedUrl remains pointing to the original file diff --git a/helper-apps/cortex-file-handler/src/index.js b/helper-apps/cortex-file-handler/src/index.js index d00db8d5..443280fc 100644 --- a/helper-apps/cortex-file-handler/src/index.js +++ b/helper-apps/cortex-file-handler/src/index.js @@ -2,6 +2,7 @@ import fs from "fs"; import os from "os"; import path from "path"; import { v4 as uuidv4 } from "uuid"; +import mime from "mime-types"; import { DOC_EXTENSIONS, AZURITE_ACCOUNT_NAME } from "./constants.js"; import { easyChunker } from "./docHelper.js"; @@ -526,93 +527,117 @@ async function CortexFileHandler(context, req) { context.log(`Error ensuring converted version: ${error}`); } - // Attach converted info to response if present - if (hashResult.converted) { - response.converted = { - url: hashResult.converted.url, - gcs: hashResult.converted.gcs, - }; - } - - // Always generate short-lived URL for checkHash operations - // Use converted URL if available, otherwise use original URL - const urlForShortLived = hashResult.converted?.url || hashResult.url; - try { - // Extract blob name from the URL to generate new SAS token - // Container parameter is ignored - always uses default container from env var - let blobName; + // Add mimeType to converted block if it exists but doesn't have mimeType yet + if (hashResult.converted && !hashResult.converted.mimeType) { + let convertedMimeType = 'application/octet-stream'; try { - const url = new URL(urlForShortLived); - // Extract blob name from the URL path (remove leading slash) - let path = url.pathname.substring(1); - - // For Azurite URLs, the path includes account name: devstoreaccount1/container/blob - // For real Azure URLs, the path is: container/blob - // Check if this is an Azurite URL (contains devstoreaccount1) - if (path.startsWith(`${AZURITE_ACCOUNT_NAME}/`)) { - path = path.substring(`${AZURITE_ACCOUNT_NAME}/`.length); // Remove account prefix - } - - // Extract blob name from path (skip container name, always use default container) - const pathSegments = path.split('/').filter(segment => segment.length > 0); - if (pathSegments.length >= 2) { - // Skip container name (first segment), get blob name (remaining segments) - blobName = pathSegments.slice(1).join('/'); - } else if (pathSegments.length === 1) { - // Fallback: assume it's just the blob name in default container - blobName = pathSegments[0]; + const convertedUrlObj = new URL(hashResult.converted.url); + const convertedPathname = convertedUrlObj.pathname; + const convertedExtension = path.extname(convertedPathname); + convertedMimeType = mime.lookup(convertedExtension) || 'application/octet-stream'; + } catch (e) { + // If URL parsing fails, try to extract extension from URL string + const urlMatch = hashResult.converted.url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/); + if (urlMatch) { + convertedMimeType = mime.lookup(urlMatch[1]) || 'application/octet-stream'; } - - } catch (urlError) { - context.log(`Error parsing URL for short-lived generation: ${urlError}`); } + hashResult.converted.mimeType = convertedMimeType; + } - // Generate short-lived SAS token - // Container parameter is ignored - always uses default container from env var - if (blobName) { - const provider = storageService.primaryProvider; - - if (provider && provider.generateShortLivedSASToken) { - const blobClientResult = await provider.getBlobClient(); - const containerClient = blobClientResult.containerClient; - - const sasToken = provider.generateShortLivedSASToken( - containerClient, - blobName, - shortLivedDuration - ); + // Generate short-lived URLs for both original and converted files (if converted exists) + // Helper function to generate short-lived URL for a given URL + const generateShortLivedUrlForUrl = async (urlToProcess) => { + if (!urlToProcess) return null; + + try { + // Extract blob name from the URL to generate new SAS token + let blobName; + try { + const url = new URL(urlToProcess); + let path = url.pathname.substring(1); - // Construct new URL with short-lived SAS token - const baseUrl = urlForShortLived.split('?')[0]; // Remove existing SAS token - const shortLivedUrl = `${baseUrl}?${sasToken}`; + // For Azurite URLs, the path includes account name: devstoreaccount1/container/blob + // For real Azure URLs, the path is: container/blob + if (path.startsWith(`${AZURITE_ACCOUNT_NAME}/`)) { + path = path.substring(`${AZURITE_ACCOUNT_NAME}/`.length); + } - // Add short-lived URL to response - response.shortLivedUrl = shortLivedUrl; - response.expiresInMinutes = shortLivedDuration; + const pathSegments = path.split('/').filter(segment => segment.length > 0); + if (pathSegments.length >= 2) { + blobName = pathSegments.slice(1).join('/'); + } else if (pathSegments.length === 1) { + blobName = pathSegments[0]; + } + } catch (urlError) { + context.log(`Error parsing URL for short-lived generation: ${urlError}`); + return null; + } + + if (blobName) { + const provider = storageService.primaryProvider; - const urlType = hashResult.converted?.url ? 'converted' : 'original'; - context.log(`Generated short-lived URL for hash: ${hash} using ${urlType} URL (expires in ${shortLivedDuration} minutes)`); - } else { - // Fallback for storage providers that don't support short-lived tokens - response.shortLivedUrl = urlForShortLived; - response.expiresInMinutes = shortLivedDuration; - const urlType = hashResult.converted?.url ? 'converted' : 'original'; - context.log(`Storage provider doesn't support short-lived tokens, using ${urlType} URL`); + if (provider && provider.generateShortLivedSASToken) { + const blobClientResult = await provider.getBlobClient(); + const containerClient = blobClientResult.containerClient; + + const sasToken = provider.generateShortLivedSASToken( + containerClient, + blobName, + shortLivedDuration + ); + + const baseUrl = urlToProcess.split('?')[0]; + return `${baseUrl}?${sasToken}`; + } } - } else { - // If we couldn't extract blob name, use original URL - response.shortLivedUrl = urlForShortLived; - response.expiresInMinutes = shortLivedDuration; - context.log(`Could not extract blob name from URL, using original URL for short-lived`); + } catch (error) { + context.log(`Error generating short-lived URL: ${error}`); } - } catch (error) { - context.log(`Error generating short-lived URL: ${error}`); - // Provide fallback even on error + + return null; + }; + + // Generate short-lived URLs for response (not stored in Redis) + // Generate short-lived URL for converted file if it exists + let convertedShortLivedUrl = null; + if (hashResult.converted?.url) { + convertedShortLivedUrl = await generateShortLivedUrlForUrl(hashResult.converted.url); + if (!convertedShortLivedUrl) { + // Fallback to regular URL + convertedShortLivedUrl = hashResult.converted.url; + } + context.log(`Generated shortLivedUrl for converted file`); + } + + // Generate short-lived URL for original file (for main response) + const urlForShortLived = hashResult.converted?.url || hashResult.url; + const mainShortLivedUrl = await generateShortLivedUrlForUrl(urlForShortLived); + if (mainShortLivedUrl) { + response.shortLivedUrl = mainShortLivedUrl; + response.expiresInMinutes = shortLivedDuration; + const urlType = hashResult.converted?.url ? 'converted' : 'original'; + context.log(`Generated short-lived URL for hash: ${hash} using ${urlType} URL (expires in ${shortLivedDuration} minutes)`); + } else { + // Fallback for storage providers that don't support short-lived tokens response.shortLivedUrl = urlForShortLived; response.expiresInMinutes = shortLivedDuration; + const urlType = hashResult.converted?.url ? 'converted' : 'original'; + context.log(`Storage provider doesn't support short-lived tokens, using ${urlType} URL`); } - //update redis timestamp with current time + // Attach converted info to response if present (include shortLivedUrl in response only) + if (hashResult.converted) { + response.converted = { + url: hashResult.converted.url, + shortLivedUrl: convertedShortLivedUrl || hashResult.converted.url, + gcs: hashResult.converted.gcs, + mimeType: hashResult.converted.mimeType || null, + }; + } + + // Update redis timestamp with current time + // Note: setFileStoreMap will remove shortLivedUrl fields before storing await setFileStoreMap(hash, hashResult, resolvedContextId); context.res = { diff --git a/helper-apps/cortex-file-handler/src/redis.js b/helper-apps/cortex-file-handler/src/redis.js index f2d6796b..114b8493 100644 --- a/helper-apps/cortex-file-handler/src/redis.js +++ b/helper-apps/cortex-file-handler/src/redis.js @@ -238,6 +238,15 @@ const setFileStoreMap = async (hash, value, contextId = null) => { // Remove 'message' field - it's only for the upload response, not for persistence delete valueToStore.message; + // Remove shortLivedUrl fields - they're only for responses, not for persistence + // Store only permanent URLs (url, gcs, converted.url, converted.gcs) + delete valueToStore.shortLivedUrl; + if (valueToStore.converted) { + const convertedCopy = { ...valueToStore.converted }; + delete convertedCopy.shortLivedUrl; + valueToStore.converted = convertedCopy; + } + // Only set timestamp if one doesn't already exist if (!valueToStore.timestamp) { valueToStore.timestamp = new Date().toISOString(); diff --git a/helper-apps/cortex-file-handler/src/services/storage/StorageService.js b/helper-apps/cortex-file-handler/src/services/storage/StorageService.js index 7edb1d16..ddb4f56c 100644 --- a/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +++ b/helper-apps/cortex-file-handler/src/services/storage/StorageService.js @@ -350,7 +350,8 @@ export class StorageService { convertedResult = { url: hashResult.converted.url, shortLivedUrl: convertedShortLivedUrl, - gcs: hashResult.converted.gcs + gcs: hashResult.converted.gcs, + mimeType: hashResult.converted.mimeType || null }; } catch (error) { context.log?.(`Warning: Failed to update converted file tag: ${error.message}`); From be9e1aa9c1bbbefdc353392f3bed6b58acbce391 Mon Sep 17 00:00:00 2001 From: Jason McCartney Date: Sat, 20 Dec 2025 22:13:31 -0700 Subject: [PATCH 4/5] feat: enhance file handling for converted files - Updated `parseRawFileData` to prioritize converted values (URL, GCS, mimeType) when available, ensuring accurate representation of processed content. - Implemented checks to prevent editing of converted files, providing clear error messages to users attempting to modify them. - Added tests to verify that file collection correctly uses converted values and enforces editing restrictions on converted files, enhancing overall file management capabilities. --- lib/fileUtils.js | 38 ++++---- .../system/entity/tools/sys_tool_editfile.js | 11 +++ .../features/tools/fileCollection.test.js | 86 +++++++++++++++++++ tests/unit/core/shortLivedUrl.test.js | 8 +- 4 files changed, 117 insertions(+), 26 deletions(-) diff --git a/lib/fileUtils.js b/lib/fileUtils.js index f4ee8892..0e17d769 100644 --- a/lib/fileUtils.js +++ b/lib/fileUtils.js @@ -670,19 +670,28 @@ function parseRawFileData(allFiles, contextKey = null) { return null; } + // Use converted URL, GCS, and mimeType if converted block exists + // This ensures we use the converted file (the actual processable content) as the primary values + // Keep displayFilename as the original (e.g., "foo.docx" even if URL is "foo.md") + const url = decryptedData.converted?.url || decryptedData.url; + const gcs = decryptedData.converted?.gcs || decryptedData.gcs || null; + const mimeType = decryptedData.converted?.mimeType || decryptedData.mimeType || null; + // Return parsed file data with hash and inCollection preserved for filtering return { id: decryptedData.id || `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`, - url: decryptedData.url, - gcs: decryptedData.gcs || null, + url: url, // Use converted URL if available + gcs: gcs, // Use converted GCS if available displayFilename: decryptedData.displayFilename || decryptedData.filename || null, - mimeType: decryptedData.mimeType || null, + mimeType: mimeType, // Use converted mimeType if available tags: decryptedData.tags || [], notes: decryptedData.notes || '', hash: hash, permanent: decryptedData.permanent || false, addedDate: decryptedData.addedDate || decryptedData.timestamp || new Date().toISOString(), lastAccessed: decryptedData.lastAccessed || decryptedData.timestamp || new Date().toISOString(), + // Mark as converted if converted block exists (for edit prevention) + ...(decryptedData.converted && { _isConverted: true }), // Preserve inCollection for filtering inCollection: decryptedData.inCollection }; @@ -1614,15 +1623,6 @@ function findFileInCollection(fileParam, collection) { return file; } } - - // Also check by CFH-managed filename (case-insensitive, using basename) - if (file.filename) { - const normalizedFilename = file.filename.toLowerCase(); - const fileFilename = path.basename(normalizedFilename); - if (fileFilename === paramFilename) { - return file; - } - } } // If no exact match, try simple "contains" matches on displayFilename, filename, url, and gcs @@ -1637,14 +1637,6 @@ function findFileInCollection(fileParam, collection) { } } - // Check if CFH-managed filename contains the parameter - if (file.filename) { - const normalizedFilename = file.filename.toLowerCase(); - if (normalizedFilename.includes(normalizedParam)) { - return file; - } - } - // Check if URL contains the parameter if (file.url) { const normalizedUrl = file.url.toLowerCase(); @@ -1871,7 +1863,7 @@ async function checkHashExists(hash, fileHandlerUrl, pathwayResolver = null, con return { url: url, // Prefers converted.shortLivedUrl (for LLM processing), then converted.url, then original shortLivedUrl/url - gcs: gcs, // GCS URL (prefers converted, no short-lived version for GCS) + gcs: gcs, // GCS URL (prefers converted if available, no short-lived version for GCS) hash: data.hash || hash, filename: data.filename || null // Include filename from response }; @@ -1926,7 +1918,7 @@ async function ensureShortLivedUrl(fileObject, fileHandlerUrl, contextId = null, return { ...fileObject, url: resolved.url, // Prefers converted.shortLivedUrl (for LLM processing), then converted.url, then original shortLivedUrl/url - gcs: resolved.gcs || fileObject.gcs || null, // GCS from checkHash (prefers converted) + gcs: resolved.gcs || fileObject.gcs || null, // GCS from checkHash (prefers converted if available) filename: fileObject.filename || resolved.filename || fileObject.filename // Preserve original, fallback to resolved }; } @@ -2223,7 +2215,7 @@ async function resolveFileHashesToContent(fileHashes, config, contextId = null) if (existingFile) { // checkHashExists already returns: // - converted.shortLivedUrl (preferred) or converted.url or original shortLivedUrl/url in url field - // - GCS URL (prefers converted) in gcs field + // - GCS URL (prefers converted if available) in gcs field // - filename in filename field return JSON.stringify({ type: "image_url", diff --git a/pathways/system/entity/tools/sys_tool_editfile.js b/pathways/system/entity/tools/sys_tool_editfile.js index 82be8795..067d3104 100644 --- a/pathways/system/entity/tools/sys_tool_editfile.js +++ b/pathways/system/entity/tools/sys_tool_editfile.js @@ -255,6 +255,16 @@ export default { return JSON.stringify(errorResult); } + // Prevent editing converted files (they can be read but not edited) + if (foundFile._isConverted) { + const errorResult = { + success: false, + error: `Cannot edit converted files. The file "${foundFile.displayFilename || file}" is a converted version and cannot be edited. You can read it using ReadTextFile, but to edit it you would need to edit the original file.` + }; + resolver.tool = JSON.stringify({ toolUsed: toolName }); + return JSON.stringify(errorResult); + } + const fileId = foundFile.id; // Serialize edits to this file (prevents concurrent edits on same instance) @@ -283,6 +293,7 @@ export default { return { jsonResult: JSON.stringify(errorResult) }; } + // Use the file URL (already uses converted URL if it exists) const fileUrl = currentFile.url; if (!fileUrl) { diff --git a/tests/integration/features/tools/fileCollection.test.js b/tests/integration/features/tools/fileCollection.test.js index 6f2f2f54..f813cc1a 100644 --- a/tests/integration/features/tools/fileCollection.test.js +++ b/tests/integration/features/tools/fileCollection.test.js @@ -2209,3 +2209,89 @@ test('Converted files: Multiple converted files with different extensions', asyn await cleanup(contextId); } }); + +test('Converted files: loadFileCollection should use converted values as primary (no converted block in response)', async t => { + const contextId = createTestContext(); + + try { + // Write a file with converted block directly to Redis (simulating CFH behavior) + // The converted block exists in Redis, but is not exposed in the file collection response + const { getRedisClient, writeFileDataToRedis } = await import('../../../../lib/fileUtils.js'); + const redisClient = await getRedisClient(); + + if (!redisClient) { + t.skip('Redis not available'); + return; + } + + const hash = 'test-converted-hash-123'; + const fileData = { + id: 'test-file-id', + url: 'https://example.com/original.docx', + gcs: 'gs://bucket/original.docx', + filename: 'original.docx', + displayFilename: 'original.docx', + mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + hash: hash, + permanent: false, + timestamp: new Date().toISOString(), + // Include converted block in Redis (as CFH would write it) + // This will be used to set primary values, but the block itself is not exposed + converted: { + url: 'https://example.com/converted.md', + gcs: 'gs://bucket/converted.md', + mimeType: 'text/markdown' + }, + inCollection: ['*'] + }; + + const contextMapKey = `FileStoreMap:ctx:${contextId}`; + await writeFileDataToRedis(redisClient, contextMapKey, hash, fileData, null); + + // Load the collection + const collection = await loadFileCollection(contextId, null, false); + t.is(collection.length, 1); + + const file = collection[0]; + + // CRITICAL: Main URL, GCS, and mimeType should use converted values as primary + // The converted block is NOT included in the response - only the _isConverted flag + t.is(file.url, 'https://example.com/converted.md', 'Main URL should be converted URL'); + t.is(file.gcs, 'gs://bucket/converted.md', 'Main GCS should be converted GCS'); + t.is(file.mimeType, 'text/markdown', 'Main mimeType should be converted mimeType'); + t.is(file.displayFilename, 'original.docx', 'displayFilename should preserve original filename'); + + // Verify converted block is NOT included in file collection response + t.falsy(file.converted, 'File should NOT have converted block in collection response'); + t.truthy(file._isConverted, 'File should be marked as converted'); + + // Verify we can match by displayFilename (original filename) + const { findFileInCollection } = await import('../../../../lib/fileUtils.js'); + const matchedByDisplayFilename = findFileInCollection('original.docx', collection); + t.truthy(matchedByDisplayFilename, 'Should match file by original displayFilename'); + + // Verify resolveFileParameter returns converted URL (now the main URL) + const { resolveFileParameter } = await import('../../../../lib/fileUtils.js'); + const resolvedUrl = await resolveFileParameter('original.docx', contextId); + t.is(resolvedUrl, 'https://example.com/converted.md', 'Should resolve to converted URL (now main URL)'); + + // Verify converted files can be read (text type) + const { isTextMimeType } = await import('../../../../lib/fileUtils.js'); + t.true(isTextMimeType(file.mimeType), 'Converted file should be recognized as text type'); + + // Verify converted files cannot be edited + const editResult = await callPathway('sys_tool_editfile', { + contextId, + file: 'original.docx', + startLine: 1, + endLine: 1, + content: 'test', + userMessage: 'Try to edit converted file' + }); + const editParsed = JSON.parse(editResult); + t.is(editParsed.success, false, 'Should not allow editing converted files'); + t.true(editParsed.error.includes('converted') || editParsed.error.includes('Cannot edit'), 'Error should mention converted files cannot be edited'); + } finally { + await cleanup(contextId); + } +}); diff --git a/tests/unit/core/shortLivedUrl.test.js b/tests/unit/core/shortLivedUrl.test.js index 9bbe9f7c..9cff7a0d 100644 --- a/tests/unit/core/shortLivedUrl.test.js +++ b/tests/unit/core/shortLivedUrl.test.js @@ -74,9 +74,10 @@ test('checkHashExists should prefer converted URL in shortLivedUrl', async t => status: 200, data: { url: 'https://storage.example.com/file.xlsx?sv=2023-11-03&se=2025-01-01T00:00:00Z&sig=long-lived', - shortLivedUrl: 'https://storage.example.com/file.csv?sv=2023-11-03&se=2024-01-01T10:15:00Z&sig=short-lived', + shortLivedUrl: 'https://storage.example.com/file.xlsx?sv=2023-11-03&se=2024-01-01T10:15:00Z&sig=short-lived', converted: { url: 'https://storage.example.com/file.csv?sv=2023-11-03&se=2025-01-01T00:00:00Z&sig=long-lived', + shortLivedUrl: 'https://storage.example.com/file.csv?sv=2023-11-03&se=2024-01-01T10:15:00Z&sig=short-lived', gcs: 'gs://bucket/file.csv' }, gcs: 'gs://bucket/file.xlsx', @@ -89,8 +90,9 @@ test('checkHashExists should prefer converted URL in shortLivedUrl', async t => const result = await checkHashExists(hash, fileHandlerUrl); t.truthy(result); - // shortLivedUrl should be based on converted file - t.is(result.url, mockResponse.data.shortLivedUrl, 'Should use shortLivedUrl (which prefers converted)'); + // Should prefer converted.shortLivedUrl first, then converted.url, then original shortLivedUrl/url + // Since converted.shortLivedUrl exists, it should be used + t.is(result.url, mockResponse.data.converted.shortLivedUrl, 'Should prefer converted.shortLivedUrl'); // GCS should prefer converted t.is(result.gcs, mockResponse.data.converted.gcs, 'Should prefer converted GCS URL'); }); From 679ba5169d3402aa612e8f00be0c17ef9f0ab372 Mon Sep 17 00:00:00 2001 From: Jason McCartney Date: Sun, 21 Dec 2025 08:41:53 -0700 Subject: [PATCH 5/5] CR feedback --- .../cortex-file-handler/src/blobHandler.js | 142 +++++++++--------- helper-apps/cortex-file-handler/src/index.js | 17 +-- .../tests/setRetention.test.js | 6 +- .../system/entity/tools/sys_tool_editfile.js | 2 +- 4 files changed, 77 insertions(+), 90 deletions(-) diff --git a/helper-apps/cortex-file-handler/src/blobHandler.js b/helper-apps/cortex-file-handler/src/blobHandler.js index 0d9f2032..07becac5 100644 --- a/helper-apps/cortex-file-handler/src/blobHandler.js +++ b/helper-apps/cortex-file-handler/src/blobHandler.js @@ -177,6 +177,64 @@ async function downloadFromGCS(gcsUrl, destinationPath) { } } +/** + * Extracts MIME type from a URL based on file extension + * @param {string} url - The URL to extract MIME type from + * @returns {string} The MIME type or 'application/octet-stream' as fallback + */ +function getMimeTypeFromUrl(url) { + const defaultMimeType = 'application/octet-stream'; + if (!url) return defaultMimeType; + + try { + const urlObj = new URL(url); + const pathname = urlObj.pathname; + const extension = path.extname(pathname); + return mime.lookup(extension) || defaultMimeType; + } catch (e) { + // If URL parsing fails, try to extract extension from URL string + const urlMatch = url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/); + if (urlMatch) { + return mime.lookup(urlMatch[1]) || defaultMimeType; + } + return defaultMimeType; + } +} + +/** + * Generates a short-lived SAS URL for a converted file + * @param {object} context - The request context for logging + * @param {string} convertedUrl - The URL of the converted file + * @param {string} [logSuffix=''] - Optional suffix for log messages + * @returns {Promise} The short-lived URL or the original URL as fallback + */ +async function generateShortLivedUrlForConvertedFile(context, convertedUrl, logSuffix = '') { + let shortLivedUrl = convertedUrl; // Fallback to regular URL + try { + const storageFactory = StorageFactory.getInstance(); + const primaryProvider = await storageFactory.getAzureProvider(); + if (primaryProvider.generateShortLivedSASToken && primaryProvider.extractBlobNameFromUrl) { + const blobName = primaryProvider.extractBlobNameFromUrl(convertedUrl); + if (blobName) { + const { containerClient } = await primaryProvider.getBlobClient(); + const sasToken = primaryProvider.generateShortLivedSASToken( + containerClient, + blobName, + 5 + ); + const urlObj = new URL(convertedUrl); + const baseUrl = `${urlObj.protocol}//${urlObj.host}${urlObj.pathname}`; + shortLivedUrl = `${baseUrl}?${sasToken}`; + context.log(`Generated shortLivedUrl for converted file${logSuffix}`); + } + } + } catch (error) { + context.log(`Warning: Could not generate shortLivedUrl for converted file: ${error.message}`); + // Fallback to regular URL + } + return shortLivedUrl; +} + export const getBlobClient = async () => { const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING; // Always use default container from env var @@ -599,44 +657,14 @@ function uploadBlob( } // Generate shortLivedUrl for converted file - let convertedShortLivedUrl = convertedSaveResult.url; // Fallback to regular URL - try { - const storageFactory = StorageFactory.getInstance(); - const primaryProvider = await storageFactory.getAzureProvider(); - if (primaryProvider.generateShortLivedSASToken && primaryProvider.extractBlobNameFromUrl) { - const convertedBlobName = primaryProvider.extractBlobNameFromUrl(convertedSaveResult.url); - if (convertedBlobName) { - const { containerClient } = await primaryProvider.getBlobClient(); - const convertedShortLivedSasToken = primaryProvider.generateShortLivedSASToken( - containerClient, - convertedBlobName, - 5 - ); - const convertedUrlObj = new URL(convertedSaveResult.url); - const convertedBaseUrl = `${convertedUrlObj.protocol}//${convertedUrlObj.host}${convertedUrlObj.pathname}`; - convertedShortLivedUrl = `${convertedBaseUrl}?${convertedShortLivedSasToken}`; - context.log("Generated shortLivedUrl for converted file (busboy)"); - } - } - } catch (error) { - context.log(`Warning: Could not generate shortLivedUrl for converted file: ${error.message}`); - // Fallback to regular URL - } + const convertedShortLivedUrl = await generateShortLivedUrlForConvertedFile( + context, + convertedSaveResult.url, + ' (busboy)' + ); // Determine MIME type of converted file from its URL - let convertedMimeType = 'application/octet-stream'; - try { - const convertedUrlObj = new URL(convertedSaveResult.url); - const convertedPathname = convertedUrlObj.pathname; - const convertedExtension = path.extname(convertedPathname); - convertedMimeType = mime.lookup(convertedExtension) || 'application/octet-stream'; - } catch (e) { - // If URL parsing fails, try to extract extension from URL string - const urlMatch = convertedSaveResult.url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/); - if (urlMatch) { - convertedMimeType = mime.lookup(urlMatch[1]) || 'application/octet-stream'; - } - } + const convertedMimeType = getMimeTypeFromUrl(convertedSaveResult.url); // Attach to response body result.converted = { @@ -943,44 +971,13 @@ async function uploadFile( } // Generate shortLivedUrl for converted file - let convertedShortLivedUrl = convertedSaveResult.url; // Fallback to regular URL - try { - const storageFactory = StorageFactory.getInstance(); - const primaryProvider = await storageFactory.getAzureProvider(); - if (primaryProvider.generateShortLivedSASToken && primaryProvider.extractBlobNameFromUrl) { - const convertedBlobName = primaryProvider.extractBlobNameFromUrl(convertedSaveResult.url); - if (convertedBlobName) { - const { containerClient } = await primaryProvider.getBlobClient(); - const convertedShortLivedSasToken = primaryProvider.generateShortLivedSASToken( - containerClient, - convertedBlobName, - 5 - ); - const convertedUrlObj = new URL(convertedSaveResult.url); - const convertedBaseUrl = `${convertedUrlObj.protocol}//${convertedUrlObj.host}${convertedUrlObj.pathname}`; - convertedShortLivedUrl = `${convertedBaseUrl}?${convertedShortLivedSasToken}`; - context.log("Generated shortLivedUrl for converted file"); - } - } - } catch (error) { - context.log(`Warning: Could not generate shortLivedUrl for converted file: ${error.message}`); - // Fallback to regular URL - } + const convertedShortLivedUrl = await generateShortLivedUrlForConvertedFile( + context, + convertedSaveResult.url + ); // Determine MIME type of converted file from its URL - let convertedMimeType = 'application/octet-stream'; - try { - const convertedUrlObj = new URL(convertedSaveResult.url); - const convertedPathname = convertedUrlObj.pathname; - const convertedExtension = path.extname(convertedPathname); - convertedMimeType = mime.lookup(convertedExtension) || 'application/octet-stream'; - } catch (e) { - // If URL parsing fails, try to extract extension from URL string - const urlMatch = convertedSaveResult.url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/); - if (urlMatch) { - convertedMimeType = mime.lookup(urlMatch[1]) || 'application/octet-stream'; - } - } + const convertedMimeType = getMimeTypeFromUrl(convertedSaveResult.url); // Add converted file info to result result.converted = { @@ -1282,6 +1279,7 @@ export { gcs, uploadChunkToGCS, downloadFromGCS, + getMimeTypeFromUrl, // Re-export container constants getDefaultContainerName, GCS_BUCKETNAME, diff --git a/helper-apps/cortex-file-handler/src/index.js b/helper-apps/cortex-file-handler/src/index.js index 443280fc..c7c12344 100644 --- a/helper-apps/cortex-file-handler/src/index.js +++ b/helper-apps/cortex-file-handler/src/index.js @@ -18,7 +18,7 @@ import { } from "./redis.js"; import { FileConversionService } from "./services/FileConversionService.js"; import { StorageService } from "./services/storage/StorageService.js"; -import { uploadBlob } from "./blobHandler.js"; +import { uploadBlob, getMimeTypeFromUrl } from "./blobHandler.js"; import { generateShortId } from "./utils/filenameUtils.js"; import { redactContextId, redactSasToken, sanitizeForLogging } from "./utils/logSecurity.js"; @@ -529,20 +529,7 @@ async function CortexFileHandler(context, req) { // Add mimeType to converted block if it exists but doesn't have mimeType yet if (hashResult.converted && !hashResult.converted.mimeType) { - let convertedMimeType = 'application/octet-stream'; - try { - const convertedUrlObj = new URL(hashResult.converted.url); - const convertedPathname = convertedUrlObj.pathname; - const convertedExtension = path.extname(convertedPathname); - convertedMimeType = mime.lookup(convertedExtension) || 'application/octet-stream'; - } catch (e) { - // If URL parsing fails, try to extract extension from URL string - const urlMatch = hashResult.converted.url.match(/\.([a-zA-Z0-9]+)(?:\?|$)/); - if (urlMatch) { - convertedMimeType = mime.lookup(urlMatch[1]) || 'application/octet-stream'; - } - } - hashResult.converted.mimeType = convertedMimeType; + hashResult.converted.mimeType = getMimeTypeFromUrl(hashResult.converted.url); } // Generate short-lived URLs for both original and converted files (if converted exists) diff --git a/helper-apps/cortex-file-handler/tests/setRetention.test.js b/helper-apps/cortex-file-handler/tests/setRetention.test.js index 22cdf418..356017e0 100644 --- a/helper-apps/cortex-file-handler/tests/setRetention.test.js +++ b/helper-apps/cortex-file-handler/tests/setRetention.test.js @@ -313,7 +313,8 @@ test.serial("should update Redis map with retention information", async (t) => { const newEntry = await getFileStoreMap(testHash); t.truthy(newEntry, "Redis entry should still exist after setting retention"); t.is(newEntry.url, retentionResponse.data.url, "Entry should have correct URL"); - t.truthy(newEntry.shortLivedUrl, "Entry should have shortLivedUrl"); + // Note: shortLivedUrl is intentionally NOT stored in Redis (stripped before persistence) + // It's only returned in the response, which is checked above t.is(newEntry.permanent, true, "Entry should have permanent=true in Redis (matches file collection logic)"); } finally { @@ -551,7 +552,8 @@ test.serial("should set retention for context-scoped file", async (t) => { // Verify Redis entry was updated with context-scoped key const updatedEntry = await getFileStoreMap(testHash, false, contextId); t.truthy(updatedEntry, "Should have updated entry in Redis"); - t.truthy(updatedEntry.shortLivedUrl, "Should have shortLivedUrl in Redis entry"); + // Note: shortLivedUrl is intentionally NOT stored in Redis (stripped before persistence) + // It's only returned in the response, which is checked above t.is(updatedEntry.permanent, true, "Entry should have permanent=true in Redis (matches file collection logic)"); // Wait for operations to complete diff --git a/pathways/system/entity/tools/sys_tool_editfile.js b/pathways/system/entity/tools/sys_tool_editfile.js index 067d3104..193dae9c 100644 --- a/pathways/system/entity/tools/sys_tool_editfile.js +++ b/pathways/system/entity/tools/sys_tool_editfile.js @@ -2,7 +2,7 @@ // Entity tool that modifies existing files by replacing line ranges or exact string matches import logger from '../../../../lib/logger.js'; import { axios } from '../../../../lib/requestExecutor.js'; -import { uploadFileToCloud, findFileInCollection, loadFileCollection, getMimeTypeFromFilename, resolveFileParameter, deleteFileByHash, isTextMimeType, updateFileMetadata, writeFileDataToRedis, invalidateFileCollectionCache, getActualContentMimeType } from '../../../../lib/fileUtils.js'; +import { uploadFileToCloud, findFileInCollection, loadFileCollection, getMimeTypeFromFilename, deleteFileByHash, isTextMimeType, updateFileMetadata, writeFileDataToRedis, invalidateFileCollectionCache, getActualContentMimeType } from '../../../../lib/fileUtils.js'; // Maximum file size for editing (50MB) - prevents memory blowup on huge files const MAX_EDITABLE_FILE_SIZE = 50 * 1024 * 1024;