From 6641b7aad8bae165576fe0868e681d39a6a70f6d Mon Sep 17 00:00:00 2001 From: rafalzawadzki Date: Mon, 23 Feb 2026 16:54:59 +0100 Subject: [PATCH 1/2] Add metadata and extract tools from @supadata/js v1.4.0 Expose two new SDK features as MCP tools: supadata_metadata for fetching media metadata from YouTube, TikTok, Instagram, and Twitter; supadata_extract for AI-powered structured data extraction from video content. supadata_extract uses the async job pattern with a corresponding status-check tool (supadata_check_extract_status); supadata_metadata returns its result directly. Enhance status-check tool descriptions and responses with polling hints to guide LLM assistants to continue polling until job completion. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 27 ++++++- package-lock.json | 8 +- package.json | 2 +- src/index.test.ts | 189 +++++++++++++++++++++++++++++++++++++++++++++- src/mcp.ts | 99 ++++++++++++++++++++++-- 5 files changed, 307 insertions(+), 18 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index cff2e0c..d889418 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -This is a Model Context Protocol (MCP) server implementation for Supadata web scraping and video transcript integration. The project provides six main tools: `supadata_transcript`, `supadata_check_transcript_status`, `supadata_scrape`, `supadata_map`, `supadata_crawl`, and `supadata_check_crawl_status` for video transcription, web scraping, URL discovery, and batch crawling operations. +This is a Model Context Protocol (MCP) server implementation for Supadata web scraping and video transcript integration. 
The project provides nine main tools: `supadata_transcript`, `supadata_check_transcript_status`, `supadata_scrape`, `supadata_map`, `supadata_crawl`, `supadata_check_crawl_status`, `supadata_metadata`, `supadata_extract`, and `supadata_check_extract_status` for video transcription, web scraping, URL discovery, batch crawling, media metadata retrieval, and AI-powered structured data extraction. ## Key Commands @@ -26,7 +26,7 @@ This is a Model Context Protocol (MCP) server implementation for Supadata web sc The server is built using the `@modelcontextprotocol/sdk` and runs on stdio transport. The main server logic is in `src/index.ts` with the following key components: - **Server Creation**: `createServer()` function creates an McpServer instance -- **Tool Registration**: Six tools are registered with input validation using Zod schemas +- **Tool Registration**: Nine tools are registered with input validation using Zod schemas - **Error Handling**: Comprehensive error handling with retry logic and exponential backoff - **Configuration**: Environment-based configuration with defaults @@ -36,7 +36,9 @@ The server integrates with Supadata's JavaScript SDK (`@supadata/js`) and provid - **Web Scraping**: Single page content extraction to Markdown - **URL Mapping**: Website URL discovery and indexing - **Crawling**: Asynchronous batch crawling of multiple pages -- **Status Checking**: Monitor crawl and transcript job progress and retrieve results +- **Media Metadata**: Retrieve metadata from YouTube, TikTok, Instagram, and Twitter URLs +- **Structured Extraction**: AI-powered extraction of structured data from video content +- **Status Checking**: Monitor crawl, transcript, and extract job progress and retrieve results ### Tool Implementations @@ -76,6 +78,23 @@ The server integrates with Supadata's JavaScript SDK (`@supadata/js`) and provid - **Output**: Job status and results (if completed) - **Cost**: No additional cost +#### supadata_metadata +- **Purpose**: Fetch 
metadata from media URLs on supported platforms +- **Input**: `url` (string) +- **Output**: Rich metadata object with platform, title, description, author info, engagement stats, media details, tags, and creation date +- **Supported Platforms**: YouTube, TikTok, Instagram, Twitter + +#### supadata_extract +- **Purpose**: Extract structured data from video content using AI +- **Input**: `url` (string), `prompt` (string optional), `schema` (object optional - JSON Schema for output format) +- **Output**: Job ID for async processing + +#### supadata_check_extract_status +- **Purpose**: Check extract job status and retrieve results +- **Input**: `id` (string - job ID from extract) +- **Output**: Job status and extracted data (if completed) +- **Cost**: No additional cost + ## Configuration ### Required Environment Variables @@ -100,7 +119,7 @@ The server includes robust error handling with: ## Testing The test suite uses Jest with TypeScript and ESM support. Tests cover: -- All six tool implementations +- All nine tool implementations - Error handling scenarios - Rate limiting behavior - Mock-based testing with `@jest/globals` diff --git a/package-lock.json b/package-lock.json index bd542b2..0b64429 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,7 +10,7 @@ "license": "MIT", "dependencies": { "@modelcontextprotocol/sdk": "^1.25.3", - "@supadata/js": "^1.3.0", + "@supadata/js": "^1.4.0", "dotenv": "^16.4.7", "zod": "^3.25.76" }, @@ -1747,9 +1747,9 @@ } }, "node_modules/@supadata/js": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/@supadata/js/-/js-1.3.0.tgz", - "integrity": "sha512-fk7EnwvuMfhI1pahGr+38xpBh+T0gc+Jq1bMsh8bWlHsaXhh12S+Pv/3BI9OP9lR47MxxNBwdEUVBk9LoTm9pA==", + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/@supadata/js/-/js-1.4.0.tgz", + "integrity": "sha512-DzKwwXApb58bAMv6PzW+kbRGMOffNfxBlN4B7eLJ/boFhylmlj6zI9fXlMKFIPt+vnPpJXGCk7OaguaosTYdYA==", "license": "MIT", "dependencies": { "cross-fetch": "^4.0.0" diff 
--git a/package.json b/package.json index a2c1dcf..c145980 100644 --- a/package.json +++ b/package.json @@ -19,7 +19,7 @@ "license": "MIT", "dependencies": { "@modelcontextprotocol/sdk": "^1.25.3", - "@supadata/js": "^1.3.0", + "@supadata/js": "^1.4.0", "dotenv": "^16.4.7", "zod": "^3.25.76" }, diff --git a/src/index.test.ts b/src/index.test.ts index 2c857cd..a96a941 100644 --- a/src/index.test.ts +++ b/src/index.test.ts @@ -31,9 +31,16 @@ interface MockWebService { getCrawlResults: jest.MockedFunction<(id: string) => Promise>; } +interface MockExtractService { + get: jest.MockedFunction<(params: any) => Promise<{ jobId: string }>>; + getResults: jest.MockedFunction<(id: string) => Promise>; +} + interface MockSupadataClient { transcript: MockTranscriptService; web: MockWebService; + metadata: jest.MockedFunction<(params: any) => Promise>; + extract: MockExtractService; } describe('Supadata Tool Tests', () => { @@ -54,7 +61,12 @@ describe('Supadata Tool Tests', () => { map: jest.fn(), crawl: jest.fn(), getCrawlResults: jest.fn(), - } + }, + metadata: jest.fn(), + extract: { + get: jest.fn(), + getResults: jest.fn(), + }, }; // Create request handler @@ -234,6 +246,144 @@ describe('Supadata Tool Tests', () => { }); }); + // Test metadata functionality + test('should handle metadata request', async () => { + const url = 'https://www.youtube.com/watch?v=example'; + + const mockResponse = { + platform: 'youtube', + type: 'video', + id: 'example', + url: url, + title: 'Example Video', + description: 'An example video description', + author: { + username: 'examplechannel', + displayName: 'Example Channel', + avatarUrl: 'https://example.com/avatar.jpg', + verified: true, + }, + stats: { views: 1000000, likes: 50000, comments: 3000, shares: null }, + media: { type: 'video', url: 'https://example.com/video.mp4' }, + tags: ['example', 'test'], + createdAt: '2024-01-01T00:00:00Z', + additionalData: {}, + }; + + mockClient.metadata.mockResolvedValueOnce(mockResponse); + + 
const response = await requestHandler({ + method: 'call_tool', + params: { + name: 'supadata_metadata', + arguments: { url }, + }, + }); + + expect(response.isError).toBe(false); + expect(response.content[0].text).toContain('Example Video'); + expect(response.content[0].text).toContain('youtube'); + expect(mockClient.metadata).toHaveBeenCalledWith({ url }); + }); + + // Test extract functionality + test('should handle extract request with prompt', async () => { + const url = 'https://www.youtube.com/watch?v=example'; + const prompt = 'Extract the main topics discussed'; + + mockClient.extract.get.mockResolvedValueOnce({ + jobId: 'test-extract-job-id', + }); + + const response = await requestHandler({ + method: 'call_tool', + params: { + name: 'supadata_extract', + arguments: { url, prompt }, + }, + }); + + expect(response.isError).toBe(false); + expect(response.content[0].text).toContain('test-extract-job-id'); + expect(mockClient.extract.get).toHaveBeenCalledWith({ url, prompt }); + }); + + test('should handle extract request with schema', async () => { + const url = 'https://www.youtube.com/watch?v=example'; + const schema = { + type: 'object', + properties: { + topics: { type: 'array', items: { type: 'string' } }, + sentiment: { type: 'string' }, + }, + }; + + mockClient.extract.get.mockResolvedValueOnce({ + jobId: 'test-extract-schema-job-id', + }); + + const response = await requestHandler({ + method: 'call_tool', + params: { + name: 'supadata_extract', + arguments: { url, schema }, + }, + }); + + expect(response.isError).toBe(false); + expect(response.content[0].text).toContain('test-extract-schema-job-id'); + expect(mockClient.extract.get).toHaveBeenCalledWith({ url, schema }); + }); + + // Test check extract status functionality + test('should handle extract status request', async () => { + const id = 'test-extract-job-id'; + + const mockStatusResponse = { + status: 'completed', + data: { topics: ['topic1', 'topic2'], sentiment: 'positive' }, + schema: { + 
type: 'object', + properties: { + topics: { type: 'array', items: { type: 'string' } }, + sentiment: { type: 'string' }, + }, + }, + }; + + mockClient.extract.getResults.mockResolvedValueOnce(mockStatusResponse); + + const response = await requestHandler({ + method: 'call_tool', + params: { + name: 'supadata_check_extract_status', + arguments: { id }, + }, + }); + + expect(response.isError).toBe(false); + expect(response.content[0].text).toContain('completed'); + expect(response.content[0].text).toContain('topic1'); + expect(mockClient.extract.getResults).toHaveBeenCalledWith(id); + }); + + test('should handle extract API errors', async () => { + const url = 'https://www.youtube.com/watch?v=example'; + + mockClient.extract.get.mockRejectedValueOnce(new Error('API Error')); + + const response = await requestHandler({ + method: 'call_tool', + params: { + name: 'supadata_extract', + arguments: { url }, + }, + }); + + expect(response.isError).toBe(true); + expect(response.content[0].text).toContain('API Error'); + }); + // Test error handling test('should handle API errors', async () => { const url = 'https://example.com'; @@ -371,6 +521,43 @@ async function handleRequest( }; } + case 'supadata_metadata': { + const response = await client.metadata({ url: args.url }); + return { + content: [ + { type: 'text', text: JSON.stringify(response, null, 2) }, + ], + isError: false, + }; + } + + case 'supadata_extract': { + const params: any = { url: args.url }; + if (args.prompt) params.prompt = args.prompt; + if (args.schema) params.schema = args.schema; + const response = await client.extract.get(params); + const jobId = response.jobId || response; + return { + content: [ + { + type: 'text', + text: `Started extract job for ${args.url} with job ID: ${jobId}. 
Use supadata_check_extract_status to check progress.`, + }, + ], + isError: false, + }; + } + + case 'supadata_check_extract_status': { + const response = await client.extract.getResults(args.id); + return { + content: [ + { type: 'text', text: JSON.stringify(response, null, 2) }, + ], + isError: false, + }; + } + default: throw new Error(`Unknown tool: ${name}`); } diff --git a/src/mcp.ts b/src/mcp.ts index 0a038ab..1b29e0e 100644 --- a/src/mcp.ts +++ b/src/mcp.ts @@ -46,11 +46,39 @@ async function callSupadata(path: string, args: any, apiKey: string, method: 'GE return res.json(); } +const TERMINAL_STATUSES = ['completed', 'failed', 'cancelled']; + +function addPollingHint( + result: any, + toolName: string, + id: string, + inProgressStatuses?: string[], +) { + const status = result?.status; + if (!status) return result; + + const isTerminal = inProgressStatuses + ? !inProgressStatuses.includes(status) + : TERMINAL_STATUSES.includes(status); + + if (!isTerminal) { + return { + ...result, + _polling: { + message: `Job is still processing (status: "${status}"). Call ${toolName} again with id "${id}" to check progress.`, + retry_after_seconds: 5, + }, + }; + } + + return result; +} + const toolRegistry = { supadata_transcript: { schema: { name: 'supadata_transcript', - description: 'Extract transcript from video or file URL', + description: 'Extract transcript from a video or file URL. For large files, returns a jobId instead of the transcript directly - use supadata_check_transcript_status with that jobId to poll for results.', inputSchema: { type: 'object', properties: { @@ -70,7 +98,7 @@ const toolRegistry = { supadata_check_transcript_status: { schema: { name: 'supadata_check_transcript_status', - description: 'Check transcript job status', + description: 'Check transcript job status and retrieve results. Returns status: "queued", "active", "completed", or "failed". 
If status is not "completed" or "failed", call this tool again after a few seconds with the same id.', inputSchema: { type: 'object', properties: { @@ -79,9 +107,10 @@ const toolRegistry = { required: ['id'], }, }, - handler: (args: any, apiKey: string) => { + handler: async (args: any, apiKey: string) => { const id = args.id; - return callSupadata(`/transcript/${id}`, {}, apiKey, 'GET'); + const result = await callSupadata(`/transcript/${id}`, {}, apiKey, 'GET'); + return addPollingHint(result, 'supadata_check_transcript_status', id); }, }, @@ -122,7 +151,7 @@ const toolRegistry = { supadata_crawl: { schema: { name: 'supadata_crawl', - description: 'Create crawl job', + description: 'Create a crawl job to extract content from all pages on a website. Returns a jobId - use supadata_check_crawl_status with that jobId to poll for results.', inputSchema: { type: 'object', properties: { @@ -139,7 +168,60 @@ const toolRegistry = { supadata_check_crawl_status: { schema: { name: 'supadata_check_crawl_status', - description: 'Check crawl job status', + description: 'Check crawl job status and retrieve results. Returns status: "scraping", "completed", "failed", or "cancelled". If status is "scraping", call this tool again after a few seconds with the same id.', + inputSchema: { + type: 'object', + properties: { + id: { type: 'string' }, + }, + required: ['id'], + }, + }, + handler: async (args: any, apiKey: string) => { + const id = args.id; + const result = await callSupadata(`/web/crawl/${id}`, {}, apiKey, 'GET'); + return addPollingHint(result, 'supadata_check_crawl_status', id, ['scraping']); + }, + }, + + supadata_metadata: { + schema: { + name: 'supadata_metadata', + description: 'Fetch metadata from a media URL (YouTube, TikTok, Instagram, Twitter). 
Returns platform info, title, description, author details, engagement stats, media details, tags, and creation date.', + inputSchema: { + type: 'object', + properties: { + url: { type: 'string' }, + }, + required: ['url'], + }, + }, + handler: (args: any, apiKey: string) => + callSupadata('/metadata', args, apiKey, 'GET'), + }, + + supadata_extract: { + schema: { + name: 'supadata_extract', + description: 'Extract structured data from a video URL using AI. Provide a prompt for what to extract, a JSON Schema for the output format, or both. Returns a jobId for async processing - use supadata_check_extract_status with that jobId to poll for results.', + inputSchema: { + type: 'object', + properties: { + url: { type: 'string' }, + prompt: { type: 'string' }, + schema: { type: 'object' }, + }, + required: ['url'], + }, + }, + handler: (args: any, apiKey: string) => + callSupadata('/extract', args, apiKey, 'POST'), + }, + + supadata_check_extract_status: { + schema: { + name: 'supadata_check_extract_status', + description: 'Check extract job status and retrieve results. Returns status: "queued", "active", "completed", or "failed". 
If status is not "completed" or "failed", call this tool again after a few seconds with the same id.', inputSchema: { type: 'object', properties: { @@ -148,9 +230,10 @@ const toolRegistry = { required: ['id'], }, }, - handler: (args: any, apiKey: string) => { + handler: async (args: any, apiKey: string) => { const id = args.id; - return callSupadata(`/web/crawl/${id}`, {}, apiKey, 'GET'); + const result = await callSupadata(`/extract/${id}`, {}, apiKey, 'GET'); + return addPollingHint(result, 'supadata_check_extract_status', id); }, }, }; From 23d6288565eb821a49cc6fa61be6857a4b11b6d6 Mon Sep 17 00:00:00 2001 From: rafalzawadzki Date: Mon, 23 Feb 2026 16:55:26 +0100 Subject: [PATCH 2/2] Bump version to 1.2.0 Co-Authored-By: Claude Opus 4.6 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index c145980..59848a0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@supadata/mcp", - "version": "1.1.0", + "version": "1.2.0", "description": "MCP server for Supadata video & web scraping integration. Features include YouTube, TikTok, Instagram, Twitter, and file video transcription, web scraping, batch processing and structured data extraction.", "type": "module", "bin": "./dist/index.js",