From 6641b7aad8bae165576fe0868e681d39a6a70f6d Mon Sep 17 00:00:00 2001
From: rafalzawadzki <iam@rafalzawadzki.com>
Date: Mon, 23 Feb 2026 16:54:59 +0100
Subject: [PATCH 1/2] Add metadata and extract tools from @supadata/js v1.4.0

Expose two new SDK features as MCP tools: supadata_metadata for fetching media metadata from YouTube, TikTok, Instagram, and Twitter; supadata_extract for AI-powered structured data extraction from video content. supadata_metadata returns its result directly, while supadata_extract uses the async job pattern with a corresponding status-check tool (supadata_check_extract_status). Enhanced the status-check tool descriptions and responses with polling hints that guide LLM assistants to keep polling until the job completes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CLAUDE.md         |  27 ++++++-
 package-lock.json |   8 +-
 package.json      |   2 +-
 src/index.test.ts | 189 +++++++++++++++++++++++++++++++++++++++++++++-
 src/mcp.ts        |  99 ++++++++++++++++++++++--
 5 files changed, 307 insertions(+), 18 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index cff2e0c..d889418 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ## Project Overview
 
-This is a Model Context Protocol (MCP) server implementation for Supadata web scraping and video transcript integration. The project provides six main tools: `supadata_transcript`, `supadata_check_transcript_status`, `supadata_scrape`, `supadata_map`, `supadata_crawl`, and `supadata_check_crawl_status` for video transcription, web scraping, URL discovery, and batch crawling operations.
+This is a Model Context Protocol (MCP) server implementation for Supadata web scraping and video transcript integration. The project provides nine main tools: `supadata_transcript`, `supadata_check_transcript_status`, `supadata_scrape`, `supadata_map`, `supadata_crawl`, `supadata_check_crawl_status`, `supadata_metadata`, `supadata_extract`, and `supadata_check_extract_status` for video transcription, web scraping, URL discovery, batch crawling, media metadata retrieval, and AI-powered structured data extraction.
 
 ## Key Commands
 
@@ -26,7 +26,7 @@ This is a Model Context Protocol (MCP) server implementation for Supadata web sc
 The server is built using the `@modelcontextprotocol/sdk` and runs on stdio transport. The main server logic is in `src/index.ts` with the following key components:
 
 - **Server Creation**: `createServer()` function creates an McpServer instance
-- **Tool Registration**: Six tools are registered with input validation using Zod schemas
+- **Tool Registration**: Nine tools are registered with input validation using Zod schemas
 - **Error Handling**: Comprehensive error handling with retry logic and exponential backoff
 - **Configuration**: Environment-based configuration with defaults
 
@@ -36,7 +36,9 @@ The server integrates with Supadata's JavaScript SDK (`@supadata/js`) and provid
 - **Web Scraping**: Single page content extraction to Markdown
 - **URL Mapping**: Website URL discovery and indexing
 - **Crawling**: Asynchronous batch crawling of multiple pages
-- **Status Checking**: Monitor crawl and transcript job progress and retrieve results
+- **Media Metadata**: Retrieve metadata from YouTube, TikTok, Instagram, and Twitter URLs
+- **Structured Extraction**: AI-powered extraction of structured data from video content
+- **Status Checking**: Monitor crawl, transcript, and extract job progress and retrieve results
 
 ### Tool Implementations
 
@@ -76,6 +78,23 @@ The server integrates with Supadata's JavaScript SDK (`@supadata/js`) and provid
 - **Output**: Job status and results (if completed)
 - **Cost**: No additional cost
 
+#### supadata_metadata
+- **Purpose**: Fetch metadata from media URLs on supported platforms
+- **Input**: `url` (string)
+- **Output**: Rich metadata object with platform, title, description, author info, engagement stats, media details, tags, and creation date
+- **Supported Platforms**: YouTube, TikTok, Instagram, Twitter
+
+#### supadata_extract
+- **Purpose**: Extract structured data from video content using AI
+- **Input**: `url` (string), `prompt` (string optional), `schema` (object optional - JSON Schema for output format)
+- **Output**: Job ID for async processing
+
+#### supadata_check_extract_status
+- **Purpose**: Check extract job status and retrieve results
+- **Input**: `id` (string - job ID from extract)
+- **Output**: Job status and extracted data (if completed)
+- **Cost**: No additional cost
+
 ## Configuration
 
 ### Required Environment Variables
@@ -100,7 +119,7 @@ The server includes robust error handling with:
 ## Testing
 
 The test suite uses Jest with TypeScript and ESM support. Tests cover:
-- All six tool implementations
+- All nine tool implementations
 - Error handling scenarios
 - Rate limiting behavior
 - Mock-based testing with `@jest/globals`
diff --git a/package-lock.json b/package-lock.json
index bd542b2..0b64429 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -10,7 +10,7 @@
       "license": "MIT",
       "dependencies": {
         "@modelcontextprotocol/sdk": "^1.25.3",
-        "@supadata/js": "^1.3.0",
+        "@supadata/js": "^1.4.0",
         "dotenv": "^16.4.7",
         "zod": "^3.25.76"
       },
@@ -1747,9 +1747,9 @@
       }
     },
     "node_modules/@supadata/js": {
-      "version": "1.3.0",
-      "resolved": "https://registry.npmjs.org/@supadata/js/-/js-1.3.0.tgz",
-      "integrity": "sha512-fk7EnwvuMfhI1pahGr+38xpBh+T0gc+Jq1bMsh8bWlHsaXhh12S+Pv/3BI9OP9lR47MxxNBwdEUVBk9LoTm9pA==",
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/@supadata/js/-/js-1.4.0.tgz",
+      "integrity": "sha512-DzKwwXApb58bAMv6PzW+kbRGMOffNfxBlN4B7eLJ/boFhylmlj6zI9fXlMKFIPt+vnPpJXGCk7OaguaosTYdYA==",
       "license": "MIT",
       "dependencies": {
         "cross-fetch": "^4.0.0"
diff --git a/package.json b/package.json
index a2c1dcf..c145980 100644
--- a/package.json
+++ b/package.json
@@ -19,7 +19,7 @@
   "license": "MIT",
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.25.3",
-    "@supadata/js": "^1.3.0",
+    "@supadata/js": "^1.4.0",
     "dotenv": "^16.4.7",
     "zod": "^3.25.76"
   },
diff --git a/src/index.test.ts b/src/index.test.ts
index 2c857cd..a96a941 100644
--- a/src/index.test.ts
+++ b/src/index.test.ts
@@ -31,9 +31,16 @@ interface MockWebService {
   getCrawlResults: jest.MockedFunction<(id: string) => Promise<any>>;
 }
 
+interface MockExtractService {
+  get: jest.MockedFunction<(params: any) => Promise<{ jobId: string }>>;
+  getResults: jest.MockedFunction<(id: string) => Promise<any>>;
+}
+
 interface MockSupadataClient {
   transcript: MockTranscriptService;
   web: MockWebService;
+  metadata: jest.MockedFunction<(params: any) => Promise<any>>;
+  extract: MockExtractService;
 }
 
 describe('Supadata Tool Tests', () => {
@@ -54,7 +61,12 @@ describe('Supadata Tool Tests', () => {
         map: jest.fn(),
         crawl: jest.fn(),
         getCrawlResults: jest.fn(),
-      }
+      },
+      metadata: jest.fn(),
+      extract: {
+        get: jest.fn(),
+        getResults: jest.fn(),
+      },
     };
 
     // Create request handler
@@ -234,6 +246,144 @@ describe('Supadata Tool Tests', () => {
     });
   });
 
+  // Test metadata functionality
+  test('should handle metadata request', async () => {
+    const url = 'https://www.youtube.com/watch?v=example';
+
+    const mockResponse = {
+      platform: 'youtube',
+      type: 'video',
+      id: 'example',
+      url: url,
+      title: 'Example Video',
+      description: 'An example video description',
+      author: {
+        username: 'examplechannel',
+        displayName: 'Example Channel',
+        avatarUrl: 'https://example.com/avatar.jpg',
+        verified: true,
+      },
+      stats: { views: 1000000, likes: 50000, comments: 3000, shares: null },
+      media: { type: 'video', url: 'https://example.com/video.mp4' },
+      tags: ['example', 'test'],
+      createdAt: '2024-01-01T00:00:00Z',
+      additionalData: {},
+    };
+
+    mockClient.metadata.mockResolvedValueOnce(mockResponse);
+
+    const response = await requestHandler({
+      method: 'call_tool',
+      params: {
+        name: 'supadata_metadata',
+        arguments: { url },
+      },
+    });
+
+    expect(response.isError).toBe(false);
+    expect(response.content[0].text).toContain('Example Video');
+    expect(response.content[0].text).toContain('youtube');
+    expect(mockClient.metadata).toHaveBeenCalledWith({ url });
+  });
+
+  // Test extract functionality
+  test('should handle extract request with prompt', async () => {
+    const url = 'https://www.youtube.com/watch?v=example';
+    const prompt = 'Extract the main topics discussed';
+
+    mockClient.extract.get.mockResolvedValueOnce({
+      jobId: 'test-extract-job-id',
+    });
+
+    const response = await requestHandler({
+      method: 'call_tool',
+      params: {
+        name: 'supadata_extract',
+        arguments: { url, prompt },
+      },
+    });
+
+    expect(response.isError).toBe(false);
+    expect(response.content[0].text).toContain('test-extract-job-id');
+    expect(mockClient.extract.get).toHaveBeenCalledWith({ url, prompt });
+  });
+
+  test('should handle extract request with schema', async () => {
+    const url = 'https://www.youtube.com/watch?v=example';
+    const schema = {
+      type: 'object',
+      properties: {
+        topics: { type: 'array', items: { type: 'string' } },
+        sentiment: { type: 'string' },
+      },
+    };
+
+    mockClient.extract.get.mockResolvedValueOnce({
+      jobId: 'test-extract-schema-job-id',
+    });
+
+    const response = await requestHandler({
+      method: 'call_tool',
+      params: {
+        name: 'supadata_extract',
+        arguments: { url, schema },
+      },
+    });
+
+    expect(response.isError).toBe(false);
+    expect(response.content[0].text).toContain('test-extract-schema-job-id');
+    expect(mockClient.extract.get).toHaveBeenCalledWith({ url, schema });
+  });
+
+  // Test check extract status functionality
+  test('should handle extract status request', async () => {
+    const id = 'test-extract-job-id';
+
+    const mockStatusResponse = {
+      status: 'completed',
+      data: { topics: ['topic1', 'topic2'], sentiment: 'positive' },
+      schema: {
+        type: 'object',
+        properties: {
+          topics: { type: 'array', items: { type: 'string' } },
+          sentiment: { type: 'string' },
+        },
+      },
+    };
+
+    mockClient.extract.getResults.mockResolvedValueOnce(mockStatusResponse);
+
+    const response = await requestHandler({
+      method: 'call_tool',
+      params: {
+        name: 'supadata_check_extract_status',
+        arguments: { id },
+      },
+    });
+
+    expect(response.isError).toBe(false);
+    expect(response.content[0].text).toContain('completed');
+    expect(response.content[0].text).toContain('topic1');
+    expect(mockClient.extract.getResults).toHaveBeenCalledWith(id);
+  });
+
+  test('should handle extract API errors', async () => {
+    const url = 'https://www.youtube.com/watch?v=example';
+
+    mockClient.extract.get.mockRejectedValueOnce(new Error('API Error'));
+
+    const response = await requestHandler({
+      method: 'call_tool',
+      params: {
+        name: 'supadata_extract',
+        arguments: { url },
+      },
+    });
+
+    expect(response.isError).toBe(true);
+    expect(response.content[0].text).toContain('API Error');
+  });
+
   // Test error handling
   test('should handle API errors', async () => {
     const url = 'https://example.com';
@@ -371,6 +521,43 @@ async function handleRequest(
         };
       }
 
+      case 'supadata_metadata': {
+        const response = await client.metadata({ url: args.url });
+        return {
+          content: [
+            { type: 'text', text: JSON.stringify(response, null, 2) },
+          ],
+          isError: false,
+        };
+      }
+
+      case 'supadata_extract': {
+        const params: any = { url: args.url };
+        if (args.prompt) params.prompt = args.prompt;
+        if (args.schema) params.schema = args.schema;
+        const response = await client.extract.get(params);
+        const jobId = response.jobId || response;
+        return {
+          content: [
+            {
+              type: 'text',
+              text: `Started extract job for ${args.url} with job ID: ${jobId}. Use supadata_check_extract_status to check progress.`,
+            },
+          ],
+          isError: false,
+        };
+      }
+
+      case 'supadata_check_extract_status': {
+        const response = await client.extract.getResults(args.id);
+        return {
+          content: [
+            { type: 'text', text: JSON.stringify(response, null, 2) },
+          ],
+          isError: false,
+        };
+      }
+
       default:
         throw new Error(`Unknown tool: ${name}`);
     }
diff --git a/src/mcp.ts b/src/mcp.ts
index 0a038ab..1b29e0e 100644
--- a/src/mcp.ts
+++ b/src/mcp.ts
@@ -46,11 +46,39 @@ async function callSupadata(path: string, args: any, apiKey: string, method: 'GE
   return res.json();
 }
 
+const TERMINAL_STATUSES = ['completed', 'failed', 'cancelled'];
+
+function addPollingHint(
+  result: any,
+  toolName: string,
+  id: string,
+  inProgressStatuses?: string[],
+) {
+  const status = result?.status;
+  if (!status) return result;
+
+  const isTerminal = inProgressStatuses
+    ? !inProgressStatuses.includes(status)
+    : TERMINAL_STATUSES.includes(status);
+
+  if (!isTerminal) {
+    return {
+      ...result,
+      _polling: {
+        message: `Job is still processing (status: "${status}"). Call ${toolName} again with id "${id}" to check progress.`,
+        retry_after_seconds: 5,
+      },
+    };
+  }
+
+  return result;
+}
+
 const toolRegistry = {
   supadata_transcript: {
     schema: {
       name: 'supadata_transcript',
-      description: 'Extract transcript from video or file URL',
+      description: 'Extract transcript from a video or file URL. For large files, returns a jobId instead of the transcript directly - use supadata_check_transcript_status with that jobId to poll for results.',
       inputSchema: {
         type: 'object',
         properties: {
@@ -70,7 +98,7 @@ const toolRegistry = {
   supadata_check_transcript_status: {
     schema: {
       name: 'supadata_check_transcript_status',
-      description: 'Check transcript job status',
+      description: 'Check transcript job status and retrieve results. Returns status: "queued", "active", "completed", or "failed". If status is not "completed" or "failed", call this tool again after a few seconds with the same id.',
       inputSchema: {
         type: 'object',
         properties: {
@@ -79,9 +107,10 @@ const toolRegistry = {
         required: ['id'],
       },
     },
-    handler: (args: any, apiKey: string) => {
+    handler: async (args: any, apiKey: string) => {
       const id = args.id;
-      return callSupadata(`/transcript/${id}`, {}, apiKey, 'GET');
+      const result = await callSupadata(`/transcript/${id}`, {}, apiKey, 'GET');
+      return addPollingHint(result, 'supadata_check_transcript_status', id);
     },
   },
 
@@ -122,7 +151,7 @@ const toolRegistry = {
   supadata_crawl: {
     schema: {
       name: 'supadata_crawl',
-      description: 'Create crawl job',
+      description: 'Create a crawl job to extract content from all pages on a website. Returns a jobId - use supadata_check_crawl_status with that jobId to poll for results.',
       inputSchema: {
         type: 'object',
         properties: {
@@ -139,7 +168,60 @@ const toolRegistry = {
   supadata_check_crawl_status: {
     schema: {
       name: 'supadata_check_crawl_status',
-      description: 'Check crawl job status',
+      description: 'Check crawl job status and retrieve results. Returns status: "scraping", "completed", "failed", or "cancelled". If status is "scraping", call this tool again after a few seconds with the same id.',
+      inputSchema: {
+        type: 'object',
+        properties: {
+          id: { type: 'string' },
+        },
+        required: ['id'],
+      },
+    },
+    handler: async (args: any, apiKey: string) => {
+      const id = args.id;
+      const result = await callSupadata(`/web/crawl/${id}`, {}, apiKey, 'GET');
+      return addPollingHint(result, 'supadata_check_crawl_status', id, ['scraping']);
+    },
+  },
+
+  supadata_metadata: {
+    schema: {
+      name: 'supadata_metadata',
+      description: 'Fetch metadata from a media URL (YouTube, TikTok, Instagram, Twitter). Returns platform info, title, description, author details, engagement stats, media details, tags, and creation date.',
+      inputSchema: {
+        type: 'object',
+        properties: {
+          url: { type: 'string' },
+        },
+        required: ['url'],
+      },
+    },
+    handler: (args: any, apiKey: string) =>
+      callSupadata('/metadata', args, apiKey, 'GET'),
+  },
+
+  supadata_extract: {
+    schema: {
+      name: 'supadata_extract',
+      description: 'Extract structured data from a video URL using AI. Provide a prompt for what to extract, a JSON Schema for the output format, or both. Returns a jobId for async processing - use supadata_check_extract_status with that jobId to poll for results.',
+      inputSchema: {
+        type: 'object',
+        properties: {
+          url: { type: 'string' },
+          prompt: { type: 'string' },
+          schema: { type: 'object' },
+        },
+        required: ['url'],
+      },
+    },
+    handler: (args: any, apiKey: string) =>
+      callSupadata('/extract', args, apiKey, 'POST'),
+  },
+
+  supadata_check_extract_status: {
+    schema: {
+      name: 'supadata_check_extract_status',
+      description: 'Check extract job status and retrieve results. Returns status: "queued", "active", "completed", or "failed". If status is not "completed" or "failed", call this tool again after a few seconds with the same id.',
       inputSchema: {
         type: 'object',
         properties: {
@@ -148,9 +230,10 @@ const toolRegistry = {
         required: ['id'],
       },
     },
-    handler: (args: any, apiKey: string) => {
+    handler: async (args: any, apiKey: string) => {
       const id = args.id;
-      return callSupadata(`/web/crawl/${id}`, {}, apiKey, 'GET');
+      const result = await callSupadata(`/extract/${id}`, {}, apiKey, 'GET');
+      return addPollingHint(result, 'supadata_check_extract_status', id);
     },
   },
 };

From 23d6288565eb821a49cc6fa61be6857a4b11b6d6 Mon Sep 17 00:00:00 2001
From: rafalzawadzki <iam@rafalzawadzki.com>
Date: Mon, 23 Feb 2026 16:55:26 +0100
Subject: [PATCH 2/2] Bump version to 1.2.0

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index c145980..59848a0 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@supadata/mcp",
-  "version": "1.1.0",
+  "version": "1.2.0",
   "description": "MCP server for Supadata video & web scraping integration. Features include YouTube, TikTok, Instagram, Twitter, and file video transcription, web scraping, batch processing and structured data extraction.",
   "type": "module",
   "bin": "./dist/index.js",