diff --git a/api-docs/swagger.json b/api-docs/swagger.json index f5ddfdbb..eb430c1b 100644 --- a/api-docs/swagger.json +++ b/api-docs/swagger.json @@ -4778,6 +4778,10 @@ "description": "OpenAI settings", "type": "string" }, + "api_url": { + "description": "Custom transcription API base URL (OpenAI adapter only)", + "type": "string" + }, "attention_context_left": { "description": "NVIDIA Parakeet-specific parameters for long-form audio", "type": "integer" @@ -4930,6 +4934,10 @@ "threads": { "type": "integer" }, + "timeout_minutes": { + "description": "HTTP request timeout in minutes (OpenAI adapter with custom base URL)", + "type": "integer" + }, "vad_method": { "description": "VAD (Voice Activity Detection) settings", "type": "string" diff --git a/api-docs/swagger.yaml b/api-docs/swagger.yaml index 4b43a4ca..39b1a8ec 100644 --- a/api-docs/swagger.yaml +++ b/api-docs/swagger.yaml @@ -641,6 +641,9 @@ definitions: api_key: description: OpenAI settings type: string + api_url: + description: Custom transcription API base URL (OpenAI adapter only) + type: string attention_context_left: description: NVIDIA Parakeet-specific parameters for long-form audio type: integer @@ -747,6 +750,9 @@ definitions: type: number threads: type: integer + timeout_minutes: + description: HTTP request timeout in minutes (OpenAI adapter with custom base URL) + type: integer vad_method: description: VAD (Voice Activity Detection) settings type: string diff --git a/internal/config/config.go b/internal/config/config.go index 72b99c3d..1768f63d 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -68,7 +68,7 @@ func Load() *Config { TempDir: getEnv("TEMP_DIR", "data/temp"), WhisperXEnv: getEnv("WHISPERX_ENV", "data/whisperx-env"), SecureCookies: getEnv("SECURE_COOKIES", defaultSecure) == "true", - OpenAIAPIKey: getEnv("OPENAI_API_KEY", ""), + OpenAIAPIKey: getEnv("OPENAI_API_KEY", ""), HFToken: getEnv("HF_TOKEN", ""), } } diff --git a/internal/models/transcription.go b/internal/models/transcription.go index 632c6c3b..04285b08 100644 --- a/internal/models/transcription.go +++ b/internal/models/transcription.go @@ -127,7 +127,9 @@ type WhisperXParams struct { CallbackURL *string `json:"callback_url,omitempty" gorm:"type:text"` // OpenAI settings - APIKey *string `json:"api_key,omitempty" gorm:"type:text"` + APIKey *string `json:"api_key,omitempty" gorm:"type:text"` + APIURL *string `json:"api_url,omitempty" gorm:"type:text"` + TimeoutMinutes *int `json:"timeout_minutes,omitempty" gorm:"type:int"` // Voxtral settings MaxNewTokens *int `json:"max_new_tokens,omitempty" gorm:"type:int"` diff --git a/internal/transcription/README.md b/internal/transcription/README.md index e915bc15..30c5b5de 100644 --- a/internal/transcription/README.md +++ b/internal/transcription/README.md @@ -155,6 +155,7 @@ err := adapter.ValidateParameters(params) | `whisperx` | `whisper` | 90+ languages | Timestamps, Diarization, Translation | | `parakeet` | `nvidia_parakeet` | English only | Timestamps, Long-form, High Quality | | `canary` | `nvidia_canary` | 12 languages | Timestamps, Translation, Multilingual | +| `openai_whisper` | `openai` | 57 languages | Timestamps, Diarization, Translation, Custom Endpoint | ### Diarization Models @@ -221,9 +222,18 @@ params := map[string]interface{}{ // NVIDIA Canary with translation params := map[string]interface{}{ "source_lang": "es", - "target_lang": "en", + "target_lang": "en", "task": "translate", } + +// OpenAI with custom self-hosted endpoint +params := map[string]interface{}{ + "base_url": "http://localhost:8000/v1", + "model": "Systran/faster-whisper-large-v3", + "timeout_minutes": 30, + "diarize": true, + "diarize_model": "pyannote", +} ``` ## Testing diff --git a/internal/transcription/adapters/openai_adapter.go b/internal/transcription/adapters/openai_adapter.go index a9b008ef..4f2824e0 100644 --- a/internal/transcription/adapters/openai_adapter.go +++ b/internal/transcription/adapters/openai_adapter.go @@ -40,7 +40,7 @@ func NewOpenAIAdapter(apiKey string) *OpenAIAdapter { Features: map[string]bool{ "timestamps": true, // Verbose JSON response includes segments "word_level": false, // Not supported by standard API yet (unless using verbose_json with timestamp_granularities which is beta) - "diarization": false, // Not supported by OpenAI API + "diarization": true, // Post-processing via pyannote/sortformer pipeline "translation": true, "language_detection": true, "vad": true, // Implicit @@ -59,13 +59,19 @@ func NewOpenAIAdapter(apiKey string) *OpenAIAdapter { Description: "OpenAI API Key (overrides system default)", Group: "authentication", }, + { + Name: "base_url", + Type: "string", + Required: false, + Description: "Custom transcription API base URL (overrides server default)", + Group: "authentication", + }, { Name: "model", Type: "string", Required: false, Default: "whisper-1", - Options: []string{"whisper-1"}, - Description: "ID of the model to use", + Description: "Model name (e.g. whisper-1, or any model exposed by a custom endpoint)", Group: "basic", }, { @@ -92,6 +98,15 @@ func NewOpenAIAdapter(apiKey string) *OpenAIAdapter { Description: "Sampling temperature", Group: "quality", }, + { + Name: "timeout_minutes", + Type: "int", + Required: false, + Default: 10, + Min: &[]float64{1}[0], + Description: "HTTP request timeout in minutes (increase for large files on self-hosted endpoints)", + Group: "advanced", + }, } baseAdapter := NewBaseAdapter("openai_whisper", "", capabilities, schema) @@ -153,7 +168,14 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn apiKey = key } - if apiKey == "" { + const officialURL = "https://api.openai.com/v1/audio/transcriptions" + endpointURL := officialURL + if url := a.GetStringParameter(params, "base_url"); url != "" { + endpointURL = strings.TrimRight(url, "/") + "/audio/transcriptions" + } + isOfficialEndpoint := endpointURL == officialURL + + if apiKey == "" && isOfficialEndpoint { writeLog("Error: OpenAI API key is required but not provided") return nil, fmt.Errorf("OpenAI API key is required but not provided") } @@ -188,7 +210,7 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn writeLog("Model: %s", model) _ = writer.WriteField("model", model) - if strings.HasPrefix(model, "gpt-4o") { + if isOfficialEndpoint && strings.HasPrefix(model, "gpt-4o") { if strings.Contains(model, "diarize") { _ = writer.WriteField("response_format", "diarized_json") } else { @@ -197,7 +219,6 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn // gpt-4o models don't support timestamp_granularities with these formats } else { _ = writer.WriteField("response_format", "verbose_json") - // timestamp_granularities is only supported for whisper-1 if model == "whisper-1" { _ = writer.WriteField("timestamp_granularities[]", "word") // Request word timestamps _ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps @@ -224,8 +245,8 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn } // Create request - writeLog("Sending request to OpenAI API...") - req, err := http.NewRequestWithContext(ctx, "POST", "https://api.openai.com/v1/audio/transcriptions", body) + writeLog("Sending request to %s...", endpointURL) + req, err := http.NewRequestWithContext(ctx, "POST", endpointURL, body) if err != nil { writeLog("Error: Failed to create request: %v", err) return nil, fmt.Errorf("failed to create request: %w", err) @@ -235,9 +256,14 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn req.Header.Set("Authorization", "Bearer "+apiKey) // Execute request - client := &http.Client{ - Timeout: 10 * time.Minute, // Generous timeout for large files + timeout := 10 * time.Minute + if !isOfficialEndpoint { + timeout = 30 * time.Minute // Default for self-hosted endpoints + } + if t := a.GetIntParameter(params, "timeout_minutes"); t > 0 { + timeout = time.Duration(t) * time.Minute } + client := &http.Client{Timeout: timeout} resp, err := client.Do(req) if err != nil { writeLog("Error: Request failed: %v", err) @@ -247,8 +273,8 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn if resp.StatusCode != http.StatusOK { respBody, _ := io.ReadAll(resp.Body) - writeLog("Error: OpenAI API error (status %d): %s", resp.StatusCode, string(respBody)) - return nil, fmt.Errorf("OpenAI API error (status %d): %s", resp.StatusCode, string(respBody)) + writeLog("Error: transcription API error (status %d): %s", resp.StatusCode, string(respBody)) + return nil, fmt.Errorf("transcription API error (status %d): %s", resp.StatusCode, string(respBody)) } writeLog("Response received. Parsing...") diff --git a/internal/transcription/adapters_test.go b/internal/transcription/adapters_test.go index ca09ca87..4c3a35c0 100644 --- a/internal/transcription/adapters_test.go +++ b/internal/transcription/adapters_test.go @@ -553,6 +553,49 @@ func BenchmarkModelRegistryLookup(b *testing.B) { } } +func TestOpenAIAdapter(t *testing.T) { + a := adapters.NewOpenAIAdapter("sk-test") + if a == nil { + t.Fatal("NewOpenAIAdapter returned nil") + } + + caps := a.GetCapabilities() + if caps.ModelID != "openai_whisper" { + t.Errorf("expected model ID 'openai_whisper', got %q", caps.ModelID) + } + if caps.ModelFamily != "openai" { + t.Errorf("expected model family 'openai', got %q", caps.ModelFamily) + } + if !caps.Features["diarization"] { + t.Error("diarization capability must be true") + } + + schema := a.GetParameterSchema() + hasBaseURL := false + for _, p := range schema { + if p.Name == "base_url" { + hasBaseURL = true + } + if p.Name == "model" && len(p.Options) > 0 { + t.Errorf("model parameter must not have a fixed Options list, got %v", p.Options) + } + } + if !hasBaseURL { + t.Error("schema must include base_url parameter") + } +} + +func TestOpenAIAdapterWithBaseURL(t *testing.T) { + a := adapters.NewOpenAIAdapter("") + if a == nil { + t.Fatal("NewOpenAIAdapter returned nil") + } + caps := a.GetCapabilities() + if !caps.Features["diarization"] { + t.Error("diarization capability must be true") + } +} + func BenchmarkParameterValidation(b *testing.B) { reg := registry.GetRegistry() adapter, err := reg.GetTranscriptionAdapter("whisperx") diff --git a/internal/transcription/unified_service.go b/internal/transcription/unified_service.go index e17ef8b4..f38d3306 100644 --- a/internal/transcription/unified_service.go +++ b/internal/transcription/unified_service.go @@ -589,6 +589,12 @@ func (u *UnifiedTranscriptionService) convertToOpenAIParams(params models.Whispe if params.APIKey != nil && *params.APIKey != "" { paramMap["api_key"] = *params.APIKey } + if params.APIURL != nil && *params.APIURL != "" { + paramMap["base_url"] = *params.APIURL + } + if params.TimeoutMinutes != nil && *params.TimeoutMinutes > 0 { + paramMap["timeout_minutes"] = *params.TimeoutMinutes + } return paramMap } diff --git a/tests/adapter_registration_test.go b/tests/adapter_registration_test.go index 3b9203c7..a4352090 100644 --- a/tests/adapter_registration_test.go +++ b/tests/adapter_registration_test.go @@ -47,6 +47,19 @@ func TestAdapterEnvPathInjection(t *testing.T) { } } +// TestOpenAIAdapterConstruction tests the OpenAI adapter constructor +func TestOpenAIAdapterConstruction(t *testing.T) { + a := adapters.NewOpenAIAdapter("") + if a == nil { + t.Fatal("NewOpenAIAdapter returned nil with empty key") + } + + a = adapters.NewOpenAIAdapter("sk-test") + if !a.GetCapabilities().Features["diarization"] { + t.Error("diarization capability must be true") + } +} + // TestRegisterAdapters tests that registerAdapters correctly registers all adapters func TestRegisterAdapters(t *testing.T) { // Clear registry before test @@ -67,6 +80,8 @@ func TestRegisterAdapters(t *testing.T) { adapters.NewParakeetAdapter(nvidiaEnvPath)) registry.RegisterTranscriptionAdapter("canary", adapters.NewCanaryAdapter(nvidiaEnvPath)) + registry.RegisterTranscriptionAdapter("openai_whisper", + adapters.NewOpenAIAdapter("")) registry.RegisterDiarizationAdapter("pyannote", adapters.NewPyAnnoteAdapter(nvidiaEnvPath)) @@ -75,8 +90,8 @@ func TestRegisterAdapters(t *testing.T) { // Verify registrations transcriptionAdapters := registry.GetTranscriptionAdapters() - if len(transcriptionAdapters) != 3 { - t.Errorf("Expected 3 transcription adapters, got %d", len(transcriptionAdapters)) + if len(transcriptionAdapters) != 4 { + t.Errorf("Expected 4 transcription adapters, got %d", len(transcriptionAdapters)) } // Check specific adapters are registered @@ -89,6 +104,9 @@ func TestRegisterAdapters(t *testing.T) { if _, exists := transcriptionAdapters["canary"]; !exists { t.Error("canary adapter not registered") } + if _, exists := transcriptionAdapters["openai_whisper"]; !exists { + t.Error("openai_whisper adapter not registered") + } diarizationAdapters := registry.GetDiarizationAdapters() if len(diarizationAdapters) != 2 { diff --git a/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx b/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx index 01f5da56..ffcfba7b 100644 --- a/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx +++ b/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx @@ -84,6 +84,8 @@ export interface WhisperXParams { attention_context_right: number; is_multi_track_enabled: boolean; api_key?: string; + api_url?: string; + timeout_minutes?: number; max_new_tokens?: number; } @@ -467,6 +469,7 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
- + + updateParam('api_url', e.target.value || undefined)} + className={inputClassName} + /> + + +
{ - updateParam('api_key', e.target.value); - }} + onChange={(e) => updateParam('api_key', e.target.value)} className={`${inputClassName} flex-1`} /> - + {!params.api_url && ( + + )}
- {validationStatus !== 'idle' && ( + {validationStatus !== 'idle' && !params.api_url && (
{validationStatus === 'valid' ? : } @@ -1094,16 +1108,26 @@ function OpenAIConfig({ - + {params.api_url ? ( + updateParam('model', e.target.value)} + className={inputClassName} + /> + ) : ( + + )} @@ -1118,10 +1142,123 @@ function OpenAIConfig({ + + {params.api_url && ( + + updateParam('timeout_minutes', e.target.value ? parseInt(e.target.value) : undefined)} + className={inputClassName} + /> + + )}
- {params.model && params.model !== "whisper-1" && ( + {!isMultiTrack && ( +
+
+
+ updateParam('diarize', v)} + /> + +
+ + {params.diarize && ( +
+ + + + +
+ + updateParam('min_speakers', e.target.value ? parseInt(e.target.value) : undefined)} + className={inputClassName} + /> + + + updateParam('max_speakers', e.target.value ? parseInt(e.target.value) : undefined)} + className={inputClassName} + /> + +
+ + {params.diarize_model === "pyannote" && ( + <> + + updateParam('hf_token', e.target.value || undefined)} + className={inputClassName} + /> + + +
+

Voice Detection Tuning (for noisy/distant audio)

+
+ + updateParam('vad_onset', parseFloat(e.target.value) || 0.5)} + className={inputClassName} + /> + + + updateParam('vad_offset', parseFloat(e.target.value) || 0.363)} + className={inputClassName} + /> + +
+
+ + )} +
+ )} +
+
+ )} + + {params.model && params.model !== "whisper-1" && !params.api_url && ( Word-level timestamps are only supported by whisper-1. Synchronized playback won't be available. diff --git a/web/project-site/public/api/swagger.json b/web/project-site/public/api/swagger.json index f5ddfdbb..eb430c1b 100644 --- a/web/project-site/public/api/swagger.json +++ b/web/project-site/public/api/swagger.json @@ -4778,6 +4778,10 @@ "description": "OpenAI settings", "type": "string" }, + "api_url": { + "description": "Custom transcription API base URL (OpenAI adapter only)", + "type": "string" + }, "attention_context_left": { "description": "NVIDIA Parakeet-specific parameters for long-form audio", "type": "integer" @@ -4930,6 +4934,10 @@ "threads": { "type": "integer" }, + "timeout_minutes": { + "description": "HTTP request timeout in minutes (OpenAI adapter with custom base URL)", + "type": "integer" + }, "vad_method": { "description": "VAD (Voice Activity Detection) settings", "type": "string"