diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4d4c767e..6935c37a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -80,3 +80,33 @@ jobs: # Fail if any changes were written to any source files or generated untracked files - run: git add -A && git diff --cached --exit-code + + python-adapters: + name: python-adapters + needs: build + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Cache models + uses: actions/cache@v4 + with: + path: data/whisperx-env/parakeet/*.nemo + key: ${{ runner.os }}-models-${{ hashFiles('Makefile') }} + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y ffmpeg + + - name: Run Python Adapter Tests + run: | + # The tests require a lot of RAM and might fail on standard runners + # We'll see how it goes + make test-python-adapters diff --git a/Makefile b/Makefile index 259ddcbe..97739f95 100644 --- a/Makefile +++ b/Makefile @@ -115,3 +115,13 @@ test: ## Run tests using gotestsum (via go tool) test-watch: ## Run tests in watch mode using gotestsum (via go tool) @echo "Running tests in watch mode..." go tool gotestsum --watch -- -v ./... + +setup-python-tests: ## Set up Python environments and download models for tests + @echo "Setting up Python environments using Go setup tool..." + go run cmd/setup-adapters/main.go + +test-python-adapters: setup-python-tests ## Run Python adapter tests + @echo "Running Parakeet adapter tests..." + uv run --with pytest --project data/whisperx-env/parakeet pytest internal/transcription/adapters/py/nvidia/tests/ + @echo "Running PyAnnote adapter tests..." + uv run --with pytest --project data/whisperx-env/pyannote pytest internal/transcription/adapters/py/pyannote/tests/ diff --git a/cmd/server/main.go b/cmd/server/main.go index acdd3beb..ee4e7d8d 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -7,7 +7,6 @@ import ( "net/http" "os" "os/signal" - "path/filepath" "syscall" "time" @@ -21,7 +20,6 @@ import ( "scriberr/internal/service" "scriberr/internal/sse" "scriberr/internal/transcription" - "scriberr/internal/transcription/adapters" "scriberr/internal/transcription/registry" "scriberr/pkg/logger" ) @@ -45,7 +43,7 @@ var ( // @license.name MIT // @license.url https://opensource.org/licenses/MIT -// @host localhost:8080 +// @host localhost:5318 // @BasePath /api/v1 // @securityDefinitions.apikey ApiKeyAuth @@ -78,7 +76,7 @@ func main() { cfg := config.Load() // Register adapters with config-based paths - registerAdapters(cfg) + registry.RegisterStandardAdapters(cfg) // Initialize database logger.Startup("database", "Connecting to database") @@ -113,6 +111,7 @@ func main() { logger.Startup("service", "Initializing services") userService := service.NewUserService(userRepo, authService) fileService := service.NewFileService() + speakerService := service.NewSpeakerService(jobRepo) // Initialize unified transcription processor logger.Startup("transcription", "Initializing transcription service") @@ -162,6 +161,7 @@ func main() { taskQueue, unifiedProcessor, quickTranscriptionService, + speakerService, multiTrackProcessor, broadcaster, ) @@ -214,37 +214,3 @@ func main() { logger.Info("Server stopped") } - -// registerAdapters registers all transcription and diarization adapters with config-based paths -func registerAdapters(cfg *config.Config) { - logger.Info("Registering adapters with environment path", "whisperx_env", cfg.WhisperXEnv) - - // Shared environment path for NVIDIA models (NeMo-based) - nvidiaEnvPath := filepath.Join(cfg.WhisperXEnv, "parakeet") - - // Dedicated environment path for PyAnnote (to avoid dependency conflicts) - pyannoteEnvPath := filepath.Join(cfg.WhisperXEnv, "pyannote") - - // Dedicated environment path for Voxtral (Mistral AI model) - voxtralEnvPath := filepath.Join(cfg.WhisperXEnv, "voxtral") - - // Register transcription adapters - registry.RegisterTranscriptionAdapter("whisperx", - adapters.NewWhisperXAdapter(cfg.WhisperXEnv)) - registry.RegisterTranscriptionAdapter("parakeet", - adapters.NewParakeetAdapter(nvidiaEnvPath)) - registry.RegisterTranscriptionAdapter("canary", - adapters.NewCanaryAdapter(nvidiaEnvPath)) // Shares with Parakeet - registry.RegisterTranscriptionAdapter("voxtral", - adapters.NewVoxtralAdapter(voxtralEnvPath)) - registry.RegisterTranscriptionAdapter("openai_whisper", - adapters.NewOpenAIAdapter(cfg.OpenAIAPIKey)) - - // Register diarization adapters - registry.RegisterDiarizationAdapter("pyannote", - adapters.NewPyAnnoteAdapter(pyannoteEnvPath)) // Dedicated environment - registry.RegisterDiarizationAdapter("sortformer", - adapters.NewSortformerAdapter(nvidiaEnvPath)) // Shares with Parakeet - - logger.Info("Adapter registration complete") -} diff --git a/cmd/setup-adapters/main.go b/cmd/setup-adapters/main.go new file mode 100644 index 00000000..3c6c4739 --- /dev/null +++ b/cmd/setup-adapters/main.go @@ -0,0 +1,35 @@ +package main + +import ( + "context" + "flag" + "fmt" + "os" + + "scriberr/internal/config" + "scriberr/internal/transcription/registry" + "scriberr/pkg/logger" +) + +func main() { + logLevel := flag.String("log-level", "info", "Log level (debug, info, warn, error)") + flag.Parse() + + logger.Init(*logLevel) + logger.Info("Starting adapter environment setup") + + cfg := config.Load() + + // Register all standard adapters + registry.RegisterStandardAdapters(cfg) + + // Initialize all registered models synchronously + ctx := context.Background() + err := registry.GetRegistry().InitializeModelsSync(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "Error during adapter setup: %v\n", err) + os.Exit(1) + } + + logger.Info("Adapter environment setup completed successfully") +} diff --git a/internal/transcription/adapters/base_adapter.go b/internal/transcription/adapters/base_adapter.go index b0f0347a..90488eb2 100644 --- a/internal/transcription/adapters/base_adapter.go +++ b/internal/transcription/adapters/base_adapter.go @@ -65,10 +65,20 @@ func CheckEnvironmentReady(envPath, importStatement string) bool { } envCacheMutex.RUnlock() + start := time.Now() + logger.Debug("Checking environment readiness", "env_path", envPath, "import", importStatement) + // Run the actual check testCmd := exec.Command("uv", "run", "--native-tls", "--project", envPath, "python", "-c", importStatement) ready := testCmd.Run() == nil + duration := time.Since(start) + logger.Info("Environment readiness check completed", + "env_path", envPath, + "import", importStatement, + "ready", ready, + "duration", duration.String()) + // Cache the result envCacheMutex.Lock() envCache[cacheKey] = ready @@ -80,6 +90,15 @@ func CheckEnvironmentReady(envPath, importStatement string) bool { return result.(bool) } +// EnsureEnvironment ensures an environment is fully set up using singleflight to prevent redundant work +func EnsureEnvironment(envPath string, setupFn func() error) error { + key := "setup:" + envPath + _, err, _ := requestGroup.Do(key, func() (interface{}, error) { + return nil, setupFn() + }) + return err +} + // BaseAdapter provides common functionality for all model adapters type BaseAdapter struct { modelID string diff --git a/internal/transcription/adapters/canary_adapter.go b/internal/transcription/adapters/canary_adapter.go index c69f6ce3..972ce3a2 100644 --- a/internal/transcription/adapters/canary_adapter.go +++ b/internal/transcription/adapters/canary_adapter.go @@ -174,7 +174,7 @@ func (c *CanaryAdapter) PrepareEnvironment(ctx context.Context) error { } // Check if environment is already ready (using cache to speed up repeated checks) - if CheckEnvironmentReady(c.envPath, "import nemo.collections.asr") { + if CheckEnvironmentReady(c.envPath, "import nemo") { modelPath := filepath.Join(c.envPath, "canary-1b-v2.nemo") if stat, err := os.Stat(modelPath); err == nil && stat.Size() > 1024*1024 { logger.Info("Canary environment already ready") @@ -184,7 +184,7 @@ func (c *CanaryAdapter) PrepareEnvironment(ctx context.Context) error { } // Setup environment (reuse Parakeet setup since they share the same environment) - if err := c.setupCanaryEnvironment(); err != nil { + if err := EnsureEnvironment(c.envPath, c.setupCanaryEnvironment); err != nil { return fmt.Errorf("failed to setup Canary environment: %w", err) } diff --git a/internal/transcription/adapters/parakeet_adapter.go b/internal/transcription/adapters/parakeet_adapter.go index 4fa252d4..08163b2f 100644 --- a/internal/transcription/adapters/parakeet_adapter.go +++ b/internal/transcription/adapters/parakeet_adapter.go @@ -134,8 +134,17 @@ func (p *ParakeetAdapter) PrepareEnvironment(ctx context.Context) error { return fmt.Errorf("failed to create buffered script: %w", err) } + // Copy transcription scripts (standard and buffered) + if err := p.copyTranscriptionScript(); err != nil { + return fmt.Errorf("failed to copy transcription script: %w", err) + } + + if err := p.copyBufferedScript(); err != nil { + return fmt.Errorf("failed to create buffered script: %w", err) + } + // Check if environment is already ready (using cache to speed up repeated checks) - if CheckEnvironmentReady(p.envPath, "import nemo.collections.asr") { + if CheckEnvironmentReady(p.envPath, "import nemo") { modelPath := filepath.Join(p.envPath, "parakeet-tdt-0.6b-v3.nemo") scriptPath := filepath.Join(p.envPath, "parakeet_transcribe.py") bufferedScriptPath := filepath.Join(p.envPath, "parakeet_transcribe_buffered.py") @@ -158,8 +167,8 @@ func (p *ParakeetAdapter) PrepareEnvironment(ctx context.Context) error { logger.Info("Parakeet environment not ready, setting up") } - // Setup environment - if err := p.setupParakeetEnvironment(); err != nil { + // Setup environment (shared with other NVIDIA adapters) + if err := EnsureEnvironment(p.envPath, p.setupParakeetEnvironment); err != nil { return fmt.Errorf("failed to setup Parakeet environment: %w", err) } diff --git a/internal/transcription/adapters/py/nvidia/canary_transcribe.py b/internal/transcription/adapters/py/nvidia/canary_transcribe.py index c9599129..88dcaedf 100644 --- a/internal/transcription/adapters/py/nvidia/canary_transcribe.py +++ b/internal/transcription/adapters/py/nvidia/canary_transcribe.py @@ -195,10 +195,11 @@ def main(): include_confidence=args.include_confidence, preserve_formatting=args.preserve_formatting, ) - except Exception as e: - print(f"Error during transcription: {e}") + except Exception: + import traceback + traceback.print_exc() sys.exit(1) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/internal/transcription/adapters/py/nvidia/pyproject.toml b/internal/transcription/adapters/py/nvidia/pyproject.toml index fdbe9937..8a49b4dd 100644 --- a/internal/transcription/adapters/py/nvidia/pyproject.toml +++ b/internal/transcription/adapters/py/nvidia/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "soundfile", "ml-dtypes>=0.3.1,<0.5.0", "onnx>=1.15.0,<1.18.0", + "qdrant-client", # "pyannote.audio" # needed for sortformer or no? ] @@ -23,12 +24,10 @@ dev = [ [tool.uv.sources] nemo-toolkit = { git = "https://github.com/NVIDIA/NeMo.git", tag = "v2.5.3" } torch = [ - { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, { index = "pytorch-cpu", marker = "platform_machine != 'x86_64' and sys_platform != 'darwin'" }, { index = "pytorch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] torchaudio = [ - { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, { index = "pytorch-cpu", marker = "platform_machine != 'x86_64' and sys_platform != 'darwin'" }, { index = "pytorch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] diff --git a/internal/transcription/adapters/pyannote_adapter.go b/internal/transcription/adapters/pyannote_adapter.go index b4168e0b..b7da2d4b 100644 --- a/internal/transcription/adapters/pyannote_adapter.go +++ b/internal/transcription/adapters/pyannote_adapter.go @@ -188,7 +188,7 @@ func (p *PyAnnoteAdapter) PrepareEnvironment(ctx context.Context) error { } // Check if PyAnnote is already available (using cache to speed up repeated checks) - if CheckEnvironmentReady(p.envPath, "from pyannote.audio import Pipeline") { + if CheckEnvironmentReady(p.envPath, "import pyannote") { logger.Info("PyAnnote already available in environment") // Still ensure script exists if err := p.copyDiarizationScript(); err != nil { @@ -199,13 +199,12 @@ func (p *PyAnnoteAdapter) PrepareEnvironment(ctx context.Context) error { } // Create environment if it doesn't exist or is incomplete - if err := p.setupPyAnnoteEnvironment(); err != nil { + if err := EnsureEnvironment(p.envPath, p.setupPyAnnoteEnvironment); err != nil { return fmt.Errorf("failed to setup PyAnnote environment: %w", err) } // Verify PyAnnote is now available - testCmd := exec.Command("uv", "run", "--native-tls", "--project", p.envPath, "python", "-c", "from pyannote.audio import Pipeline") - if testCmd.Run() != nil { + if !CheckEnvironmentReady(p.envPath, "import pyannote") { logger.Warn("PyAnnote environment test still failed after setup") } diff --git a/internal/transcription/registry/registration.go b/internal/transcription/registry/registration.go new file mode 100644 index 00000000..eabbc751 --- /dev/null +++ b/internal/transcription/registry/registration.go @@ -0,0 +1,45 @@ +package registry + +import ( + "path/filepath" + "scriberr/internal/config" + "scriberr/internal/transcription/adapters" + "scriberr/pkg/logger" +) + +// RegisterStandardAdapters registers all built-in model adapters using the provided configuration. +// This centralizes adapter registration so it can be used by the server, CLI, and setup tools. +func RegisterStandardAdapters(cfg *config.Config) { + // Shared environment path for NVIDIA models (NeMo-based) + nvidiaEnvPath := filepath.Join(cfg.WhisperXEnv, "parakeet") + + // Dedicated environment path for PyAnnote (to avoid dependency conflicts) + pyannoteEnvPath := filepath.Join(cfg.WhisperXEnv, "pyannote") + + // Dedicated environment path for Voxtral (Mistral AI model) + voxtralEnvPath := filepath.Join(cfg.WhisperXEnv, "voxtral") + + logger.Info("Registering standard adapters", + "nvidia_env", nvidiaEnvPath, + "pyannote_env", pyannoteEnvPath) + + // Register transcription adapters + RegisterTranscriptionAdapter("parakeet", + adapters.NewParakeetAdapter(nvidiaEnvPath)) + RegisterTranscriptionAdapter("canary", + adapters.NewCanaryAdapter(nvidiaEnvPath)) + RegisterTranscriptionAdapter("voxtral", + adapters.NewVoxtralAdapter(voxtralEnvPath)) + RegisterTranscriptionAdapter("openai_whisper", + adapters.NewOpenAIAdapter(cfg.OpenAIAPIKey)) + + // Register diarization adapters + RegisterDiarizationAdapter("sortformer", + adapters.NewSortformerAdapter(nvidiaEnvPath)) + + // PyAnnote is registered here so it's available in the setup tool and server + RegisterDiarizationAdapter("pyannote", + adapters.NewPyAnnoteAdapter(pyannoteEnvPath)) + + logger.Info("Standard adapter registration complete") +}