Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9042c78
feat: cleans noisy features (doi, numbered references, figure caption…
raymondcen Feb 28, 2026
fa61459
fix: text cleaning left blank lines
raymondcen Feb 28, 2026
688deaa
feat: temp pipeline for testing
raymondcen Feb 28, 2026
af2836f
fix: section headers being removed
raymondcen Feb 28, 2026
0687e62
use section priority rankings instead
raymondcen Feb 28, 2026
317387d
save cleaned text to folder
raymondcen Feb 28, 2026
17b7b55
use paragraph scoring
raymondcen Feb 28, 2026
f738892
update data for comparison later
raymondcen Feb 28, 2026
ff34c74
added another filter to drop entire paragraphs that contain irrelevan…
raymondcen Mar 2, 2026
b3b4abf
update instructions on files
raymondcen Mar 2, 2026
af63973
added retry logic to retry null returns
raymondcen Mar 2, 2026
af6aa3a
increased char and num-ctx limit on ollama
raymondcen Mar 2, 2026
cde64a1
drops paragraphs with no neg or pos signal
raymondcen Mar 2, 2026
00bcf41
added long doc test
raymondcen Mar 2, 2026
d6efaf3
switched to qwen2.5:7b
raymondcen Mar 2, 2026
b3dcf46
rewrote the system prompt to handle diverse study methods beyond stom…
raymondcen Mar 2, 2026
ad49f3c
reformat
raymondcen Mar 2, 2026
0b3a3d8
improved truncation
raymondcen Mar 12, 2026
f66d48d
added multi cpu processing option
raymondcen Mar 12, 2026
208029d
added --workers arg
raymondcen Mar 12, 2026
4a01b9d
added sequential pdf processing
raymondcen Mar 12, 2026
33adda5
loads SpellChecker() only once
raymondcen Mar 12, 2026
823fd7f
xgboost training set to gpu if available
raymondcen Mar 12, 2026
72388c1
added bypass OCR because it froze workers
raymondcen Mar 12, 2026
2263116
--labels option to process all useful papers
raymondcen Mar 12, 2026
3320e37
fixed name scanning in labels.json
raymondcen Mar 12, 2026
251d86c
reformat
raymondcen Mar 14, 2026
c961734
Merge branch 'feat/improve-logging' into feat/xgboost-rework
raymondcen Mar 15, 2026
610ba72
Merge remote-tracking branch 'origin/main' into feat/xgboost-rework
raymondcen Mar 15, 2026
6920c95
Delete data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt
raymondcen Mar 15, 2026
41c49a4
Delete data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt
raymondcen Mar 15, 2026
46dabc1
deleted .txt files
raymondcen Mar 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 191 additions & 90 deletions classsify_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import logging
import sys
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from src.preprocessing.pdf_text_extraction import extract_text_from_pdf
Expand All @@ -46,8 +47,9 @@
# Pipeline
# ---------------------------------------------------------------------------

def run_pipeline(
input_path: Path,

def _process_single_pdf(
pdf_path: Path,
model_dir: str,
llm_model: str,
output_dir: Path,
Expand All @@ -74,18 +76,18 @@ def run_pipeline(
num_ctx: Context window size for Ollama.
"""
# ── Collect PDF paths ─────────────────────────────────────────────────
if input_path.is_dir():
pdf_paths = sorted(input_path.glob("*.pdf"))
if pdf_path.is_dir():
pdf_paths = sorted(pdf_path.glob("*.pdf"))
if not pdf_paths:
print(f"[ERROR] No PDF files found in directory: {input_path}", file=sys.stderr)
log.error("No PDF files found in directory: %s", input_path)
print(f"[ERROR] No PDF files found in directory: {pdf_path}", file=sys.stderr)
log.error("No PDF files found in directory: %s", pdf_path)
sys.exit(1)
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {input_path}", file=sys.stderr)
elif input_path.is_file() and input_path.suffix.lower() == ".pdf":
pdf_paths = [input_path]
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {pdf_path}", file=sys.stderr)
elif pdf_path.is_file() and pdf_path.suffix.lower() == ".pdf":
pdf_paths = [pdf_path]
else:
print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {input_path}", file=sys.stderr)
log.error("Input must be a .pdf file or a directory of PDFs: %s", input_path)
print(f"[ERROR] pdf must be a .pdf file or a directory of PDFs: {pdf_path}", file=sys.stderr)
log.error("pdf must be a .pdf file or a directory of PDFs: %s", pdf_path)
sys.exit(1)

# ── Load classifier once (avoid re-reading model artifacts per file) ──
Expand Down Expand Up @@ -119,92 +121,186 @@ def run_pipeline(
"fraction_feeding": "",
}

# ── Step 1: Extract text ──────────────────────────────────────────
# ── Step 1: Extract text ──────────────────────────────────────────
try:
original_text = extract_text_from_pdf(str(pdf_path))
except Exception as e:
print(f" [ERROR] Text extraction failed ({pdf_path.name}): {e}", file=sys.stderr)
log.error("Text extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "text_extraction_failed"
return row

if not original_text.strip():
print(f" [WARN] No text extracted from {pdf_path.name}. Skipping.", file=sys.stderr)
log.warning("No text extracted from %s — skipping.", pdf_path.name)
row["extraction_status"] = "empty_text"
return row

print(f" [INFO] {pdf_path.name}: {len(original_text)} chars", file=sys.stderr)

# ── Step 2: Classify ──────────────────────────────────────────────
clf_model, vectorizer, encoder = load_classifier(model_dir)
label, confidence, pred_prob = classify_text(
text=original_text,
model=clf_model,
vectorizer=vectorizer,
encoder=encoder,
threshold=confidence_threshold,
)
print(f" [CLASSIFIER] {pdf_path.name} → {label} ({confidence:.2%})", file=sys.stderr)

row["classification"] = label
row["confidence"] = f"{confidence:.4f}"
row["pred_prob"] = f"{pred_prob:.4f}"

# ── Step 3: Extract ───────────────────────────────────────────────
if label == "useful":
print(f" [INFO] {pdf_path.name}: Running LLM extraction...", file=sys.stderr)

text_for_llm = original_text
if len(text_for_llm) > max_chars:
text_for_llm = extract_key_sections(text_for_llm, max_chars)
print(f" [INFO] {pdf_path.name}: trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr)

try:
original_text = extract_text_from_pdf(str(pdf_path))
metrics = extract_metrics_from_text(
text=text_for_llm,
model=llm_model,
num_ctx=num_ctx,
)
result = save_extraction_result(
metrics=metrics,
source_file=pdf_path,
original_text=original_text,
output_dir=output_dir,
)

m = result["metrics"]
row["extraction_status"] = "success"
row["species_name"] = m.get("species_name") or ""
row["study_location"] = m.get("study_location") or ""
row["study_date"] = m.get("study_date") or ""
row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"]
row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"]
row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"]
row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"]

except Exception as e:
print(f" [ERROR] Text extraction failed: {e}", file=sys.stderr)
log.error("Text extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "text_extraction_failed"
summary_rows.append(row)
continue
print(f" [ERROR] LLM extraction failed ({pdf_path.name}): {e}", file=sys.stderr)
log.error("LLM extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "extraction_failed"

if not original_text.strip():
print(f" [WARN] No text extracted from {pdf_path.name}. Skipping.", file=sys.stderr)
log.warning("No text extracted from %s — skipping.", pdf_path.name)
row["extraction_status"] = "empty_text"
else:
print(f" [INFO] {pdf_path.name}: Not useful — skipping LLM extraction.", file=sys.stderr)
row["extraction_status"] = "skipped_not_useful"

return row


def run_pipeline(
input_path: Path,
model_dir: str,
llm_model: str,
output_dir: Path,
confidence_threshold: float,
max_chars: int,
num_ctx: int,
workers: int = 1,
):
"""Run classify → extract pipeline on one or more PDFs.

For each PDF:
1. Extract text via PyMuPDF / OCR (pdf_text_extraction.py)
2. Classify with XGBoost (pdf_classifier.py)
3. If 'useful': trim text to budget (llm_text.py), run LLM extraction
(llm_client.py), and save result JSON (llm_client.py)
4. Append a row to the summary CSV regardless of classification outcome

Args:
input_path: Path to a single PDF or a directory of PDFs.
model_dir: Directory containing classifier model artifacts.
llm_model: Ollama model name for extraction.
output_dir: Where to write JSON results and the summary CSV.
confidence_threshold: Classifier probability threshold for 'useful'.
max_chars: Max characters to send to the LLM.
num_ctx: Context window size for Ollama.
workers: Number of parallel worker processes (default: 1 = sequential).
"""
# ── Collect PDF paths ─────────────────────────────────────────────────
if input_path.is_dir():
pdf_paths = sorted(input_path.glob("*.pdf"))
if not pdf_paths:
print(f"[ERROR] No PDF files found in directory: {input_path}", file=sys.stderr)
sys.exit(1)
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {input_path}", file=sys.stderr)
elif input_path.is_file() and input_path.suffix.lower() == ".pdf":
pdf_paths = [input_path]
else:
print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {input_path}", file=sys.stderr)
sys.exit(1)

output_dir.mkdir(parents=True, exist_ok=True)
summary_rows = []

if workers > 1 and len(pdf_paths) > 1:
print(f"[INFO] Using {workers} worker processes.", file=sys.stderr)
with ProcessPoolExecutor(max_workers=workers) as executor:
futures = {
executor.submit(
_process_single_pdf,
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
): pdf_path
for pdf_path in pdf_paths
}
for future in as_completed(futures):
pdf_path = futures[future]
try:
row = future.result()
except Exception as exc:
print(f" [ERROR] Worker failed for {pdf_path.name}: {exc}", file=sys.stderr)
row = {"filename": pdf_path.name, "extraction_status": "worker_failed"}
summary_rows.append(row)
else:
for idx, pdf_path in enumerate(pdf_paths, start=1):
print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)
row = _process_single_pdf(
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
)
summary_rows.append(row)
continue

print(f" [INFO] Text size: {len(original_text)} chars", file=sys.stderr)

# ── Step 2: Classify ──────────────────────────────────────────────
label, confidence, pred_prob = classify_text(
text=original_text,
model=clf_model,
vectorizer=vectorizer,
encoder=encoder,
threshold=confidence_threshold,
)
print(f" [CLASSIFIER] → {label} ({confidence:.2%} confidence)", file=sys.stderr)

row["classification"] = label
row["confidence"] = f"{confidence:.4f}"
row["pred_prob"] = f"{pred_prob:.4f}"

# ── Step 3: Extract ───────────────────────────────────────────────
if label == "useful":
print(f" [INFO] Running LLM extraction...", file=sys.stderr)

text_for_llm = original_text
if len(text_for_llm) > max_chars:
text_for_llm = extract_key_sections(text_for_llm, max_chars)
print(f" [INFO] Text trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr)

try:
metrics = extract_metrics_from_text(
text=text_for_llm,
model=llm_model,
num_ctx=num_ctx,
)
result = save_extraction_result(
metrics=metrics,
source_file=pdf_path,
original_text=original_text,
output_dir=output_dir,
)

m = result["metrics"]
row["extraction_status"] = "success"
row["species_name"] = m.get("species_name") or ""
row["study_location"] = m.get("study_location") or ""
row["study_date"] = m.get("study_date") or ""
row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"]
row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"]
row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"]
row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"]

except Exception as e:
print(f" [ERROR] LLM extraction failed: {e}", file=sys.stderr)
log.error("LLM extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "extraction_failed"

else:
print(f" [INFO] Not useful — skipping LLM extraction.", file=sys.stderr)
row["extraction_status"] = "skipped_not_useful"

summary_rows.append(row)

# ── Write summary CSV ─────────────────────────────────────────────────
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summaries_dir = output_dir / "summaries"
summaries_dir.mkdir(parents=True, exist_ok=True)
summary_path = summaries_dir / f"pipeline_summary_{timestamp}.csv"

fieldnames = [
"filename", "classification", "confidence", "pred_prob",
"extraction_status", "species_name", "study_location", "study_date",
"sample_size", "num_empty_stomachs", "num_nonempty_stomachs", "fraction_feeding",
"filename",
"classification",
"confidence",
"pred_prob",
"extraction_status",
"species_name",
"study_location",
"study_date",
"sample_size",
"num_empty_stomachs",
"num_nonempty_stomachs",
"fraction_feeding",
]
with open(summary_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
Expand Down Expand Up @@ -237,12 +333,10 @@ def run_pipeline(
# CLI entry point
# ---------------------------------------------------------------------------


def main():
parser = argparse.ArgumentParser(
description=(
"Classify PDFs as useful/not-useful, then extract structured diet "
"metrics from useful ones using an LLM."
),
description=("Classify PDFs as useful/not-useful, then extract structured diet " "metrics from useful ones using an LLM."),
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
Expand Down Expand Up @@ -301,6 +395,12 @@ def main():
default=4096,
help="Context window size for Ollama (default: 4096).",
)
parser.add_argument(
"--workers",
type=int,
default=1,
help="Number of parallel worker processes (default: 1 = sequential).",
)

args = parser.parse_args()

Expand All @@ -321,8 +421,9 @@ def main():
confidence_threshold=args.confidence_threshold,
max_chars=args.max_chars,
num_ctx=args.num_ctx,
workers=args.workers,
)


if __name__ == "__main__":
main()
main()
14 changes: 14 additions & 0 deletions data/results/metrics/Adams_1989_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"source_file": "Adams_1989.txt",
"file_type": ".txt",
"metrics": {
"species_name": null,
"study_location": "Marion Island, sub-Antarctic",
"study_date": null,
"num_empty_stomachs": null,
"num_nonempty_stomachs": null,
"sample_size": null,
"fraction_feeding": null,
"source_pages": null
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
filename,raw_chars,cleaned_chars,trimmed_chars,extraction_status,species_name,study_location,study_date,sample_size,num_empty_stomachs,num_nonempty_stomachs,fraction_feeding
Adams_1989.txt,27673,22739,4999,success,,"Marion Island, sub-Antarctic",,,,,
Loading
Loading