diff --git a/classsify_extract.py b/classsify_extract.py index a37c8d5..4cdaa9f 100644 --- a/classsify_extract.py +++ b/classsify_extract.py @@ -31,6 +31,7 @@ import logging import sys from datetime import datetime +from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path from src.preprocessing.pdf_text_extraction import extract_text_from_pdf @@ -46,8 +47,9 @@ # Pipeline # --------------------------------------------------------------------------- -def run_pipeline( - input_path: Path, + +def _process_single_pdf( + pdf_path: Path, model_dir: str, llm_model: str, output_dir: Path, @@ -74,18 +76,18 @@ def run_pipeline( num_ctx: Context window size for Ollama. """ # ── Collect PDF paths ───────────────────────────────────────────────── - if input_path.is_dir(): - pdf_paths = sorted(input_path.glob("*.pdf")) + if pdf_path.is_dir(): + pdf_paths = sorted(pdf_path.glob("*.pdf")) if not pdf_paths: - print(f"[ERROR] No PDF files found in directory: {input_path}", file=sys.stderr) - log.error("No PDF files found in directory: %s", input_path) + print(f"[ERROR] No PDF files found in directory: {pdf_path}", file=sys.stderr) + log.error("No PDF files found in directory: %s", pdf_path) sys.exit(1) - print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {input_path}", file=sys.stderr) - elif input_path.is_file() and input_path.suffix.lower() == ".pdf": - pdf_paths = [input_path] + print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {pdf_path}", file=sys.stderr) + elif pdf_path.is_file() and pdf_path.suffix.lower() == ".pdf": + pdf_paths = [pdf_path] else: - print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {input_path}", file=sys.stderr) - log.error("Input must be a .pdf file or a directory of PDFs: %s", input_path) + print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {pdf_path}", file=sys.stderr) + log.error("Input must be a .pdf file or a directory of PDFs: %s", pdf_path) sys.exit(1) # ── Load classifier once (avoid re-reading 
model artifacts per file) ── @@ -119,92 +121,186 @@ def run_pipeline( "fraction_feeding": "", } - # ── Step 1: Extract text ────────────────────────────────────────── + # ── Step 1: Extract text ────────────────────────────────────────── + try: + original_text = extract_text_from_pdf(str(pdf_path)) + except Exception as e: + print(f" [ERROR] Text extraction failed ({pdf_path.name}): {e}", file=sys.stderr) + log.error("Text extraction failed for %s: %s", pdf_path.name, e) + row["extraction_status"] = "text_extraction_failed" + return row + + if not original_text.strip(): + print(f" [WARN] No text extracted from {pdf_path.name}. Skipping.", file=sys.stderr) + log.warning("No text extracted from %s — skipping.", pdf_path.name) + row["extraction_status"] = "empty_text" + return row + + print(f" [INFO] {pdf_path.name}: {len(original_text)} chars", file=sys.stderr) + + # ── Step 2: Classify ────────────────────────────────────────────── + clf_model, vectorizer, encoder = load_classifier(model_dir) + label, confidence, pred_prob = classify_text( + text=original_text, + model=clf_model, + vectorizer=vectorizer, + encoder=encoder, + threshold=confidence_threshold, + ) + print(f" [CLASSIFIER] {pdf_path.name} → {label} ({confidence:.2%})", file=sys.stderr) + + row["classification"] = label + row["confidence"] = f"{confidence:.4f}" + row["pred_prob"] = f"{pred_prob:.4f}" + + # ── Step 3: Extract ─────────────────────────────────────────────── + if label == "useful": + print(f" [INFO] {pdf_path.name}: Running LLM extraction...", file=sys.stderr) + + text_for_llm = original_text + if len(text_for_llm) > max_chars: + text_for_llm = extract_key_sections(text_for_llm, max_chars) + print(f" [INFO] {pdf_path.name}: trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr) + try: - original_text = extract_text_from_pdf(str(pdf_path)) + metrics = extract_metrics_from_text( + text=text_for_llm, + model=llm_model, + num_ctx=num_ctx, + ) + result = 
save_extraction_result( + metrics=metrics, + source_file=pdf_path, + original_text=original_text, + output_dir=output_dir, + ) + + m = result["metrics"] + row["extraction_status"] = "success" + row["species_name"] = m.get("species_name") or "" + row["study_location"] = m.get("study_location") or "" + row["study_date"] = m.get("study_date") or "" + row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"] + row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] + row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] + row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"] + except Exception as e: - print(f" [ERROR] Text extraction failed: {e}", file=sys.stderr) - log.error("Text extraction failed for %s: %s", pdf_path.name, e) - row["extraction_status"] = "text_extraction_failed" - summary_rows.append(row) - continue + print(f" [ERROR] LLM extraction failed ({pdf_path.name}): {e}", file=sys.stderr) + log.error("LLM extraction failed for %s: %s", pdf_path.name, e) + row["extraction_status"] = "extraction_failed" - if not original_text.strip(): - print(f" [WARN] No text extracted from {pdf_path.name}. Skipping.", file=sys.stderr) - log.warning("No text extracted from %s — skipping.", pdf_path.name) - row["extraction_status"] = "empty_text" + else: + print(f" [INFO] {pdf_path.name}: Not useful — skipping LLM extraction.", file=sys.stderr) + row["extraction_status"] = "skipped_not_useful" + + return row + + +def run_pipeline( + input_path: Path, + model_dir: str, + llm_model: str, + output_dir: Path, + confidence_threshold: float, + max_chars: int, + num_ctx: int, + workers: int = 1, +): + """Run classify → extract pipeline on one or more PDFs. + + For each PDF: + 1. Extract text via PyMuPDF / OCR (pdf_text_extraction.py) + 2. Classify with XGBoost (pdf_classifier.py) + 3. 
If 'useful': trim text to budget (llm_text.py), run LLM extraction + (llm_client.py), and save result JSON (llm_client.py) + 4. Append a row to the summary CSV regardless of classification outcome + + Args: + input_path: Path to a single PDF or a directory of PDFs. + model_dir: Directory containing classifier model artifacts. + llm_model: Ollama model name for extraction. + output_dir: Where to write JSON results and the summary CSV. + confidence_threshold: Classifier probability threshold for 'useful'. + max_chars: Max characters to send to the LLM. + num_ctx: Context window size for Ollama. + workers: Number of parallel worker processes (default: 1 = sequential). + """ + # ── Collect PDF paths ───────────────────────────────────────────────── + if input_path.is_dir(): + pdf_paths = sorted(input_path.glob("*.pdf")) + if not pdf_paths: + print(f"[ERROR] No PDF files found in directory: {input_path}", file=sys.stderr) + sys.exit(1) + print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {input_path}", file=sys.stderr) + elif input_path.is_file() and input_path.suffix.lower() == ".pdf": + pdf_paths = [input_path] + else: + print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {input_path}", file=sys.stderr) + sys.exit(1) + + output_dir.mkdir(parents=True, exist_ok=True) + summary_rows = [] + + if workers > 1 and len(pdf_paths) > 1: + print(f"[INFO] Using {workers} worker processes.", file=sys.stderr) + with ProcessPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + _process_single_pdf, + pdf_path, + model_dir, + llm_model, + output_dir, + confidence_threshold, + max_chars, + num_ctx, + ): pdf_path + for pdf_path in pdf_paths + } + for future in as_completed(futures): + pdf_path = futures[future] + try: + row = future.result() + except Exception as exc: + print(f" [ERROR] Worker failed for {pdf_path.name}: {exc}", file=sys.stderr) + row = {"filename": pdf_path.name, "extraction_status": "worker_failed"} + summary_rows.append(row) + 
else: + for idx, pdf_path in enumerate(pdf_paths, start=1): + print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr) + row = _process_single_pdf( + pdf_path, + model_dir, + llm_model, + output_dir, + confidence_threshold, + max_chars, + num_ctx, + ) summary_rows.append(row) - continue - - print(f" [INFO] Text size: {len(original_text)} chars", file=sys.stderr) - - # ── Step 2: Classify ────────────────────────────────────────────── - label, confidence, pred_prob = classify_text( - text=original_text, - model=clf_model, - vectorizer=vectorizer, - encoder=encoder, - threshold=confidence_threshold, - ) - print(f" [CLASSIFIER] → {label} ({confidence:.2%} confidence)", file=sys.stderr) - - row["classification"] = label - row["confidence"] = f"{confidence:.4f}" - row["pred_prob"] = f"{pred_prob:.4f}" - - # ── Step 3: Extract ─────────────────────────────────────────────── - if label == "useful": - print(f" [INFO] Running LLM extraction...", file=sys.stderr) - - text_for_llm = original_text - if len(text_for_llm) > max_chars: - text_for_llm = extract_key_sections(text_for_llm, max_chars) - print(f" [INFO] Text trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr) - - try: - metrics = extract_metrics_from_text( - text=text_for_llm, - model=llm_model, - num_ctx=num_ctx, - ) - result = save_extraction_result( - metrics=metrics, - source_file=pdf_path, - original_text=original_text, - output_dir=output_dir, - ) - - m = result["metrics"] - row["extraction_status"] = "success" - row["species_name"] = m.get("species_name") or "" - row["study_location"] = m.get("study_location") or "" - row["study_date"] = m.get("study_date") or "" - row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"] - row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] - row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] - 
row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"] - - except Exception as e: - print(f" [ERROR] LLM extraction failed: {e}", file=sys.stderr) - log.error("LLM extraction failed for %s: %s", pdf_path.name, e) - row["extraction_status"] = "extraction_failed" - - else: - print(f" [INFO] Not useful — skipping LLM extraction.", file=sys.stderr) - row["extraction_status"] = "skipped_not_useful" - - summary_rows.append(row) # ── Write summary CSV ───────────────────────────────────────────────── + from datetime import datetime + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") summaries_dir = output_dir / "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) summary_path = summaries_dir / f"pipeline_summary_{timestamp}.csv" fieldnames = [ - "filename", "classification", "confidence", "pred_prob", - "extraction_status", "species_name", "study_location", "study_date", - "sample_size", "num_empty_stomachs", "num_nonempty_stomachs", "fraction_feeding", + "filename", + "classification", + "confidence", + "pred_prob", + "extraction_status", + "species_name", + "study_location", + "study_date", + "sample_size", + "num_empty_stomachs", + "num_nonempty_stomachs", + "fraction_feeding", ] with open(summary_path, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) @@ -237,12 +333,10 @@ def run_pipeline( # CLI entry point # --------------------------------------------------------------------------- + def main(): parser = argparse.ArgumentParser( - description=( - "Classify PDFs as useful/not-useful, then extract structured diet " - "metrics from useful ones using an LLM." 
- ), + description=("Classify PDFs as useful/not-useful, then extract structured diet " "metrics from useful ones using an LLM."), formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -301,6 +395,12 @@ def main(): default=4096, help="Context window size for Ollama (default: 4096).", ) + parser.add_argument( + "--workers", + type=int, + default=1, + help="Number of parallel worker processes (default: 1 = sequential).", + ) args = parser.parse_args() @@ -321,8 +421,9 @@ def main(): confidence_threshold=args.confidence_threshold, max_chars=args.max_chars, num_ctx=args.num_ctx, + workers=args.workers, ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/data/results/metrics/Adams_1989_results.json b/data/results/metrics/Adams_1989_results.json new file mode 100644 index 0000000..851776d --- /dev/null +++ b/data/results/metrics/Adams_1989_results.json @@ -0,0 +1,14 @@ +{ + "source_file": "Adams_1989.txt", + "file_type": ".txt", + "metrics": { + "species_name": null, + "study_location": "Marion Island, sub-Antarctic", + "study_date": null, + "num_empty_stomachs": null, + "num_nonempty_stomachs": null, + "sample_size": null, + "fraction_feeding": null, + "source_pages": null + } +} \ No newline at end of file diff --git a/data/results/summaries/txt_pipeline_summary_20260227_211255.csv b/data/results/summaries/txt_pipeline_summary_20260227_211255.csv new file mode 100644 index 0000000..b88726f --- /dev/null +++ b/data/results/summaries/txt_pipeline_summary_20260227_211255.csv @@ -0,0 +1,2 @@ +filename,raw_chars,cleaned_chars,trimmed_chars,extraction_status,species_name,study_location,study_date,sample_size,num_empty_stomachs,num_nonempty_stomachs,fraction_feeding +Adams_1989.txt,27673,22739,4999,success,,"Marion Island, sub-Antarctic",,,,, diff --git a/extract-from-txt.py b/extract-from-txt.py new file mode 100644 index 0000000..0ac01d1 --- /dev/null +++ b/extract-from-txt.py @@ -0,0 +1,395 @@ +"""Extract-from-TXT 
Pipeline + +Processes pre-classified useful .txt files through noise cleaning, section +filtering, text trimming, and LLM extraction — bypassing the XGBoost +classifier entirely. + +Every .txt file fed to this script is assumed to have already been confirmed +as useful (e.g. by the classifier in classsify_extract.py or by manual review). +The pipeline: + + 1. Read raw .txt file + 2. Strip noise (references, acknowledgements, affiliations, captions, …) + via src/preprocessing/text_cleaner.py + 3. Drop irrelevant paragraphs (taxonomy, morphometrics, stats methods, …) + via src/preprocessing/section_filter.py + 4. Trim to the character budget using section-priority ranking + via src/llm/llm_text.py::extract_key_sections() + 5. Call Ollama for structured extraction via src/llm/llm_client.py + 6. Save result JSON per file and a summary CSV + +Usage:: + + # Process the default directory (data/processed-text/) + python extract-from-txt.py + + # Custom input directory + python extract-from-txt.py --input-dir path/to/txt_files/ + + # Full options + python extract-from-txt.py \\ + --input-dir data/processed-text/ \\ + --output-dir data/results/ \\ + --llm-model llama3.1:8b \\ + --max-chars 10000 \\ + --num-ctx 8192 + +Output: + - data/cleaned-text/text_cleaner/_.txt noise-stripped text + - data/cleaned-text/section_filter/_.txt section-filtered text + - data/cleaned-text/llm_text/_.txt trimmed text passed to Ollama + - data/results/metrics/_results.json per file + - data/results/summaries/txt_pipeline_summary_.csv overall +""" + +import argparse +import csv +import sys +from datetime import datetime +from pathlib import Path + +# Ensure the project root is on sys.path regardless of where this script is +# invoked from. 
+_PROJECT_ROOT = Path(__file__).resolve().parent +sys.path.insert(0, str(_PROJECT_ROOT)) + +from src.preprocessing.text_cleaner import clean_text +from src.preprocessing.section_filter import filter_relevant_sections +from src.llm.llm_text import extract_key_sections +from src.llm.llm_client import extract_metrics_from_text, save_extraction_result + + +# --------------------------------------------------------------------------- +# Core pipeline function +# --------------------------------------------------------------------------- + + +def run_txt_pipeline( + input_dir: Path, + output_dir: Path, + llm_model: str, + max_chars: int, + num_ctx: int, + single_file: Path = None, + useful_stems: set = None, +) -> None: + """Process every .txt file in *input_dir* through clean → filter → trim → extract. + + Args: + input_dir: Directory containing pre-classified useful .txt files. + Ignored when *single_file* is provided. + output_dir: Root output directory for JSON results and summary CSV. + llm_model: Ollama model name (e.g. ``"llama3.1:8b"``). + max_chars: Character budget for the text sent to Ollama. + num_ctx: Context window size requested from Ollama. + single_file: If set, process only this one .txt file. 
+ """ + if single_file is not None: + txt_paths = [single_file] + else: + txt_paths = sorted(input_dir.glob("*.txt")) + if useful_stems is not None: + txt_paths = [p for p in txt_paths if p.stem in useful_stems or p.name in useful_stems] + if not txt_paths: + print(f"[ERROR] No .txt files found in: {input_dir}", file=sys.stderr) + sys.exit(1) + + print(f"[INFO] Found {len(txt_paths)} .txt file(s) to process", file=sys.stderr) + output_dir.mkdir(parents=True, exist_ok=True) + cleaner_text_dir = output_dir.parent / "cleaned-text" / "text_cleaner" + filter_text_dir = output_dir.parent / "cleaned-text" / "section_filter" + llm_text_dir = output_dir.parent / "cleaned-text" / "llm_text" + cleaner_text_dir.mkdir(parents=True, exist_ok=True) + filter_text_dir.mkdir(parents=True, exist_ok=True) + llm_text_dir.mkdir(parents=True, exist_ok=True) + summary_rows = [] + + for idx, txt_path in enumerate(txt_paths, start=1): + print(f"\n[{idx}/{len(txt_paths)}] {txt_path.name}", file=sys.stderr) + + row: dict = { + "filename": txt_path.name, + "raw_chars": "", + "cleaned_chars": "", + "filtered_chars": "", + "trimmed_chars": "", + "extraction_status": "", + "species_name": "", + "study_location": "", + "study_date": "", + "sample_size": "", + "num_empty_stomachs": "", + "num_nonempty_stomachs": "", + "fraction_feeding": "", + } + + # ── Step 1: Read ──────────────────────────────────────────────────── + try: + raw_text = txt_path.read_text(encoding="utf-8") + except Exception as exc: + print(f" [ERROR] Could not read file: {exc}", file=sys.stderr) + row["extraction_status"] = "read_failed" + summary_rows.append(row) + continue + + row["raw_chars"] = len(raw_text) + print(f" [INFO] Raw size : {len(raw_text):,} chars", file=sys.stderr) + + if not raw_text.strip(): + print(f" [WARN] File is empty — skipping.", file=sys.stderr) + row["extraction_status"] = "empty_file" + summary_rows.append(row) + continue + + # ── Step 2: Clean ─────────────────────────────────────────────────── + 
cleaned = clean_text(raw_text) + row["cleaned_chars"] = len(cleaned) + print(f" [INFO] After clean: {len(cleaned):,} chars", file=sys.stderr) + + if not cleaned.strip(): + print(f" [WARN] Nothing left after cleaning — skipping.", file=sys.stderr) + row["extraction_status"] = "empty_after_clean" + summary_rows.append(row) + continue + + # ── Step 3: Save text_cleaner output ──────────────────────────────── + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + cleaner_path = cleaner_text_dir / f"{txt_path.stem}_{ts}.txt" + try: + cleaner_path.write_text(cleaned, encoding="utf-8") + print(f" [INFO] Cleaner text : {cleaner_path.name}", file=sys.stderr) + except Exception as exc: + print(f" [WARN] Could not save cleaner text: {exc}", file=sys.stderr) + + # ── Step 4: Section filter ────────────────────────────────────────── + filtered = filter_relevant_sections(cleaned) + row["filtered_chars"] = len(filtered) + print(f" [INFO] After filter: {len(filtered):,} chars", file=sys.stderr) + + # ── Step 4b: Save section_filter output ───────────────────────────── + filter_path = filter_text_dir / f"{txt_path.stem}_{ts}.txt" + try: + filter_path.write_text(filtered, encoding="utf-8") + print(f" [INFO] Filter text : {filter_path.name}", file=sys.stderr) + except Exception as exc: + print(f" [WARN] Could not save filter text: {exc}", file=sys.stderr) + + # ── Step 5: Trim to LLM budget ────────────────────────────────────── + if len(filtered) > max_chars: + trimmed = extract_key_sections(filtered, max_chars) + print( + f" [INFO] After trim : {len(trimmed):,} chars " f"(budget {max_chars:,})", + file=sys.stderr, + ) + else: + trimmed = filtered + + row["trimmed_chars"] = len(trimmed) + + # ── Step 5b: Save llm_text output ────────────────────────────────── + llm_path = llm_text_dir / f"{txt_path.stem}_{ts}.txt" + try: + llm_path.write_text(trimmed, encoding="utf-8") + print(f" [INFO] LLM text : {llm_path.name}", file=sys.stderr) + except Exception as exc: + print(f" [WARN] Could not 
save LLM text: {exc}", file=sys.stderr) + + # ── Step 6: LLM extraction ────────────────────────────────────────── + print(f" [INFO] Calling Ollama ({llm_model})…", file=sys.stderr) + try: + metrics = extract_metrics_from_text( + text=trimmed, + model=llm_model, + num_ctx=num_ctx, + ) + except Exception as exc: + print(f" [ERROR] Ollama extraction failed: {exc}", file=sys.stderr) + row["extraction_status"] = "extraction_failed" + summary_rows.append(row) + continue + + # ── Step 7: Save JSON ─────────────────────────────────────────────── + try: + result = save_extraction_result( + metrics=metrics, + source_file=txt_path, + original_text=raw_text, # keep full text for page resolution + output_dir=output_dir, + ) + except Exception as exc: + print(f" [ERROR] Could not save result: {exc}", file=sys.stderr) + row["extraction_status"] = "save_failed" + summary_rows.append(row) + continue + + m = result["metrics"] + row["extraction_status"] = "success" + row["species_name"] = m.get("species_name") or "" + row["study_location"] = m.get("study_location") or "" + row["study_date"] = m.get("study_date") or "" + row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"] + row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"] + row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"] + row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"] + + print( + f" [OK] species={m.get('species_name')} " f"n={m.get('sample_size')} " f"date={m.get('study_date')}", + file=sys.stderr, + ) + + summary_rows.append(row) + + # ── Write summary CSV ─────────────────────────────────────────────────── + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + summaries_dir = output_dir / "summaries" + summaries_dir.mkdir(parents=True, exist_ok=True) + summary_path = summaries_dir / f"txt_pipeline_summary_{timestamp}.csv" + + fieldnames = [ + 
"filename", + "raw_chars", + "cleaned_chars", + "filtered_chars", + "trimmed_chars", + "extraction_status", + "species_name", + "study_location", + "study_date", + "sample_size", + "num_empty_stomachs", + "num_nonempty_stomachs", + "fraction_feeding", + ] + with open(summary_path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(summary_rows) + + # ── Final report ──────────────────────────────────────────────────────── + total = len(summary_rows) + succeeded = sum(1 for r in summary_rows if r["extraction_status"] == "success") + failed = total - succeeded + + print("\n" + "=" * 55, file=sys.stderr) + print("TXT EXTRACTION PIPELINE COMPLETE", file=sys.stderr) + print("=" * 55, file=sys.stderr) + print(f" Files processed : {total}", file=sys.stderr) + print(f" Successful : {succeeded}", file=sys.stderr) + print(f" Failed / skipped : {failed}", file=sys.stderr) + print(f" Summary CSV : {summary_path}", file=sys.stderr) + print("=" * 55, file=sys.stderr) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser( + description=("Extract structured predator-diet metrics from pre-classified " "useful .txt files using Ollama."), + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + Default (data/processed-text/ → data/results/): + python extract-from-txt.py + + Custom directories: + python extract-from-txt.py --input-dir data/useful-txt/ --output-dir out/ + + Different model / tighter budget: + python extract-from-txt.py --llm-model mistral:7b --max-chars 4500 + """, + ) + parser.add_argument( + "--file", + type=str, + default=None, + help="Path to a single .txt file to process. 
Overrides --input-dir.", + ) + parser.add_argument( + "--input-dir", + type=str, + default="data/processed-text", + help="Directory of .txt files to process (default: data/processed-text).", + ) + parser.add_argument( + "--output-dir", + type=str, + default="data/results", + help="Root output directory for JSON results and CSV summary " "(default: data/results).", + ) + parser.add_argument( + "--llm-model", + type=str, + # default="llama3.1:8b", + default="qwen2.5:7b", + help="Ollama model name (default: qwen2.5:7b).", + ) + parser.add_argument( + "--max-chars", + type=int, + default=10000, + help="Maximum characters to send to Ollama after cleaning (default: 10000).", + ) + parser.add_argument( + "--num-ctx", + type=int, + default=8192, + help="Ollama context window size (default: 8192).", + ) + parser.add_argument( + "--labels", + type=str, + default=None, + help="Path to labels.json. When provided, only files labelled 'useful' are processed.", + ) + + args = parser.parse_args() + + # ── Load label filter ─────────────────────────────────────────────── + useful_stems = None + if args.labels: + import json + + labels_path = Path(args.labels) + if not labels_path.exists(): + print(f"[ERROR] Labels file not found: {labels_path}", file=sys.stderr) + sys.exit(1) + with open(labels_path, encoding="utf-8") as f: + labels = json.load(f) + useful_stems = {k for k, v in labels.items() if v == "useful"} + print(f"[INFO] Labels filter: {len(useful_stems)} useful papers", file=sys.stderr) + + single_file = None + if args.file: + single_file = Path(args.file) + if not single_file.exists(): + print(f"[ERROR] File not found: {single_file}", file=sys.stderr) + sys.exit(1) + if single_file.suffix.lower() != ".txt": + print(f"[ERROR] --file must point to a .txt file: {single_file}", file=sys.stderr) + sys.exit(1) + input_dir = single_file.parent + else: + input_dir = Path(args.input_dir) + if not input_dir.exists(): + print(f"[ERROR] Input directory not found: {input_dir}", 
file=sys.stderr) + sys.exit(1) + if not input_dir.is_dir(): + print(f"[ERROR] --input-dir must be a directory: {input_dir}", file=sys.stderr) + sys.exit(1) + + run_txt_pipeline( + input_dir=input_dir, + output_dir=Path(args.output_dir), + llm_model=args.llm_model, + max_chars=args.max_chars, + num_ctx=args.num_ctx, + single_file=single_file, + useful_stems=useful_stems, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py index f7ff28f..b03bb4c 100644 --- a/scripts/full_pipeline.py +++ b/scripts/full_pipeline.py @@ -25,8 +25,10 @@ import os import json import argparse +import time +from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError from pathlib import Path -from typing import Dict +from typing import Dict, Tuple import subprocess import sys @@ -49,6 +51,30 @@ from src.preprocessing.pdf_text_extraction import extract_text_from_pdf_bytes +# Module-level flag set once per worker process via initializer +_worker_skip_ocr: bool = False + + +def _init_worker(skip_ocr: bool) -> None: + """Called once per worker process to set shared config.""" + global _worker_skip_ocr + _worker_skip_ocr = skip_ocr + + +def _extract_local_pdf(args: Tuple[Path, str]) -> Tuple[str, str, str | None]: + """Worker: read a local PDF and return (txt_name, label, text | None).""" + pdf_path, label = args + print(f" [STARTED] {pdf_path.name}", flush=True) + try: + with open(pdf_path, "rb") as f: + pdf_bytes = f.read() + text = extract_text_from_pdf_bytes(pdf_bytes, skip_ocr=_worker_skip_ocr) + return (f"{pdf_path.stem}.txt", label, text) + except Exception as e: + print(f"Error processing {pdf_path.name}: {e}") + return (f"{pdf_path.stem}.txt", label, None) + + def run(cmd): print(f"$ {' '.join(cmd)}") r = subprocess.run(cmd) @@ -97,7 +123,7 @@ def process_api_mode(): print(f"Wrote {len(labels)} labeled text files.") -def process_local_mode(data_path: Path): +def process_local_mode(data_path: Path, workers: int = 
1, skip_ocr: bool = False, timeout: int = 120): """Process PDFs from local directory.""" if not data_path.exists(): raise RuntimeError(f"Data path does not exist: {data_path}") @@ -112,28 +138,84 @@ def process_local_mode(data_path: Path): out_dir = Path("data/processed-text") out_dir.mkdir(parents=True, exist_ok=True) - labels: Dict[str, str] = {} + # Resume support: load existing labels and skip already-processed files + labels_file = Path("data/labels.json") + if labels_file.exists(): + with labels_file.open("r", encoding="utf-8") as f: + labels: Dict[str, str] = json.load(f) + already_done = {name for name in labels if (out_dir / name).exists()} + print(f"[INFO] Resuming — {len(already_done)} files already processed, skipping them.") + else: + labels = {} + already_done = set() + + # Build work items: (pdf_path, label) + work_items = [] for folder, label in [(useful_dir, "useful"), (not_useful_dir, "not-useful")]: pdf_files = list(folder.glob("*.pdf")) print(f"Found {len(pdf_files)} PDFs in local folder '{label}'") - for pdf_path in pdf_files: - try: - with open(pdf_path, "rb") as f: - pdf_bytes = f.read() - text = extract_text_from_pdf_bytes(pdf_bytes) - stem = pdf_path.stem - txt_name = f"{stem}.txt" + txt_name = f"{pdf_path.stem}.txt" + if txt_name in already_done: + continue + work_items.append((pdf_path, label)) + + if not work_items: + print("[INFO] All files already processed. 
Nothing to do.") + write_labels(labels, labels_file) + return + + print(f"[INFO] {len(work_items)} PDFs to process.") + if skip_ocr: + print("[INFO] OCR disabled — using embedded text only (fast mode).") + + total = len(work_items) + done = 0 + failed = 0 + t0 = time.time() + + if workers > 1 and len(work_items) > 1: + print(f"[INFO] Using {workers} worker processes for PDF extraction.") + with ProcessPoolExecutor( + max_workers=workers, + initializer=_init_worker, + initargs=(skip_ocr,), + ) as executor: + futures = {executor.submit(_extract_local_pdf, item): item for item in work_items} + for future in as_completed(futures): + pdf_path, label = futures[future] + try: + txt_name, label, text = future.result(timeout=timeout) + except TimeoutError: + print(f" [TIMEOUT] {pdf_path.name} exceeded {timeout}s — skipped") + failed += 1 + continue + except Exception as exc: + print(f" [ERROR] {pdf_path.name}: {exc}") + failed += 1 + continue + done += 1 + if text is not None: + (out_dir / txt_name).write_text(text, encoding="utf-8") + labels[txt_name] = label + elapsed = time.time() - t0 + print(f" [{done + failed}/{total}] Processed {txt_name} ({elapsed:.0f}s elapsed)") + # Checkpoint labels every 50 files + if done % 50 == 0: + write_labels(labels, labels_file) + else: + for item in work_items: + txt_name, label, text = _extract_local_pdf(item) + done += 1 + if text is not None: (out_dir / txt_name).write_text(text, encoding="utf-8") labels[txt_name] = label - print(f"Processed {pdf_path.name}") - except Exception as e: - print(f"Error processing {pdf_path.name}: {e}") - continue + elapsed = time.time() - t0 + print(f" [{done}/{total}] Processed {txt_name} ({elapsed:.0f}s elapsed)") - write_labels(labels, Path("data/labels.json")) - print(f"Wrote {len(labels)} labeled text files.") + write_labels(labels, labels_file) + print(f"Wrote {len(labels)} labeled text files. 
({failed} timed out / failed)") def main(): @@ -152,11 +234,30 @@ def main(): group.add_argument("--api", action="store_true", help="Use API mode to download PDFs from Google Drive") group.add_argument("--local", type=Path, metavar="PATH", help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)") + parser.add_argument( + "--workers", + type=int, + default=0, + help="Number of parallel worker processes for PDF extraction (default: 0 = auto-detect CPU count).", + ) + parser.add_argument( + "--skip-ocr", + action="store_true", + help="Skip Tesseract OCR fallback — use only embedded text (much faster, recommended for first pass).", + ) + parser.add_argument( + "--timeout", + type=int, + default=120, + help="Per-PDF timeout in seconds (default: 120). PDFs exceeding this are skipped.", + ) + args = parser.parse_args() + workers = args.workers if args.workers > 0 else os.cpu_count() or 4 if args.local: print(f"Running in LOCAL mode with data path: {args.local}") - process_local_mode(args.local) + process_local_mode(args.local, workers=workers, skip_ocr=args.skip_ocr, timeout=args.timeout) else: # args.api print("Running in API mode (Google Drive)") process_api_mode() diff --git a/src/llm/llm_client.py b/src/llm/llm_client.py index a5bcf19..cd25900 100644 --- a/src/llm/llm_client.py +++ b/src/llm/llm_client.py @@ -32,20 +32,27 @@ def extract_metrics_from_text( text: str, - model: str = "llama3.1:8b", - num_ctx: int = 4096, + # model: str = "llama3.1:8b", + model: str = "qwen2.5:7b", + num_ctx: int = 8192, + _retry: bool = False, ) -> PredatorDietMetrics: """Extract structured metrics from text using Ollama. + On the first call, if any fields come back null the function automatically + retries once with a focused follow-up prompt that gives method-specific + hints for finding the missing data. + Args: text: Preprocessed text content from a scientific publication. model: Name of the Ollama model to use. 
num_ctx: Context window size to request from Ollama (lower = less memory). + _retry: Internal flag — True when this is the automatic retry attempt. Returns: PredatorDietMetrics object with extracted data. """ - prompt = f"""You are a scientific data extraction assistant. Your task is to read a predator diet survey publication and return a single flat JSON object with exactly these fields: + prompt = f"""You are a scientific data extraction assistant. Your task is to read a predator diet study and return a single flat JSON object with exactly these fields: species_name - string or null study_location - string or null @@ -54,46 +61,67 @@ def extract_metrics_from_text( num_nonempty_stomachs - integer (>= 0) or null sample_size - integer (> 0) or null -Use null for any field whose value cannot be confidently determined from the text. +Use null ONLY when the value truly cannot be determined from any part of the text. FIELD DEFINITIONS -species_name: Binomial Latin name (Genus species) of the PRIMARY PREDATOR whose diet is studied. This is the animal whose stomachs/guts were examined, not its prey. Return exactly one species. If multiple predators are studied, choose the one with the most stomach samples. Capitalize the genus, lowercase the specific epithet (e.g., "Pygoscelis papua"). - -study_location: Geographic area where predator specimens were collected. Include site, region, and country if available (e.g., "Marion Island, sub-Antarctic"). Check Methods, Study Area, or Study Site sections. - -study_date: Year or year-range of specimen collection, NOT publication year. Format "YYYY" or "YYYY-YYYY". Look for phrases like "specimens collected in", "sampling period", "field season", "between [year] and [year]". Return null if only publication year is visible. - -num_empty_stomachs: Number of predators with stomachs containing no food. Synonyms: "empty", "vacant", "without food", "zero prey items", "stomachs with no contents", "N individuals had empty stomachs". 
- -num_nonempty_stomachs: Number of predators with stomachs containing food. Synonyms: "non-empty", "with food", "containing prey", "with contents", "fed", "N contained food", "N had prey items". - -sample_size: Total number of predator individuals examined. When both num_empty_stomachs and num_nonempty_stomachs are available, sample_size equals their sum. Look for phrases like "N stomachs were examined", "a total of N individuals", "N specimens", "n=", "sample size of N". +species_name: Binomial Latin name (Genus species) of the PRIMARY PREDATOR whose diet is studied. This is the animal being studied, not its prey. Return exactly one species. If multiple predators appear, choose the one with the most samples. Capitalize genus, lowercase epithet (e.g., "Pygoscelis papua"). + +study_location: Geographic area where specimens were collected. Include site, region, and country if available (e.g., "Marion Island, sub-Antarctic"). Check Methods, Study Area, Study Site, and Abstract. + +study_date: Year or year-range of specimen collection, NOT publication year. Format "YYYY" or "YYYY-YYYY". + Where to look: + - "specimens collected in", "sampling period", "field season", "between [year] and [year]" + - "from March 1984 to March 1985" → "1984-1985" + - "Received 23 November 2007" in article info suggests collection was ~2005-2007 + - If the only dates are "Received" or "Accepted" submission dates and no collection dates are stated, estimate the collection period as 1-2 years before submission. + +num_empty_stomachs: Number of predators with NO food in their digestive tract. 
Apply broadly across study methods: + - Stomach dissection: "empty", "vacant", "without food", "zero prey items" + - Stomach pumping / lavage: "yielded no food", "no contents obtained", "produced no material" + - Scat / fecal analysis: "scats with no identifiable prey", "empty scats" + - Regurgitation: "failed to regurgitate", "no pellet produced" + - Immunoassay / molecular: "tested negative for all prey", "no prey detected" + If the study uses stomach pumping and ALL samples contained food, set this to 0. + +num_nonempty_stomachs: Number of predators with food in their digestive tract. Same method mapping as above: + - "non-empty", "with food", "containing prey", "with contents", "fed" + - Stomach pumping: "food samples collected", "samples containing prey" + - Scat: "scats with identifiable prey remains" + - Immunoassay: "tested positive for prey", "positive reactions" + If study says "a total of N food samples was collected" and implies ALL had food, set num_nonempty_stomachs = N. + +sample_size: Total number of predator individuals examined. Equals num_empty + num_nonempty when both are known. + - "N stomachs examined", "N individuals", "N specimens", "n=N", "a total of N" + - "N food samples" when all sampled animals contributed one sample + - "two groups of 225" → sample_size = 450 + - Check Abstract, Methods, and Results. RULES -- Do not invent data; use null if ambiguous or missing. +- Do not invent data; use null only if truly ambiguous or missing. - Return a single JSON object; do not return arrays. - Ignore page markers [PAGE N]. - Prioritize Abstract, Methods, and Results sections. -- Be especially careful to distinguish collection dates from publication dates. +- Carefully distinguish collection dates from publication/submission dates. +- If ALL samples had food (e.g., stomach pumping where every sample yielded prey), set num_empty_stomachs = 0 and num_nonempty_stomachs = sample_size. EXAMPLES -1. 
Simple complete case: -{{"species_name": "Pygoscelis papua", "study_location": "Marion Island, sub-Antarctic", "study_date": "1984-1985", "num_empty_stomachs": 5, "num_nonempty_stomachs": 15, "sample_size": 20}} +1. Traditional stomach dissection: +{{"species_name": "Canis lupus", "study_location": "Yellowstone National Park, Wyoming, USA", "study_date": "2019", "num_empty_stomachs": 5, "num_nonempty_stomachs": 47, "sample_size": 52}} + +2. Stomach pumping (all samples had food): +{{"species_name": "Pygoscelis papua", "study_location": "Marion Island, sub-Antarctic", "study_date": "1984-1985", "num_empty_stomachs": 0, "num_nonempty_stomachs": 144, "sample_size": 144}} -2. Missing empty stomach data (can infer from sample_size): -{{"species_name": "Canis lupus", "study_location": "Yellowstone National Park, Wyoming, USA", "study_date": "2019", "num_empty_stomachs": null, "num_nonempty_stomachs": 47, "sample_size": 52}} +3. Immunoassay / molecular detection: +{{"species_name": "Nucella lapillus", "study_location": "Swans Island, Maine, USA", "study_date": "2005-2007", "num_empty_stomachs": null, "num_nonempty_stomachs": null, "sample_size": 450}} -3. Multi-year study: +4. Scat / fecal analysis: {{"species_name": "Vulpes vulpes", "study_location": "Bristol, UK", "study_date": "2015-2018", "num_empty_stomachs": 12, "num_nonempty_stomachs": 88, "sample_size": 100}} -4. Minimal data available: +5. Minimal data: {{"species_name": "Ursus arctos", "study_location": null, "study_date": "2020", "num_empty_stomachs": null, "num_nonempty_stomachs": null, "sample_size": 23}} -5. 
Only some fields extractable: -{{"species_name": "Zalophus californianus", "study_location": "California coast", "study_date": null, "num_empty_stomachs": 8, "num_nonempty_stomachs": 34, "sample_size": 42}} - TEXT {text} """ @@ -101,9 +129,78 @@ def extract_metrics_from_text( messages=[{"role": "user", "content": prompt}], model=model, format=PredatorDietMetrics.model_json_schema(), + options={"num_ctx": num_ctx}, ) metrics = PredatorDietMetrics.model_validate_json(response.message.content) + + # ── Retry once if any extractable fields are null ─────────────────────── + _retryable = [ + "species_name", + "study_location", + "study_date", + "num_empty_stomachs", + "num_nonempty_stomachs", + "sample_size", + ] + missing = [f for f in _retryable if getattr(metrics, f) is None] + + if not _retry and missing: + print( + f" [INFO] Retry: {', '.join(missing)} came back null — re-prompting", + file=sys.stderr, + ) + + # Build targeted hints for each missing field + _hints = { + "species_name": ("- species_name: Look for the first binomial Latin name (Genus species) " "in the title or abstract. This is the PREDATOR, not its prey.\n"), + "study_location": ("- study_location: Check Methods or Study Area sections for place names, " "islands, countries, or coordinates.\n"), + "study_date": ( + "- study_date: Look for phrases like 'collected in', 'sampled during', " + "'field season', 'from [month] [year] to [month] [year]'. " + "If no collection date is explicit, infer from 'Received [date]' — " + "collection is typically 1-2 years before manuscript submission.\n" + ), + "num_empty_stomachs": ( + "- num_empty_stomachs: Look for 'empty', 'no food', 'no contents', " + "'negative for prey'. If ALL samples had food (e.g., stomach pumping " + "where every sample produced material), return 0.\n" + ), + "num_nonempty_stomachs": ( + "- num_nonempty_stomachs: Look for 'contained food', 'with prey', " "'non-empty', 'food samples collected'. 
If ALL samples had food, " "this equals sample_size.\n" + ), + "sample_size": ("- sample_size: Look for 'N stomachs', 'N specimens', 'a total of N', " "'n=N', 'N individuals examined', 'two groups of N'. Check Abstract, " "Methods, and Results.\n"), + } + + retry_prompt = ( + "The following fields were returned as null. Please re-read the text " + "carefully — especially the Abstract, Methods, and Results sections — " + "and try harder to find values for them. Think about different study " + "methods (stomach pumping, scat analysis, immunoassays, etc.).\n\n" + f"Missing fields: {', '.join(missing)}\n\n" + "Hints:\n" + ) + for field in missing: + retry_prompt += _hints.get(field, "") + retry_prompt += f"\nTEXT\n{text}" + + retry_response = chat( + messages=[{"role": "user", "content": retry_prompt}], + model=model, + format=PredatorDietMetrics.model_json_schema(), + options={"num_ctx": num_ctx}, + ) + retry_metrics = PredatorDietMetrics.model_validate_json(retry_response.message.content) + + # Merge: prefer retry values for fields that were null, keep originals otherwise + merged = metrics.model_dump() + retry_dict = retry_metrics.model_dump() + for field in _retryable: + if merged.get(field) is None and retry_dict.get(field) is not None: + merged[field] = retry_dict[field] + + metrics = PredatorDietMetrics.model_validate(merged) + return metrics @@ -158,10 +255,11 @@ def save_extraction_result( def main(): parser = argparse.ArgumentParser(description="Extract predator diet metrics from PDFs or text files using LLM") parser.add_argument("input_file", type=str, help="Path to the input file (.pdf or .txt)") - parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)") + # parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)") + parser.add_argument("--model", type=str, default="qwen2.5:7b", help="Ollama model to use (default: qwen2.5:7b)") 
parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results/metrics)") - parser.add_argument("--max-chars", type=int, default=48000, help="Maximum characters of text to send to the model (default: 48000). Reduce if you hit CUDA/OOM errors.") - parser.add_argument("--num-ctx", type=int, default=4096, help="Context window size for the model (default: 4096). Lower values use less memory.") + parser.add_argument("--max-chars", type=int, default=12000, help="Maximum characters of text to send to the model (default: 12000). Reduce if you hit CUDA/OOM errors.") + parser.add_argument("--num-ctx", type=int, default=8192, help="Context window size for the model (default: 8192). Lower values use less memory.") args = parser.parse_args() diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py index 6887889..2083115 100644 --- a/src/llm/llm_text.py +++ b/src/llm/llm_text.py @@ -9,12 +9,189 @@ import re import sys from pathlib import Path -from typing import List, Tuple +from typing import List, Optional, Tuple project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) -from src.preprocessing.pdf_text_extraction import extract_text_from_pdf +# NOTE: pdf_text_extraction is imported lazily inside read_file_text() to +# avoid pulling in heavy PDF dependencies (camelot, fitz) when only the +# text-based pipeline is used. + +# --------------------------------------------------------------------------- +# Section-boundary splitting helpers +# --------------------------------------------------------------------------- + +# Optional numeric prefix shared by all section patterns, e.g. "1.", "2.1.", "3.2.1 " +_NUM_PREFIX = r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" 
+ +# (pattern, priority) — lower number = kept first when budget is tight +_SECTION_PRIORITIES: List[Tuple[re.Pattern, int]] = [ + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(abstract|summary)\s*[:\.]?\s*$"), 0), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(results?|findings?)\s*[:\.]?\s*$"), 1), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(materials?\s*(?:and|&)\s*methods?|methods?|methodology" r"|study\s*(?:area|site|design|region|period))\s*[:\.]?\s*$"), 2), + (re.compile(r"(?i)^\s*table\s*\d"), 3), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(introduction|background)\s*[:\.]?\s*$"), 4), + (re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(discussion|conclusions?|summary\s+and\s+discussion)\s*[:\.]?\s*$"), 5), +] + +_DROP_SECTION_RE: re.Pattern = re.compile( + r"(?i)^\s*" + r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" # optional numeric prefix + r"(" + r"acknowledge?ments?" + r"|literature\s+cited" + r"|references?\s+cited" + r"|references?" + r"|bibliography" + r"|appendix\b" + r"|supplementary\s+(data|material|information)" + r"|supporting\s+information" + r"|conflict\s+of\s+interest" + r"|competing\s+interests?" + r"|author\s+contributions?" + r"|funding(?:\s+(?:sources?|information))?" + r"|data\s+availability" + r"|ethics\s+(statement|declaration)" + r")\s*[:\.]?\s*$" +) + + +def _section_priority(heading: str) -> int: + """Return the priority integer for a section heading (lower = more important). + Unknown / un-labelled sections get priority 6. + Drop sections return 999 and should be excluded before calling this. + """ + for pat, pri in _SECTION_PRIORITIES: + if pat.match(heading.strip()): + return pri + return 6 + + +# --------------------------------------------------------------------------- +# Paragraph-level keyword scoring +# --------------------------------------------------------------------------- +# Each tuple is (compiled pattern, score weight). A paragraph’s total score +# is the sum of weights for every pattern that matches anywhere in it. 
+# Higher-scoring paragraphs are packed into the LLM budget first. + +_FIELD_PATTERNS: List[Tuple[re.Pattern, int]] = [ + # sample_size — explicit counts of stomachs / specimens / individuals + ( + re.compile( + r"(?i)(\bn\s*=\s*\d+" + r"|total\s+of\s+\d+" + r"|\d+\s+stomachs?" + r"|\d+\s+specimens?" + r"|\d+\s+individuals?" + r"|\d+\s+birds?" + r"|\d+\s+fish" + r"|\d+\s+samples?" + r"|sample\s+size\s+(of\s+)?\d+" + r"|examined\s+\d+" + r"|\d+\s+(were|was)\s+(examined|collected|analysed|analyzed|sampled))" + ), + 4, + ), + # num_empty_stomachs — explicit empty-stomach language + ( + re.compile( + r"(?i)(empty\s+stomachs?" + r"|stomachs?\s+(were\s+)?empty" + r"|had\s+empty" + r"|without\s+food" + r"|without\s+(stomach\s+)?contents?" + r"|zero\s+prey" + r"|no\s+food\s+(items?|remains?)" + r"|vacuous|vacant\s+stomachs?)" + ), + 5, + ), + # num_nonempty_stomachs / fraction_feeding + ( + re.compile( + r"(?i)(non.?empty" + r"|contained\s+(food|prey|items?)" + r"|with\s+food" + r"|with\s+(stomach\s+)?contents?" + r"|had\s+(food|prey)" + r"|feeding\s+rate" + r"|proportion\s+(feeding|with\s+food)" + r"|percent\s+(feeding|with\s+food)" + r"|\d+\s*%\s+of\s+(stomachs?|individuals?|birds?|fish|specimens?))" + ), + 5, + ), + # frequency of occurrence — standard diet-study metric + ( + re.compile(r"(?i)(frequency\s+of\s+occurrence" r"|occurrence\s+frequency" r"|%\s*fo\b" r"|\bfo\s*=\s*\d" r"|index\s+of\s+relative\s+importance" r"|\biri\b)"), + 3, + ), + # stomach content / diet composition — general relevance signal + ( + re.compile(r"(?i)(stomach\s+content" r"|gut\s+content" r"|diet\s+composition" r"|food\s+habits?" 
r"|dietary\s+(analysis|study|data))"), + 2, + ), + # general percentage / fraction near gut/stomach context + (re.compile(r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" r"|\d+\s+of\s+\d+\s+(were|had|contained)" r"|proportion\s+of\s+\d+)"), 2), + # study date — collection period + ( + re.compile( + r"(?i)(collected\s+(in|during|between)" + r"|sampled\s+(in|during|between)" + r"|field\s+season" + r"|study\s+period" + r"|between\s+\d{4}\s+and\s+\d{4}" + r"|\d{4}[\-\u2013]\d{4}" + r"|sampling\s+period)" + ), + 2, + ), + # study location + (re.compile(r"(?i)(study\s+(area|site|region)" r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" r"|sampling\s+(location|site|area))"), 2), + # any binomial species name (used as a weak relevance signal) + (re.compile(r"\b[A-Z][a-z]+\s+[a-z]{3,}\b"), 1), +] + +# Maximum characters reserved for the pinned abstract/preamble. Any +# remaining budget is filled by keyword-scored paragraphs. +_ABSTRACT_CAP: int = 2000 + + +def _truncate_at_sentence(text: str, limit: int) -> str: + """Truncate *text* to at most *limit* chars, preferring a sentence boundary. + + Searches backwards from *limit* for the last ``.``, ``!``, or ``?``. If + one is found in the second half of the allowed window it is used; otherwise + the hard character limit is applied. + """ + if len(text) <= limit: + return text + candidate = text[:limit] + for punct in (".", "!", "?"): + pos = candidate.rfind(punct) + if pos > limit // 2: + return candidate[: pos + 1] + return candidate + + +def _score_paragraph(para: str) -> int: + """Return a keyword-relevance score for a single paragraph of text. + + Paragraphs that mention many extraction targets (stomach counts, empty / + non-empty language, sample sizes, dates, locations) score higher and are + preferentially included in the LLM prompt. 
+ """ + score = 0 + for pat, weight in _FIELD_PATTERNS: + if pat.search(para): + score += weight + return score + + +# --------------------------------------------------------------------------- +# Legacy page-split helpers (kept for source-page resolution in llm_client.py) +# --------------------------------------------------------------------------- log = logging.getLogger(__name__) @@ -78,22 +255,34 @@ def classify_page(page_text: str) -> Tuple[bool, int]: def extract_key_sections(text: str, max_chars: int) -> str: """Return the most informative portion of text within the character budget. - Strategy: - 1. Split the paper into pages using [PAGE N] markers - 2. Drop pages belonging to References/Acknowledgements/Appendix - 3. Rank remaining pages by section priority: - Abstract > Results > Methods > Tables > Introduction > Discussion > other - 4. Greedily pack pages in priority order until the budget is spent - 5. Re-order selected pages by their original page number so the LLM - sees them in reading order + Two-phase strategy + ------------------ + Phase 1 — Abstract pin + Always include the preamble (everything before the first section + heading), which almost always contains the abstract, title, and key + study metadata. Capped at ``_ABSTRACT_CAP`` characters so it cannot + crowd out the data-rich content below. + + Phase 2 — Keyword-scored paragraph mining + Split all remaining non-dropped section text into blank-line-separated + paragraphs. Score each paragraph by how many extraction-relevant + keywords it contains (sample counts, empty/non-empty stomach language, + percentages, dates, locations, species names). Pack the + highest-scoring paragraphs first until the remaining budget is full. + Re-order selected paragraphs to their original document position so + the LLM receives coherent, in-order text. 
+ + This approach guarantees that sentences like + “A total of 144 stomach samples… 58% contained food” + are always included regardless of which named section they fall in. Args: - text: Full text of the document - max_chars: Maximum character budget for the output + text: Cleaned text of the document (may contain [PAGE N] markers). + max_chars: Maximum character budget for the output. Returns: - Extracted text containing the most relevant sections within the budget. - If the full text fits within max_chars, it is returned as-is. + Extracted text fitting within *max_chars*. + If the full text already fits, it is returned unchanged. """ if len(text) <= max_chars: return text @@ -114,13 +303,97 @@ def extract_key_sections(text: str, max_chars: int) -> str: if len(page_with_marker) <= budget: selected.append((page_num, page_with_marker)) budget -= len(page_with_marker) + lines = text.split("\n") + + # ── Phase 1: Split document into named sections ──────────────────────── + # Each entry: (start_line_idx, heading, content) + sections: List[Tuple[int, str, str]] = [] + current_heading: str = "[PREAMBLE]" + current_start: int = 0 + current_lines: List[str] = [] + + for i, line in enumerate(lines): + stripped = line.strip() + is_drop = bool(_DROP_SECTION_RE.match(stripped)) if stripped else False + is_known = any(pat.match(stripped) for pat, _ in _SECTION_PRIORITIES) if stripped else False + if is_drop or is_known: + sections.append((current_start, current_heading, "\n".join(current_lines))) + current_heading = stripped + current_start = i + current_lines = [] + else: + current_lines.append(line) + sections.append((current_start, current_heading, "\n".join(current_lines))) + + # ── Phase 1 result: pin the abstract/preamble ────────────────────────── + # Sections whose content is always pinned (counted against _ABSTRACT_CAP): + # the implicit preamble and any explicitly-named Abstract/Summary section. 
+ _ABSTRACT_HEADING_RE = re.compile(r"(?i)^\s*" + _NUM_PREFIX + r"(abstract|summary)\s*[:\.]?\s*$") + + preamble_parts: List[str] = [] + body_sections: List[Tuple[int, str, str]] = [] # (start, heading, content) + for start, heading, content in sections: + if _DROP_SECTION_RE.match(heading.strip()) if heading.strip() else False: + continue # hard-drop references / acknowledgements / appendix + if heading == "[PREAMBLE]" or _ABSTRACT_HEADING_RE.match(heading.strip()): + preamble_parts.append(content.strip()) + else: + body_sections.append((start, heading, content)) + + preamble_text = "\n\n".join(p for p in preamble_parts if p)[:_ABSTRACT_CAP] + + budget = max_chars - len(preamble_text) + + # ── Phase 2: keyword-scored paragraph mining ─────────────────────────── + # Collect every paragraph from ALL body sections (blank-line separated). + # Each paragraph remembers its position so we can restore reading order. + # (start_line, paragraph_text, keyword_score) + raw_paragraphs: List[Tuple[int, str, int]] = [] + + for sec_start, heading, content in body_sections: + # Prepend the section heading so the LLM sees which section it’s in. 
+ block = (f"{heading}\n{content}").strip() if heading else content.strip() + if not block: + continue + # Split on blank lines into paragraphs + para_lines: List[str] = [] + para_start = sec_start + for j, ln in enumerate(block.split("\n")): + if ln.strip(): + para_lines.append(ln) + else: + if para_lines: + para_text = "\n".join(para_lines).strip() + raw_paragraphs.append((sec_start + j, para_text, _score_paragraph(para_text))) + para_lines = [] + para_start = sec_start + j + 1 + if para_lines: + para_text = "\n".join(para_lines).strip() + raw_paragraphs.append((para_start, para_text, _score_paragraph(para_text))) + + # Sort by score descending; use original position as tiebreaker (earlier first) + raw_paragraphs.sort(key=lambda t: (-t[2], t[0])) + + # Greedily fill budget with highest-scoring paragraphs + selected_paras: List[Tuple[int, str]] = [] # (orig_pos, text) + for pos, para_text, score in raw_paragraphs: + if budget <= 0: + break + if len(para_text) <= budget: + selected_paras.append((pos, para_text)) + budget -= len(para_text) elif budget > 200: - selected.append((page_num, page_with_marker[:budget])) + selected_paras.append((pos, _truncate_at_sentence(para_text, budget))) budget = 0 - break - selected.sort(key=lambda t: t[0]) - return "\n".join(chunk for _, chunk in selected) + # Re-sort to original document order so the LLM reads coherent text + selected_paras.sort(key=lambda t: t[0]) + + parts: List[str] = [] + if preamble_text: + parts.append(preamble_text) + parts.extend(p for _, p in selected_paras) + return "\n\n".join(parts) def load_document(file_path: Path) -> str: @@ -139,6 +412,8 @@ def load_document(file_path: Path) -> str: if suffix == '.pdf': print(f"[INFO] Reading PDF file...", file=sys.stderr) + from src.preprocessing.pdf_text_extraction import extract_text_from_pdf + return extract_text_from_pdf(str(file_path)) elif suffix in ['.txt', '.text']: print(f"[INFO] Reading text file...", file=sys.stderr) diff --git a/src/model/train_model.py 
b/src/model/train_model.py index 4f6e2b5..d9663e7 100644 --- a/src/model/train_model.py +++ b/src/model/train_model.py @@ -93,7 +93,19 @@ def train_pdf_classifier(texts, labels, output_dir="src/model/models"): dtrain = xgb.DMatrix(X_train_vec, label=y_train) dtest = xgb.DMatrix(X_test_vec, label=y_test) + # Use GPU if available (e.g. HPC gpu nodes), fall back to CPU + try: + _probe = xgb.DMatrix(X_train_vec[:1], label=y_train[:1]) + xgb.train({"device": "cuda", "tree_method": "hist"}, _probe, num_boost_round=1) + device, tree_method = "cuda", "hist" + print("[INFO] GPU detected — training with CUDA.") + except xgb.core.XGBoostError: + device, tree_method = "cpu", "hist" + print("[INFO] No GPU available — training on CPU.") + params = { + "device": device, + "tree_method": tree_method, "objective": "binary:logistic", "eval_metric": "logloss", "eta": 0.05, diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index a877b20..b7fa00c 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -35,6 +35,9 @@ # Maximum allowed ratio of misspelled words to total words in a pdf MAX_SPELLING_ERROR_RATE = 0.05 +# Module-level singleton — avoids reloading the dictionary for every PDF +_spell_checker = SpellChecker() + def check_spelling(text: str) -> float: """ @@ -42,11 +45,10 @@ def check_spelling(text: str) -> float: Returns 1 if no words detected in input string """ - spellChecker = SpellChecker() - words = spellChecker.split_words(text) + words = _spell_checker.split_words(text) if len(words) == 0: return 1 - misspelled = spellChecker.unknown(words) + misspelled = _spell_checker.unknown(words) return len(misspelled) / len(words) @@ -183,7 +185,7 @@ def parse_page_embedded(page: fitz.Page, page_num: int) -> str: return f"[PAGE {page_num}]\n{page_text}" -def extract_text_from_pdf(pdf_path: str) -> str: +def extract_text_from_pdf(pdf_path: str, skip_ocr: bool = False) -> str: text = [] 
try: with fitz.open(pdf_path) as doc: @@ -195,7 +197,8 @@ def extract_text_from_pdf(pdf_path: str) -> str: log.error("Failed to extract text from %s: %s", pdf_path, e) return "" - if check_spelling(text) > MAX_SPELLING_ERROR_RATE: + if not skip_ocr and check_spelling(text) > MAX_SPELLING_ERROR_RATE: + print(f"[INFO] High misspelling rate in {Path(pdf_path).name} — falling back to OCR", file=sys.stderr) log.warning("High spelling error rate in %s — falling back to OCR extraction.", pdf_path) text = [] try: @@ -210,7 +213,7 @@ def extract_text_from_pdf(pdf_path: str) -> str: return text -def extract_text_from_pdf_bytes(data: bytes) -> str: +def extract_text_from_pdf_bytes(data: bytes, skip_ocr: bool = False) -> str: """Extract text from an in-memory PDF without writing the PDF to disk.""" text = [] try: @@ -223,7 +226,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: log.error("Failed to extract text from PDF bytes: %s", e) return "" - if check_spelling(text) > MAX_SPELLING_ERROR_RATE: + if not skip_ocr and check_spelling(text) > MAX_SPELLING_ERROR_RATE: + print(f"[INFO] High misspelling rate — falling back to OCR", file=sys.stderr) log.warning("High spelling error rate in in-memory PDF — falling back to OCR extraction.") text = [] try: diff --git a/src/preprocessing/section_filter.py b/src/preprocessing/section_filter.py new file mode 100644 index 0000000..5ba03d3 --- /dev/null +++ b/src/preprocessing/section_filter.py @@ -0,0 +1,392 @@ +"""Coarse relevance filtering for cleaned scientific-paper text. 
+ +Runs between ``text_cleaner.clean_text()`` and +``llm_text.extract_key_sections()`` to drop entire paragraphs that are very +unlikely to contain any of the target extraction metrics: + + - predator species names + - study locations + - collection dates + - sample sizes (n=, stomachs examined) + - empty / non-empty stomach counts + - feeding fraction data + +Paragraphs about pure taxonomy debates, phylogenetic analysis, habitat ecology +without location data, literature reviews of other studies, detailed prey-ID +methodology, morphometric measurements, or statistical test descriptions are +dropped. + +**Conservative by design** — a paragraph is kept unless it matches at least one +negative pattern *and* scores zero on every positive pattern. Borderline +paragraphs are always retained. + +Exposes one primary function:: + + filter_relevant_sections(text: str) -> str + +The function preserves ``[PAGE N]`` markers and section headings so the +downstream ``extract_key_sections()`` can still do its section-priority ranking. +""" + +import re +from typing import List, Tuple + +# --------------------------------------------------------------------------- +# Positive (keep) patterns — if ANY matches, the paragraph is kept regardless +# of negative signals. Mirrors the field patterns in llm_text.py but cast as +# a binary keep gate rather than a weighted scorer. +# --------------------------------------------------------------------------- + +_POSITIVE_PATTERNS: List[re.Pattern] = [ + # sample_size — explicit counts of stomachs / specimens / individuals + re.compile( + r"(?i)(\bn\s*=\s*\d+" + r"|total\s+of\s+\d+" + r"|\d+\s+stomachs?" + r"|\d+\s+specimens?" + r"|\d+\s+individuals?" + r"|\d+\s+birds?" + r"|\d+\s+fish" + r"|\d+\s+samples?" + r"|sample\s+size\s+(of\s+)?\d+" + r"|examined\s+\d+" + r"|\d+\s+(were|was)\s+(examined|collected|analysed|analyzed|sampled))" + ), + # empty-stomach language + re.compile( + r"(?i)(empty\s+stomachs?" 
+ r"|stomachs?\s+(were\s+)?empty" + r"|had\s+empty" + r"|without\s+food" + r"|without\s+(stomach\s+)?contents?" + r"|zero\s+prey" + r"|no\s+food\s+(items?|remains?)" + r"|vacuous|vacant\s+stomachs?)" + ), + # non-empty / fraction feeding + re.compile( + r"(?i)(non.?empty" + r"|contained\s+(food|prey|items?)" + r"|with\s+food" + r"|with\s+(stomach\s+)?contents?" + r"|had\s+(food|prey)" + r"|feeding\s+rate" + r"|proportion\s+(feeding|with\s+food)" + r"|percent\s+(feeding|with\s+food)" + r"|\d+\s*%\s+of\s+(stomachs?|individuals?|birds?|fish|specimens?))" + ), + # percentage / fraction near gut/stomach context + re.compile(r"(?i)(\d+\.?\d*\s*%|\d+\s+percent" r"|\d+\s+of\s+\d+\s+(were|had|contained)" r"|proportion\s+of\s+\d+)"), + # study date — collection period + re.compile( + r"(?i)(collected\s+(in|during|between)" + r"|sampled\s+(in|during|between)" + r"|field\s+season" + r"|study\s+period" + r"|between\s+\d{4}\s+and\s+\d{4}" + r"|\d{4}[\-\u2013]\d{4}" + r"|sampling\s+period)" + ), + # study location + re.compile(r"(?i)(study\s+(area|site|region)" r"|specimens?\s+(were\s+)?(obtained|collected|caught)\s+(from|at|in)" r"|sampling\s+(location|site|area))"), + # gut / stomach / diet core vocabulary + re.compile( + r"(?i)(stomach\s+content" + r"|diet\s+(composition|analysis|study)" + r"|gut\s+content" + r"|food\s+(items?|habits?|composition)" + r"|prey\s+(items?|composition|species|frequency)" + r"|trophic\s+(level|ecology|niche)" + r"|feeding\s+(ecology|habits?|behaviour|behavior)" + r"|gastrointestinal|foregut|hindgut|crop\s+content)" + ), + # predator species name — binomial in a sentence with diet/food/stomach + re.compile(r"(?i)\b[A-Z][a-z]+\s+[a-z]{3,}\b.{0,80}" r"(stomach|diet|prey|food|feeding|gut|trophic|forag)"), + # geographic coordinates or explicit lat/lon + re.compile(r"(\d{1,3}[°º]\s*\d{0,2}['′]?\s*[NS]" r"|\d{1,3}[°º]\s*\d{0,2}['′]?\s*[EW]" r"|latitude|longitude" r"|\d+\.\d+\s*[°º]?\s*[NS],?\s*\d+\.\d+\s*[°º]?\s*[EW])"), + # table-like numeric data (rows 
of numbers separated by whitespace/tabs) + re.compile(r"(?m)^.*(\d+\s*[\t|]\s*){2,}\d+"), +] + +# --------------------------------------------------------------------------- +# Negative (drop-candidate) patterns — a paragraph is dropped ONLY when it +# matches at least one negative pattern AND matches ZERO positive patterns. +# --------------------------------------------------------------------------- + +_NEGATIVE_PATTERNS: List[re.Pattern] = [ + # taxonomy / systematics debates + re.compile( + r"(?i)(phylogenet(ic|ics)" + r"|cladistic" + r"|taxonom(y|ic)" + r"|systemat(ic|ics)" + r"|monophyl(y|etic)" + r"|paraphyl(y|etic)" + r"|polyphyl(y|etic)" + r"|sister\s+(group|taxon|clade)" + r"|molecular\s+(clock|phylogen)" + r"|bayesian\s+(inference|analysis|tree)" + r"|maximum\s+likelihood\s+tree" + r"|bootstrap\s+(support|value)" + r"|posterior\s+probabilit)" + ), + # habitat ecology descriptions (without study-site info) + re.compile( + r"(?i)(habitat\s+(type|preference|selection|use|suitability)" + r"|home\s+range\s+(size|area|overlap)" + r"|territory\s+(size|defense|overlap)" + r"|canopy\s+(cover|closure|height)" + r"|vegetation\s+(type|structure|cover|composition|survey)" + r"|understory|understorey" + r"|basal\s+area" + r"|tree\s+(density|dbh|diameter)" + r"|forest\s+type\s+(was|is|include))" + ), + # literature review / citation-heavy passages + re.compile( + r"(?i)(several\s+(studies|authors|investigators)\s+(have|found|reported|showed)" + r"|previous(ly)?\s+(reported|described|documented|found|studied)" + r"|has\s+been\s+(reported|documented|described)\s+by" + r"|according\s+to\s+\w+\s*(\(\d{4}\)|\d{4})" + r"|consistent\s+with\s+(the\s+)?(findings?|results?)\s+of" + r"|\(\s*see\s+(also\s+)?\w+\s*(et\s+al\.?)?\s*,?\s*\d{4}\)" + r"|reviewed\s+(by|in)\s+\w+)" + ), + # detailed prey identification methodology + re.compile( + r"(?i)(prey\s+(were\s+)?identified\s+(to|using|by|under)" + r"|identification\s+(key|guide|manual)" + r"|taxonomic\s+(key|identification)" + 
r"|dichotomous\s+key" + r"|stereomicroscope" + r"|dissecting\s+microscope" + r"|otolith\s+(identification|catalogue|reference)" + r"|diagnostic\s+(bones?|fragments?|structures?)" + r"|reference\s+collection" + r"|hard\s+(parts?|remains?)\s+(were\s+)?(identified|compared|matched))" + ), + # morphometric / biometric measurements + re.compile( + r"(?i)(morphometric" + r"|snout[\-\s]vent\s+length" + r"|total\s+length\s+(was\s+)?measured" + r"|body\s+(mass|weight)\s+(was\s+)?measured" + r"|wing\s+(chord|length)\s+(was\s+)?measured" + r"|bill\s+(length|depth|width)\s+(was\s+)?measured" + r"|tarsus\s+length" + r"|carapace\s+(length|width)" + r"|standard\s+length\s+\(SL\)" + r"|fork\s+length\s+\(FL\)" + r"|total\s+length\s+\(TL\))" + ), + # statistical methods (not results) + re.compile( + r"(?i)(anova|ancova|manova" + r"|chi[\-\s]?squared?\s+test" + r"|kruskal[\-\s]wallis" + r"|mann[\-\s]whitney" + r"|wilcoxon" + r"|tukey('?s)?\s+(hsd|post[\-\s]?hoc)" + r"|bonferroni\s+correction" + r"|generali[sz]ed\s+linear\s+(model|mixed)" + r"|linear\s+regression\s+(was|were)\s+used" + r"|principal\s+component\s+analysis" + r"|canonical\s+correspondence" + r"|multivariate\s+analysis\s+of" + r"|permutational\s+anova" + r"|rarefaction\s+curve" + r"|shannon[\-\s]wiener|simpson('?s)?\s+(diversity|index))" + ), + # conservation / management policy (not data) + re.compile( + r"(?i)(conservation\s+(implication|strateg|management|priority|action)" + r"|management\s+(implication|recommendation|strateg|plan)" + r"|red\s+list\s+(status|categor)" + r"|endangered\s+species\s+act" + r"|iucn\s+(status|categor|red\s+list)" + r"|population\s+(viability|modelling|decline|trend)" + r"|threat\s+(status|assessment|categor))" + ), + # genetic / molecular methods (not diet data) + re.compile( + r"(?i)(pcr\s+(amplif|reaction|protocol|conditions)" + r"|dna\s+(extract|amplif|sequenc|barcod)" + r"|mitochondrial\s+(dna|gene|region|marker)" + r"|microsatellite" + r"|primer\s+(pair|sequence|set)" + 
r"|gel\s+electrophoresis" + r"|nucleotide\s+(sequence|substitution)" + r"|genbank\s+accession)" + ), +] + +# --------------------------------------------------------------------------- +# Section headings — recognise both known-priority and drop-candidate headers +# so we can preserve them in output while filtering paragraph content. +# --------------------------------------------------------------------------- + +_NUM_PREFIX = r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?" + +_SECTION_HEADING_RE: re.Pattern = re.compile( + r"(?i)^\s*" + _NUM_PREFIX + r"(" + r"abstract|summary" + r"|introduction|background" + r"|methods?|materials?\s*(?:and|&)\s*methods?" + r"|methodology" + r"|study\s*(?:area|site|design|region|period)" + r"|results?|findings?" + r"|discussion|conclusions?" + r"|summary\s+and\s+discussion" + r"|acknowledge?ments?" + r"|literature\s+cited" + r"|references?\s+cited" + r"|references?" + r"|bibliography" + r"|appendix" + r"|supplementary\s+(data|material|information)" + r"|supporting\s+information" + r"|conflict\s+of\s+interest" + r"|competing\s+interests?" + r"|author\s+contributions?" + r"|funding(?:\s+(?:sources?|information))?" + r"|data\s+availability" + r"|ethics\s+(statement|declaration)" + r"|table\s*\d" + r")\s*[:\.\-]?\s*$", +) + +# Page markers are always preserved. +_PAGE_MARKER_RE: re.Pattern = re.compile(r"^\s*\[PAGE\s+\d+\]\s*$") + + +# --------------------------------------------------------------------------- +# Long-document threshold — above this many chars the filter becomes stricter, +# dropping paragraphs that have *no* signal at all (neither positive nor +# negative). This prevents very long papers from passing through unfiltered +# when most paragraphs simply lack a negative keyword. 
+# --------------------------------------------------------------------------- + +_LONG_DOC_THRESHOLD: int = 15_000 + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def filter_relevant_sections(text: str) -> str: + """Remove paragraphs unlikely to contain target diet metrics. + + The function splits *text* into blank-line-separated paragraphs, scores + each one against positive (keep) and negative (drop) pattern lists, and + removes paragraphs that trigger negative patterns while scoring zero on + positive patterns. + + For documents longer than ``_LONG_DOC_THRESHOLD`` characters, paragraphs + with *no signal at all* (neither positive nor negative) are also dropped. + This prevents very long papers from passing through almost unfiltered. + + ``[PAGE N]`` markers and section headings are always preserved so + ``extract_key_sections()`` can still perform section-priority ranking + downstream. + + Args: + text: Noise-cleaned text (output of ``clean_text()``). May contain + ``[PAGE N]`` markers and section headings. + + Returns: + Filtered text with irrelevant paragraphs removed. All structural + markers are preserved. 
+ """ + if not text or not text.strip(): + return text + + strict = len(text) > _LONG_DOC_THRESHOLD + + # Split into blocks on blank lines while tracking structure + blocks = _split_into_blocks(text) + + kept: List[str] = [] + for block in blocks: + stripped = block.strip() + if not stripped: + # Preserve blank-line spacing + kept.append("") + continue + + # Always keep page markers and section headings + if _PAGE_MARKER_RE.match(stripped) or _SECTION_HEADING_RE.match(stripped): + kept.append(block) + continue + + # Score the paragraph + if _should_keep(stripped, strict=strict): + kept.append(block) + + result = "\n\n".join(kept) + + # Collapse excessive blank lines (more than 2 newlines → 2) + result = re.sub(r"\n{3,}", "\n\n", result) + + return result.strip() + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _split_into_blocks(text: str) -> List[str]: + """Split text into paragraph blocks separated by blank lines. + + ``[PAGE N]`` markers that appear on their own line are treated as their + own block so they are never merged with surrounding text. + """ + # First, ensure PAGE markers are separated by blank lines from content + # so they end up in their own block. + normalized = re.sub( + r"(? bool: + """Return True if *text* matches any positive/keep pattern.""" + for pat in _POSITIVE_PATTERNS: + if pat.search(text): + return True + return False + + +def _has_negative_signal(text: str) -> bool: + """Return True if *text* matches any negative/drop-candidate pattern.""" + for pat in _NEGATIVE_PATTERNS: + if pat.search(text): + return True + return False + + +def _should_keep(text: str, *, strict: bool = False) -> bool: + """Decide whether a paragraph should be kept. + + Decision logic (conservative — defaults to keep): + 1. If the paragraph has ANY positive signal → **keep**. + 2. 
If the paragraph has a negative signal AND no positive signal → **drop**. + 3. If ``strict`` is True and there is NO signal either way → **drop**. + 4. Otherwise (no signal, not strict) → **keep** (borderline). + + Args: + text: Paragraph text to evaluate. + strict: When True (used for long documents), paragraphs with zero + signal are dropped rather than kept by default. + """ + if _has_positive_signal(text): + return True + if _has_negative_signal(text): + return False + # No signal either way + if strict: + return False # long doc — drop borderline paragraphs + return True diff --git a/src/preprocessing/text_cleaner.py b/src/preprocessing/text_cleaner.py new file mode 100644 index 0000000..e41b067 --- /dev/null +++ b/src/preprocessing/text_cleaner.py @@ -0,0 +1,343 @@ +"""Noise removal for raw .txt files extracted from scientific PDFs. + +Strips content that adds no value to structured data extraction: + - Reference / bibliography sections (everything from the header onward) + - Acknowledgement and funding sections + - Author affiliation blocks (department names, university lines, emails) + - Figure and table captions + - Copyright and licence lines + - Standalone page-number lines + - DOI / URL lines + - Journal metadata lines (volume, issue, ISSN, received/accepted dates) + - Excessive blank lines and leading/trailing whitespace + +Exposes one primary function:: + + clean_text(text: str) -> str + +The function is safe to call on text that already has ``[PAGE N]`` markers; +those markers are preserved so the downstream ``extract_key_sections()`` +function can still do page-priority ranking. + +Usage (standalone):: + + python text_cleaner.py path/to/file.txt + python text_cleaner.py path/to/file.txt --max-chars 5000 +""" + +import re +import sys +from pathlib import Path +from typing import List + +# Optional numeric section prefix shared with llm_text.py patterns. +# Matches e.g. "1.", "2.1.", "3.2.1 " so that "1. References" is also caught. 
+_NUM_PREFIX = r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?"
+
+# ---------------------------------------------------------------------------
+# Section-level drop patterns
+# When a line matches one of these, the entire remainder of that page/section
+# block is discarded (until the next [PAGE N] marker or end of text).
+# ---------------------------------------------------------------------------
+_SECTION_DROP_HEADERS: re.Pattern = re.compile(
+    r"(?i)^\s*" + _NUM_PREFIX + r"("
+    r"acknowledge?ments?"
+    r"|literature\s+cited"
+    r"|references?\s+cited"
+    r"|references?"
+    r"|bibliography"
+    r"|appendix\b"
+    r"|supplementary\s+(data|material|information)"
+    r"|supporting\s+information"
+    r"|conflict\s+of\s+interest"
+    r"|competing\s+interests?"
+    r"|author\s+contributions?"
+    r"|funding(?:\s+(?:sources?|information))?"
+    r"|data\s+availability"
+    r"|ethics\s+(statement|declaration)"
+    r")\s*[:\.]?\s*$"
+)
+
+# Structured-header block patterns common in two-column journal PDFs.
+# When a line matches, subsequent short lines (< 60 chars) that look like
+# keyword / metadata values are dropped until a long content line resumes.
+# NOTE: requires actual whitespace between letters to avoid matching "Abstract".
+_STRUCTURED_HEADER_START: re.Pattern = re.compile(
+    r"(?i)^\s*("
+    r"a\s+b\s+s\s+t\s+r\s+a\s+c\s+t"  # spaced "A B S T R A C T"
+    r"|a\s+r\s+t\s+i\s+c\s+l\s+e\s+i\s+n\s+f\s+o"  # spaced "A R T I C L E I N F O"
+    r")\s*$"
+)
+
+# ---------------------------------------------------------------------------
+# Line-level drop patterns
+# Lines that individually match one of these are removed regardless of where
+# they appear in the document.
+# Each pattern is applied with ``re.Pattern.search`` against the stripped
+# line in ``_drop_line`` below, after the section-header whitelist check.
+# ---------------------------------------------------------------------------
+_LINE_DROP_PATTERNS: List[re.Pattern] = [
+    # Standalone page numbers (digit-only line, 1–4 digits, optional spaces)
+    re.compile(r"^\s*\d{1,4}\s*$"),
+    # Reference list entries: "[1] Smith ...", "1. Smith ...", "1) Smith ..."
+    re.compile(r"^\s*\[\d+\]\s+[A-Z]"),
+    # NOTE(review): this also matches numbered headings such as "3. Data
+    # analysis"; only headings covered by _SECTION_HEADER_WHITELIST survive
+    # the _drop_line check — confirm the whitelist covers all headings used
+    # in the corpus.
+    re.compile(r"^\s*\d{1,3}[.)]\s{1,4}[A-Z][a-z]"),
+    # DOI and bare URLs (doi.org/, bare doi:, https://, www.)
+    re.compile(r"(?i)(https?://|doi\.org/|\bdoi:\s*10\.|www\.)\S*"),
+    # Email addresses
+    re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"),
+    # Copyright / licence lines
+    re.compile(r"(?i)(©|\(c\)\s*\d{4}|copyright\s+\d{4}|all\s+rights\s+reserved" r"|published\s+by\s+elsevier|creative\s+commons|open\s+access" r"|this\s+(article|paper|is)\s+(is\s+)?published)"),
+    # Journal metadata: volume, issue, ISSN, page range
+    re.compile(r"(?i)^\s*(vol(ume)?\.?\s*\d|issue\s*\d|pp\.\s*\d|issn\s*[\d\-]" r"|journal\s+of|proceedings\s+of)"),
+    # Received / accepted / revised / available-online timestamps
+    # Match with or without trailing colon/semicolon
+    re.compile(r"(?i)^\s*(received|accepted|revised|available\s+online|" r"published\s+online|handling\s+editor)" r"(\s*[:;]|\s+\d|\s+in)"),
+    # Article history block header
+    re.compile(r"(?i)^\s*article\s+history\s*[:\.]?\s*$"),
+    # Keywords header line AND single-word keyword-style lines that follow it
+    re.compile(r"(?i)^\s*key\s*-?\s*words?\s*[:\-]"),
+    # Journal / publisher metadata lines
+    re.compile(r"(?i)^\s*(contents?\s+lists?\s+available|journal\s+homepage|" r"elsevier\.com|sciencedirect\.com|springer\.|wiley\.com)"),
+    # Figure / table / plate captions
+    re.compile(r"(?i)^\s*(fig(ure)?\.?\s*\d|table\s*\d|plate\s*\d|" r"fig\.\s*s\d|supplemental?\s+(table|figure)\s*\d)" r"[\s.\-–—:]"),
+    # Author affiliation lines (institution / department / lab names)
+    # Allow for leading special characters (e.g. ⁎, *, †)
+    re.compile(
+        r"(?i)^[\s\*⁎†‡#]*"
+        r"(department\s+of|faculty\s+of|institute\s+(of|for)|"
+        r"division\s+of|school\s+of|laboratory\s+of|lab\s+of|"
+        r"centre?\s+(for|of)|program(me)?\s+(in|of)|"
+        r"universidad|universit[éy]|université|universidade|"
+        r"university\s+of|college\s+of)"
+    ),
+    # Running page headers / footers: short all-caps lines that are NOT
+    # known section headings (those are whitelisted in _drop_line below).
+    re.compile(r"^[A-Z\s\d\.\-–—:,]{5,60}$"),
+]
+
+# ---------------------------------------------------------------------------
+# Section-header whitelist
+# Lines matching this pattern are structural anchors the LLM needs; they must
+# never be removed even if they look like all-caps noise.
+# Checked FIRST in _drop_line, before any _LINE_DROP_PATTERNS entry.
+# ---------------------------------------------------------------------------
+_SECTION_HEADER_WHITELIST: re.Pattern = re.compile(
+    r"(?i)^\s*"
+    # optional section number prefix e.g. "1.", "2.1.", "3.2.1 "
+    r"(?:\d{1,2}(?:\.\d{1,2})*\.?\s+)?"
+    r"("
+    r"abstract"
+    r"|summary"
+    r"|introduction"
+    r"|background"
+    r"|methods?"
+    r"|materials?\s*(?:and|&)\s*methods?"
+    r"|methodology"
+    r"|study\s*(?:area|site|design|region|period)"
+    r"|results?"
+    r"|findings?"
+    r"|discussion"
+    r"|conclusions?"
+    r"|conclusions?\s+and\s+discussion"
+    r"|discussion\s+and\s+conclusions?"
+    r"|summary\s+and\s+discussion"
+    r")\s*[:\.\-]?\s*$"
+)
+
+# ---------------------------------------------------------------------------
+# Whitespace normalisation
+# ---------------------------------------------------------------------------
+# Three or more consecutive newlines (collapsed to one blank line).
+_MULTI_BLANK: re.Pattern = re.compile(r"\n{3,}")
+# Spaces/tabs immediately before a newline (stripped).
+_TRAILING_SPACES: re.Pattern = re.compile(r"[ \t]+\n")
+# Line ending with a soft hyphen or mid-word break (next line starts lowercase
+# or with punctuation that continues the word).
+_SOFT_HYPHEN_END = re.compile(r"-$") +_CONTINUATION_LINE = re.compile(r"^[a-z,\.\)\];:!\?]") + + +def _rejoin_broken_lines(text: str) -> str: + """Rejoin lines that were broken mid-sentence or mid-word by column wrapping. + + Two cases are handled: + 1. Hard hyphenation: line ends with ``-`` and next line starts with a + lowercase letter → join without any space (remove the hyphen). + 2. Soft wrap: line ends without sentence-terminating punctuation and next + line starts with lowercase → join with a single space. + + ``[PAGE N]`` marker lines are never joined. + """ + lines = text.split("\n") + out: List[str] = [] + i = 0 + while i < len(lines): + line = lines[i] + # Never merge PAGE markers + if re.match(r"\[PAGE\s+\d+\]", line.strip()): + out.append(line) + i += 1 + continue + + rstripped = line.rstrip() + if i + 1 < len(lines): + next_line = lines[i + 1].lstrip() + next_is_page = re.match(r"\[PAGE\s+\d+\]", next_line) + if not next_is_page and next_line and _CONTINUATION_LINE.match(next_line): + # Hard hyphen: remove hyphen and join directly + if _SOFT_HYPHEN_END.search(rstripped): + out.append(rstripped[:-1] + next_line) + i += 2 + continue + # Soft wrap: line doesn't end with sentence-ending punctuation + if rstripped and rstripped[-1] not in ".!?:;)]": + out.append(rstripped + " " + next_line) + i += 2 + continue + out.append(line) + i += 1 + return "\n".join(out) + + +def _drop_line(line: str) -> bool: + """Return True if this line should be removed from the output.""" + stripped = line.strip() + if not stripped: + return False # preserve blank lines for now; collapsed later + # Never drop structural section headings — the LLM and section-ranker + # both depend on them to orient in the document. + if _SECTION_HEADER_WHITELIST.match(stripped): + return False + for pat in _LINE_DROP_PATTERNS: + if pat.search(stripped): + return True + return False + + +def clean_text(text: str) -> str: + """Strip noise from raw extracted text. 
+ + Processes the text in two passes: + + 1. **Section-level pass** — scans for section headers that mark the start + of non-useful content (References, Acknowledgements, etc.) and discards + everything from that header to the next ``[PAGE N]`` marker (or end of + text). This preserves other sections on subsequent pages. + + 2. **Line-level pass** — removes individual noisy lines (page numbers, + DOIs, email addresses, figure captions, affiliation lines, copyright + notices, reference-list entries, journal metadata). + + Finally, excess blank lines are collapsed to a single blank line and + leading/trailing whitespace is stripped. + + Args: + text: Raw text content, optionally containing ``[PAGE N]`` markers. + + Returns: + Cleaned text with noise removed. + """ + # ── Pass 1: section-level drop ────────────────────────────────────────── + # Split on [PAGE N] markers and process each page block independently. + # Within each block, once a drop-section header is found, discard the + # rest of that block. + page_split = re.split(r"(\[PAGE\s+\d+\])", text) + # page_split alternates: [pre-marker-text, marker, text, marker, text, ...] 
+ + cleaned_parts: List[str] = [] + for segment in page_split: + if re.match(r"\[PAGE\s+\d+\]", segment): + # Keep the marker itself + cleaned_parts.append(segment) + continue + + lines = segment.split("\n") + kept: List[str] = [] + in_drop_section = False + for line in lines: + if in_drop_section: + # Skip until end of this page block + continue + if _SECTION_DROP_HEADERS.match(line): + in_drop_section = True + continue + kept.append(line) + cleaned_parts.append("\n".join(kept)) + + after_section_pass = "".join(cleaned_parts) + + # ── Pass 1b: strip structured-header blocks (spaced-letter headers + + # short keyword/metadata lines that follow them in two-column PDFs) ────── + structured_lines: List[str] = [] + in_structured_block = False + for line in after_section_pass.split("\n"): + stripped = line.strip() + if _STRUCTURED_HEADER_START.match(stripped): + in_structured_block = True + continue # drop the header itself + if in_structured_block: + # Exit block once a long line (real content) appears + if len(stripped) > 60: + in_structured_block = False + structured_lines.append(line) + # Drop short keyword/metadata values + continue + structured_lines.append(line) + after_section_pass = "\n".join(structured_lines) + + # ── Pass 2: line-level drop ────────────────────────────────────────── + output_lines: List[str] = [] + for line in after_section_pass.split("\n"): + if _drop_line(line): + continue + output_lines.append(line) + + after_line_pass = "\n".join(output_lines) + + # ── Normalise whitespace ──────────────────────────────────────────────── + after_line_pass = _TRAILING_SPACES.sub("\n", after_line_pass) + after_line_pass = _rejoin_broken_lines(after_line_pass) + after_line_pass = _MULTI_BLANK.sub("\n\n", after_line_pass) + return after_line_pass.strip() + + +# --------------------------------------------------------------------------- +# Standalone usage +# --------------------------------------------------------------------------- + + +def main() -> None: # 
pragma: no cover + import argparse + + parser = argparse.ArgumentParser(description="Strip noise from a raw .txt file extracted from a scientific PDF.") + parser.add_argument("input", type=str, help="Path to the .txt file to clean.") + parser.add_argument( + "--max-chars", + type=int, + default=None, + help="If set, truncate the cleaned output to this many characters.", + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Optional path to write the cleaned text. Defaults to stdout.", + ) + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.exists(): + print(f"[ERROR] File not found: {input_path}", file=sys.stderr) + sys.exit(1) + + raw = input_path.read_text(encoding="utf-8") + cleaned = clean_text(raw) + + if args.max_chars and len(cleaned) > args.max_chars: + cleaned = cleaned[: args.max_chars] + + if args.output: + Path(args.output).write_text(cleaned, encoding="utf-8") + print(f"[INFO] Cleaned text written to {args.output} ({len(cleaned)} chars)", file=sys.stderr) + else: + print(cleaned) + + +if __name__ == "__main__": + main() diff --git a/tests/test_section_filter.py b/tests/test_section_filter.py new file mode 100644 index 0000000..423bbae --- /dev/null +++ b/tests/test_section_filter.py @@ -0,0 +1,424 @@ +"""Unit tests for src/preprocessing/section_filter.py""" + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from src.preprocessing.section_filter import ( + filter_relevant_sections, + _has_positive_signal, + _has_negative_signal, + _should_keep, + _split_into_blocks, + _LONG_DOC_THRESHOLD, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_page(n: int, body: str) -> str: + return f"[PAGE {n}]\n{body}" + + +# --------------------------------------------------------------------------- +# 
Structural preservation +# --------------------------------------------------------------------------- + + +class TestStructuralPreservation: + """Page markers and section headings must never be removed.""" + + def test_preserves_page_markers(self): + text = "[PAGE 1]\nSome content.\n\n[PAGE 2]\nMore content." + result = filter_relevant_sections(text) + assert "[PAGE 1]" in result + assert "[PAGE 2]" in result + + def test_preserves_section_headings(self): + text = "Abstract\n\nWe studied diets.\n\n" "Methods\n\nWe sampled fish.\n\n" "Results\n\nN=42 stomachs examined." + result = filter_relevant_sections(text) + assert "Abstract" in result + assert "Methods" in result + assert "Results" in result + + def test_preserves_numbered_section_headings(self): + text = "1. Introduction\n\nBackground text.\n\n2. Methods\n\nSampling." + result = filter_relevant_sections(text) + assert "1. Introduction" in result + assert "2. Methods" in result + + def test_empty_input_returns_empty(self): + assert filter_relevant_sections("") == "" + assert filter_relevant_sections(" ") == " " + assert filter_relevant_sections(None) is None + + def test_short_text_preserved_fully(self): + text = "A brief note about Canis lupus diet." + result = filter_relevant_sections(text) + assert result == text + + +# --------------------------------------------------------------------------- +# Positive-signal paragraphs are always kept +# --------------------------------------------------------------------------- + + +class TestPositiveSignalKept: + """Paragraphs with target-metric language must always be retained.""" + + def test_keeps_sample_size_paragraph(self): + para = "A total of 144 stomach samples were collected from Canis lupus." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "144 stomach samples" in result + + def test_keeps_empty_stomach_paragraph(self): + para = "Of the 100 stomachs examined, 23 were empty." 
+ text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "23 were empty" in result + + def test_keeps_nonempty_paragraph(self): + para = "58% of stomachs contained food items." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "58%" in result + + def test_keeps_feeding_rate(self): + para = "The feeding rate was 72% across all seasons." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "feeding rate" in result + + def test_keeps_collection_date(self): + para = "Specimens were collected between 2005 and 2010." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "collected between 2005 and 2010" in result + + def test_keeps_study_location(self): + para = "The study area was located in the Okavango Delta, Botswana." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "Okavango Delta" in result + + def test_keeps_diet_composition(self): + para = "Diet composition of Panthera leo was dominated by ungulates." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "Diet composition" in result + + def test_keeps_percentage_data(self): + para = "Mammals constituted 45% of the total prey items." + text = f"Results\n\n{para}" + result = filter_relevant_sections(text) + assert "45%" in result + + def test_keeps_n_equals(self): + para = "We analysed gut contents (n = 87) from adult specimens." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "n = 87" in result + + def test_keeps_coordinates(self): + para = "The sampling site (34°15'S, 18°29'E) was coastal." 
+ text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "34°15'S" in result + + +# --------------------------------------------------------------------------- +# Negative-signal paragraphs are dropped (when no positive signal) +# --------------------------------------------------------------------------- + + +class TestNegativeSignalDropped: + """Paragraphs with only negative signals should be removed.""" + + def test_drops_phylogenetic_paragraph(self): + para = "Bayesian inference of the phylogenetic relationships among " "the sister group taxa revealed strong bootstrap support for " "the monophyletic clade." + text = f"Discussion\n\n{para}" + result = filter_relevant_sections(text) + assert "Bayesian inference" not in result + + def test_drops_habitat_description(self): + para = "Vegetation type was classified as tropical dry forest with " "canopy cover dense and understory dominated by shrubs." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "canopy cover" not in result + + def test_drops_literature_review(self): + para = "Several studies have reported similar findings. " "Previously reported by Smith (2001) and consistent with " "the findings of Jones et al. (2005)." + text = f"Discussion\n\n{para}" + result = filter_relevant_sections(text) + assert "Several studies have reported" not in result + + def test_drops_prey_id_methodology(self): + para = "Prey were identified to the lowest taxonomic level using " "a stereomicroscope and reference collection of diagnostic bones." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "stereomicroscope" not in result + + def test_drops_morphometric_paragraph(self): + para = "Snout-vent length was measured to the nearest 0.1 mm. " "Total length was measured for each individual using calipers." 
+ text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "Snout-vent length" not in result + + def test_drops_statistical_methods(self): + para = "Differences were tested using Kruskal-Wallis tests with " "Bonferroni correction for multiple comparisons. A generalized " "linear model was used to assess trends." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "Kruskal-Wallis" not in result + + def test_drops_conservation_policy(self): + para = "Conservation implications suggest the species should be " "upgraded to a higher IUCN Red List category given the " "observed population decline." + text = f"Discussion\n\n{para}" + result = filter_relevant_sections(text) + assert "IUCN Red List" not in result + + def test_drops_genetic_methods(self): + para = "DNA was extracted using a commercial kit. PCR amplification " "was performed using primers targeting the mitochondrial gene " "region. Products were separated by gel electrophoresis." + text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "PCR amplification" not in result + + +# --------------------------------------------------------------------------- +# Conservative behaviour — borderline paragraphs kept +# --------------------------------------------------------------------------- + + +class TestConservativeBehaviour: + """Paragraphs with no signals should be kept (borderline = safe).""" + + def test_keeps_neutral_paragraph(self): + para = "The region experiences a temperate maritime climate." + text = f"Introduction\n\n{para}" + result = filter_relevant_sections(text) + assert "temperate maritime climate" in result + + def test_keeps_paragraph_with_mixed_signals(self): + """Positive signal overrides co-occurring negative signal.""" + para = "A total of 50 stomachs were examined. Prey were identified " "using a stereomicroscope and diagnostic bones from a " "reference collection." 
+ text = f"Methods\n\n{para}" + result = filter_relevant_sections(text) + assert "50 stomachs" in result + assert "stereomicroscope" in result + + def test_keeps_species_diet_sentence(self): + para = "Feeding ecology of Vulpes vulpes in agricultural landscapes." + text = f"Abstract\n\n{para}" + result = filter_relevant_sections(text) + assert "Vulpes vulpes" in result + + +# --------------------------------------------------------------------------- +# Integration-style tests with multi-section documents +# --------------------------------------------------------------------------- + + +class TestFullDocumentFiltering: + """Test the filter on realistic multi-section document text.""" + + def test_realistic_document(self): + text = ( + "[PAGE 1]\n" + "Abstract\n\n" + "We examined the diet of Canis lupus in Yellowstone.\n\n" + "[PAGE 2]\n" + "Methods\n\n" + "Specimens were collected between 2010 and 2015 from Yellowstone " + "National Park (44°36'N, 110°30'W).\n\n" + "Snout-vent length was measured to the nearest millimetre " + "using digital calipers.\n\n" + "DNA was extracted from tissue samples. PCR amplification of " + "mitochondrial DNA markers was performed.\n\n" + "[PAGE 3]\n" + "Results\n\n" + "A total of 200 stomachs were examined, of which 42 were empty.\n\n" + "Diet composition included 65% ungulates and 20% lagomorphs.\n\n" + "[PAGE 4]\n" + "Discussion\n\n" + "Previous work has reported broadly concordant results. Previously " + "reported by Mech (1970) and consistent with findings of " + "Black et al. 
(2003).\n\n" + "Conservation implications suggest continued monitoring of wolf " + "population viability in the Greater Yellowstone area.\n" + ) + result = filter_relevant_sections(text) + + # Structural markers preserved + assert "[PAGE 1]" in result + assert "[PAGE 2]" in result + assert "[PAGE 3]" in result + assert "[PAGE 4]" in result + assert "Abstract" in result + assert "Methods" in result + assert "Results" in result + assert "Discussion" in result + + # Positive-signal paragraphs kept + assert "diet of Canis lupus" in result + assert "200 stomachs" in result + assert "42 were empty" in result + assert "65%" in result + assert "collected between 2010 and 2015" in result + + # Negative-only paragraphs dropped + assert "Snout-vent length" not in result + assert "PCR amplification" not in result + assert "Previous work has reported" not in result + assert "population viability" not in result + + def test_does_not_over_filter_short_doc(self): + """A short document with mostly positive content should lose very little.""" + text = "Abstract\n\n" "We examined stomach contents of 50 Accipiter nisus.\n\n" "Results\n\n" "Sample size was n=50. 
Of these, 12 stomachs were empty.\n" "Birds comprised 78% of prey items.\n" + result = filter_relevant_sections(text) + # Everything here is positive — nothing should be lost + assert "50 Accipiter nisus" in result + assert "n=50" in result + assert "12 stomachs were empty" in result + assert "78%" in result + + +# --------------------------------------------------------------------------- +# Internal helper tests +# --------------------------------------------------------------------------- + + +class TestInternalHelpers: + """Test the internal scoring/splitting functions.""" + + def test_has_positive_signal_true(self): + assert _has_positive_signal("A total of 30 stomachs were examined.") + assert _has_positive_signal("Specimens were collected during 2018.") + assert _has_positive_signal("The study area was in Kenya.") + assert _has_positive_signal("Empty stomachs accounted for 15%.") + + def test_has_positive_signal_false(self): + assert not _has_positive_signal("The weather was mild that year.") + assert not _has_positive_signal("Bayesian inference supported the monophyletic clade.") + + def test_has_negative_signal_true(self): + assert _has_negative_signal("Phylogenetic analysis using maximum likelihood trees.") + assert _has_negative_signal("Habitat type was classified as open grassland.") + assert _has_negative_signal("Mann-Whitney tests were used for comparisons.") + + def test_has_negative_signal_false(self): + assert not _has_negative_signal("We counted 30 prey items in the gut.") + assert not _has_negative_signal("The study was conducted in Brazil.") + + def test_should_keep_positive_overrides_negative(self): + text = "A total of 100 stomachs were examined using a stereomicroscope " "and reference collection." 
+ assert _should_keep(text) is True + + def test_should_keep_neutral_kept(self): + assert _should_keep("The area has a subtropical climate.") is True + + def test_should_keep_negative_only_dropped(self): + assert _should_keep("Bayesian inference of phylogenetic relationships.") is False + + def test_split_into_blocks_basic(self): + text = "Para one.\n\nPara two.\n\nPara three." + blocks = _split_into_blocks(text) + assert len(blocks) == 3 + assert "Para one." in blocks[0] + assert "Para two." in blocks[1] + assert "Para three." in blocks[2] + + def test_split_into_blocks_page_markers(self): + text = "[PAGE 1]\nContent one.\n\n[PAGE 2]\nContent two." + blocks = _split_into_blocks(text) + page_blocks = [b for b in blocks if "[PAGE" in b] + assert len(page_blocks) >= 2 + + +# --------------------------------------------------------------------------- +# Long-document strict filtering +# --------------------------------------------------------------------------- + + +class TestLongDocumentStrictMode: + """Documents > _LONG_DOC_THRESHOLD drop zero-signal paragraphs.""" + + def _make_long_doc(self, extra_paragraphs: list[str]) -> str: + """Build a document that exceeds the threshold with filler + extras.""" + # Use positive-signal filler so the base text is kept + filler_line = "A total of 50 specimens were examined." + repeat_count = (_LONG_DOC_THRESHOLD // len(filler_line)) + 2 + filler = "\n\n".join([filler_line] * repeat_count) + return filler + "\n\n" + "\n\n".join(extra_paragraphs) + + def test_neutral_para_kept_in_short_doc(self): + """Below threshold, neutral paragraphs survive.""" + neutral = "The weather was mild and skies were overcast." + text = f"Abstract\n\n{neutral}" + assert len(text) < _LONG_DOC_THRESHOLD + result = filter_relevant_sections(text) + assert neutral in result + + def test_neutral_para_dropped_in_long_doc(self): + """Above threshold, neutral paragraphs are dropped.""" + neutral = "The weather was mild and skies were overcast." 
+ text = self._make_long_doc([neutral]) + assert len(text) > _LONG_DOC_THRESHOLD + result = filter_relevant_sections(text) + assert neutral not in result + + def test_positive_para_kept_in_long_doc(self): + """Positive-signal paragraphs are STILL kept in long docs.""" + positive = "A total of 200 stomachs were examined." + text = self._make_long_doc([positive]) + result = filter_relevant_sections(text) + assert positive in result + + def test_negative_para_dropped_in_long_doc(self): + """Negative-signal paragraphs still dropped in long docs.""" + negative = "Bayesian inference of the phylogenetic relationships " "revealed strong bootstrap support." + text = self._make_long_doc([negative]) + result = filter_relevant_sections(text) + assert "Bayesian inference" not in result + + def test_headings_preserved_in_long_doc(self): + """Section headings never dropped, even in strict mode.""" + text = self._make_long_doc(["Methods", "Results"]) + result = filter_relevant_sections(text) + assert "Methods" in result + assert "Results" in result + + def test_page_markers_preserved_in_long_doc(self): + text = self._make_long_doc(["[PAGE 99]"]) + result = filter_relevant_sections(text) + assert "[PAGE 99]" in result + + def test_should_keep_strict_drops_neutral(self): + """Direct test of _should_keep with strict=True.""" + neutral = "The area has a subtropical climate." + assert _should_keep(neutral, strict=False) is True + assert _should_keep(neutral, strict=True) is False + + def test_should_keep_strict_keeps_positive(self): + positive = "A total of 100 stomachs were analyzed." + assert _should_keep(positive, strict=True) is True + + def test_long_doc_reduces_size(self): + """A long document with lots of neutral text gets meaningfully reduced.""" + neutral = "The atmospheric conditions were unremarkable that day." + positive = "We examined 85 stomachs from Vulpes vulpes." 
+ # 50 neutral paras + a few positive ones + paras = [neutral] * 50 + [positive] * 3 + text = "\n\n".join(paras) + # Make sure it's over the threshold + while len(text) < _LONG_DOC_THRESHOLD: + text += f"\n\n{neutral}" + result = filter_relevant_sections(text) + assert len(result) < len(text) + assert positive in result + assert neutral not in result diff --git a/tests/test_text_cleaner.py b/tests/test_text_cleaner.py new file mode 100644 index 0000000..da6798d --- /dev/null +++ b/tests/test_text_cleaner.py @@ -0,0 +1,222 @@ +"""Unit tests for src/preprocessing/text_cleaner.py""" + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from src.preprocessing.text_cleaner import clean_text + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_page(n: int, body: str) -> str: + return f"[PAGE {n}]\n{body}" + + +# --------------------------------------------------------------------------- +# Reference / bibliography section removal +# --------------------------------------------------------------------------- + + +class TestReferenceSectionRemoval: + def test_drops_references_header_and_trailing_content(self): + text = "Methods\nWe examined 50 stomachs from Canis lupus.\n\n" "References\nSmith J. 2001. J Ecol 10:1-5.\nDoe A. 2002. Nature 400:1.\n" + result = clean_text(text) + assert "Smith J." not in result + assert "Doe A." not in result + + def test_keeps_content_before_references(self): + text = "Results\nSample size was 30.\n\n" "References\n1. Author A. 2000. Title. Journal.\n" + result = clean_text(text) + assert "Sample size was 30" in result + + def test_drops_literature_cited(self): + text = "Discussion\nSee below.\n\nLiterature Cited\nFoo B. 1999.\n" + result = clean_text(text) + assert "Foo B." 
not in result + + def test_drops_bibliography(self): + text = "Results\nN=42.\n\nBibliography\nBar C. 2005.\n" + result = clean_text(text) + assert "Bar C." not in result + + def test_references_on_separate_page_doesnt_poison_next_page(self): + text = "[PAGE 3]\nResults\nSample size = 30.\n" "[PAGE 4]\nReferences\n1. Smith 2001.\n" "[PAGE 5]\nDiscussion\nThis study found important results.\n" + result = clean_text(text) + assert "Sample size = 30" in result + assert "Smith 2001" not in result + assert "This study found important results" in result + + +# --------------------------------------------------------------------------- +# Acknowledgement / funding section removal +# --------------------------------------------------------------------------- + + +class TestAcknowledgementRemoval: + def test_drops_acknowledgements(self): + text = "Methods\nWe sampled fish.\n\nAcknowledgements\nWe thank our funders.\n" + result = clean_text(text) + assert "We thank our funders" not in result + + def test_drops_acknowledgments_us_spelling(self): + text = "Results\nN=10.\n\nAcknowledgments\nFunded by NSF.\n" + result = clean_text(text) + assert "Funded by NSF" not in result + + def test_drops_funding_section(self): + text = "Abstract\nWe studied diets.\n\nFunding\nGrant XYZ-123.\n" + result = clean_text(text) + assert "Grant XYZ-123" not in result + + def test_keeps_content_before_acknowledgements(self): + text = "Results\n42 stomachs examined.\n\nAcknowledgements\nThanks.\n" + result = clean_text(text) + assert "42 stomachs examined" in result + + +# --------------------------------------------------------------------------- +# Line-level noise removal +# --------------------------------------------------------------------------- + + +class TestPageNumberRemoval: + def test_removes_standalone_page_number(self): + text = "Results\n\n42\n\nMore text here.\n" + result = clean_text(text) + assert "\n42\n" not in result + + def test_keeps_number_inside_sentence(self): + text = "We 
examined 42 stomachs.\n" + result = clean_text(text) + assert "We examined 42 stomachs" in result + + +class TestReferenceEntryRemoval: + def test_removes_bracketed_ref_entry(self): + result = clean_text("[1] Smith J. 2001. Nature.\n") + assert "[1] Smith" not in result + + def test_removes_numbered_ref_entry(self): + result = clean_text("1. Jones A. 1999. J Ecol.\n") + assert "Jones A." not in result + + def test_keeps_regular_sentences_starting_with_number(self): + # Sentences like "40 specimens were examined" should stay + text = "A total of 40 specimens were examined in 2005.\n" + result = clean_text(text) + assert "40 specimens" in result + + +class TestUrlAndDoiRemoval: + def test_removes_http_url(self): + result = clean_text("See https://www.example.com/paper for details.\n") + assert "https://" not in result + + def test_removes_doi(self): + result = clean_text("doi.org/10.1016/j.biocon.2001.01.001\n") + assert "doi.org" not in result + + +class TestEmailRemoval: + def test_removes_email_address(self): + result = clean_text("Contact: author@university.edu for more info.\n") + assert "@university.edu" not in result + + +class TestCopyrightRemoval: + def test_removes_copyright_symbol(self): + result = clean_text("© 2021 Elsevier Ltd. All rights reserved.\n") + assert "Elsevier" not in result + + def test_removes_copyright_word(self): + result = clean_text("Copyright 2020 The Authors.\n") + assert "Copyright 2020" not in result + + +class TestFigureCaptionRemoval: + def test_removes_figure_caption(self): + result = clean_text("Figure 1. Map of study area showing sampling sites.\n") + assert "Map of study area" not in result + + def test_removes_table_caption(self): + result = clean_text("Table 2. Diet composition of Vulpes vulpes.\n") + assert "Diet composition" not in result + + def test_removes_fig_abbreviation(self): + result = clean_text("Fig. 3. 
Distribution of stomach contents.\n") + assert "Distribution of stomach" not in result + + +class TestAffiliationRemoval: + def test_removes_department_line(self): + result = clean_text("Department of Ecology, University of Oslo, Norway.\n") + assert "Department of Ecology" not in result + + def test_removes_university_of_line(self): + result = clean_text("University of Cambridge, Cambridge CB2 1TN, UK.\n") + assert "University of Cambridge" not in result + + +class TestReceivedAcceptedRemoval: + def test_removes_received_line(self): + result = clean_text("Received: 12 March 2023; Accepted: 5 June 2023\n") + assert "12 March 2023" not in result + + +class TestKeywordsRemoval: + def test_removes_keywords_line(self): + result = clean_text("Keywords: predator, diet, stomach contents, ecology\n") + assert "predator, diet" not in result + + +# --------------------------------------------------------------------------- +# Whitespace normalisation +# --------------------------------------------------------------------------- + + +class TestWhitespaceNormalisation: + def test_collapses_multiple_blank_lines(self): + text = "Line one.\n\n\n\n\nLine two.\n" + result = clean_text(text) + assert "\n\n\n" not in result + + def test_strips_leading_and_trailing_whitespace(self): + text = " \n\nSome content.\n\n " + result = clean_text(text) + assert result == result.strip() + + +# --------------------------------------------------------------------------- +# Pass-through for clean text +# --------------------------------------------------------------------------- + + +class TestCleanPassThrough: + def test_clean_text_passes_through_unaffected_content(self): + text = ( + "Abstract\nThis study examined stomach contents of Canis lupus.\n\n" + "Methods\nWe collected 50 specimens from Yellowstone, USA.\n\n" + "Results\nOf 50 stomachs, 8 were empty and 42 contained prey.\n" + ) + result = clean_text(text) + for fragment in ["Canis lupus", "50 specimens", "Yellowstone", "8 were empty"]: 
+ assert fragment in result, f"Expected '{fragment}' to be preserved" + + def test_page_markers_preserved(self): + text = "[PAGE 1]\nAbstract\nDiet study.\n[PAGE 2]\nMethods\nWe sampled.\n" + result = clean_text(text) + assert "[PAGE 1]" in result + assert "[PAGE 2]" in result + + def test_empty_string_returns_empty(self): + assert clean_text("") == "" + + def test_whitespace_only_returns_empty(self): + assert clean_text(" \n\n ") == ""