Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9042c78
feat: cleans noisy features (doi, numbered references, figure caption…
raymondcen Feb 28, 2026
fa61459
fix: text cleaning left blank lines
raymondcen Feb 28, 2026
688deaa
feat: temp pipeline for testing
raymondcen Feb 28, 2026
af2836f
fix: section headers being removed
raymondcen Feb 28, 2026
0687e62
use section priority rankings instead
raymondcen Feb 28, 2026
317387d
save cleaned text to folder
raymondcen Feb 28, 2026
17b7b55
use paragraph scoring
raymondcen Feb 28, 2026
f738892
update data for comparison later
raymondcen Feb 28, 2026
ff34c74
added another filter to drop entire paragraphs that contain irrelevan…
raymondcen Mar 2, 2026
b3b4abf
update instructions on files
raymondcen Mar 2, 2026
af63973
added retry logic to retry null returns
raymondcen Mar 2, 2026
af6aa3a
increased char and num-ctx limit on ollama
raymondcen Mar 2, 2026
cde64a1
drops paragraphs with no neg or pos signal
raymondcen Mar 2, 2026
00bcf41
added long doc test
raymondcen Mar 2, 2026
d6efaf3
switched to qwen2.5:7b
raymondcen Mar 2, 2026
b3dcf46
rewrote the system prompt to handle diverse study methods beyond stom…
raymondcen Mar 2, 2026
ad49f3c
reformat
raymondcen Mar 2, 2026
0b3a3d8
improved truncation
raymondcen Mar 12, 2026
f66d48d
added multi cpu processing option
raymondcen Mar 12, 2026
208029d
added --workers arg
raymondcen Mar 12, 2026
4a01b9d
added sequential pdf processing
raymondcen Mar 12, 2026
33adda5
loads SpellChecker() only once
raymondcen Mar 12, 2026
823fd7f
xgboost training set to gpu if available
raymondcen Mar 12, 2026
72388c1
added bypass OCR because it froze workers
raymondcen Mar 12, 2026
2263116
--labels option to process all useful papers
raymondcen Mar 12, 2026
3320e37
fixed name scanning in labels.json
raymondcen Mar 12, 2026
251d86c
reformat
raymondcen Mar 14, 2026
c961734
Merge branch 'feat/improve-logging' into feat/xgboost-rework
raymondcen Mar 15, 2026
610ba72
Merge remote-tracking branch 'origin/main' into feat/xgboost-rework
raymondcen Mar 15, 2026
6920c95
Delete data/cleaned-text/llm_text/Adams_1989_20260312_115204.txt
raymondcen Mar 15, 2026
41c49a4
Delete data/cleaned-text/section_filter/Adams_1989_20260312_115204.txt
raymondcen Mar 15, 2026
46dabc1
deleted .txt files
raymondcen Mar 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 191 additions & 90 deletions classsify_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import logging
import sys
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from src.preprocessing.pdf_text_extraction import extract_text_from_pdf
Expand All @@ -46,8 +47,9 @@
# Pipeline
# ---------------------------------------------------------------------------

def run_pipeline(
input_path: Path,

def _process_single_pdf(
pdf_path: Path,
model_dir: str,
llm_model: str,
output_dir: Path,
Expand All @@ -74,18 +76,18 @@ def run_pipeline(
num_ctx: Context window size for Ollama.
"""
# ── Collect PDF paths ─────────────────────────────────────────────────
if input_path.is_dir():
pdf_paths = sorted(input_path.glob("*.pdf"))
if pdf_path.is_dir():
pdf_paths = sorted(pdf_path.glob("*.pdf"))
if not pdf_paths:
print(f"[ERROR] No PDF files found in directory: {input_path}", file=sys.stderr)
log.error("No PDF files found in directory: %s", input_path)
print(f"[ERROR] No PDF files found in directory: {pdf_path}", file=sys.stderr)
log.error("No PDF files found in directory: %s", pdf_path)
sys.exit(1)
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {input_path}", file=sys.stderr)
elif input_path.is_file() and input_path.suffix.lower() == ".pdf":
pdf_paths = [input_path]
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {pdf_path}", file=sys.stderr)
elif pdf_path.is_file() and pdf_path.suffix.lower() == ".pdf":
pdf_paths = [pdf_path]
else:
print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {input_path}", file=sys.stderr)
log.error("Input must be a .pdf file or a directory of PDFs: %s", input_path)
print(f"[ERROR] pdf must be a .pdf file or a directory of PDFs: {pdf_path}", file=sys.stderr)
log.error("pdf must be a .pdf file or a directory of PDFs: %s", pdf_path)
sys.exit(1)

# ── Load classifier once (avoid re-reading model artifacts per file) ──
Expand Down Expand Up @@ -119,92 +121,186 @@ def run_pipeline(
"fraction_feeding": "",
}

# ── Step 1: Extract text ──────────────────────────────────────────
# ── Step 1: Extract text ──────────────────────────────────────────
try:
original_text = extract_text_from_pdf(str(pdf_path))
except Exception as e:
print(f" [ERROR] Text extraction failed ({pdf_path.name}): {e}", file=sys.stderr)
log.error("Text extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "text_extraction_failed"
return row

if not original_text.strip():
print(f" [WARN] No text extracted from {pdf_path.name}. Skipping.", file=sys.stderr)
log.warning("No text extracted from %s — skipping.", pdf_path.name)
row["extraction_status"] = "empty_text"
return row

print(f" [INFO] {pdf_path.name}: {len(original_text)} chars", file=sys.stderr)

# ── Step 2: Classify ──────────────────────────────────────────────
clf_model, vectorizer, encoder = load_classifier(model_dir)
label, confidence, pred_prob = classify_text(
text=original_text,
model=clf_model,
vectorizer=vectorizer,
encoder=encoder,
threshold=confidence_threshold,
)
print(f" [CLASSIFIER] {pdf_path.name} → {label} ({confidence:.2%})", file=sys.stderr)

row["classification"] = label
row["confidence"] = f"{confidence:.4f}"
row["pred_prob"] = f"{pred_prob:.4f}"

# ── Step 3: Extract ───────────────────────────────────────────────
if label == "useful":
print(f" [INFO] {pdf_path.name}: Running LLM extraction...", file=sys.stderr)

text_for_llm = original_text
if len(text_for_llm) > max_chars:
text_for_llm = extract_key_sections(text_for_llm, max_chars)
print(f" [INFO] {pdf_path.name}: trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr)

try:
original_text = extract_text_from_pdf(str(pdf_path))
metrics = extract_metrics_from_text(
text=text_for_llm,
model=llm_model,
num_ctx=num_ctx,
)
result = save_extraction_result(
metrics=metrics,
source_file=pdf_path,
original_text=original_text,
output_dir=output_dir,
)

m = result["metrics"]
row["extraction_status"] = "success"
row["species_name"] = m.get("species_name") or ""
row["study_location"] = m.get("study_location") or ""
row["study_date"] = m.get("study_date") or ""
row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"]
row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"]
row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"]
row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"]

except Exception as e:
print(f" [ERROR] Text extraction failed: {e}", file=sys.stderr)
log.error("Text extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "text_extraction_failed"
summary_rows.append(row)
continue
print(f" [ERROR] LLM extraction failed ({pdf_path.name}): {e}", file=sys.stderr)
log.error("LLM extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "extraction_failed"

if not original_text.strip():
print(f" [WARN] No text extracted from {pdf_path.name}. Skipping.", file=sys.stderr)
log.warning("No text extracted from %s — skipping.", pdf_path.name)
row["extraction_status"] = "empty_text"
else:
print(f" [INFO] {pdf_path.name}: Not useful — skipping LLM extraction.", file=sys.stderr)
row["extraction_status"] = "skipped_not_useful"

return row


def run_pipeline(
input_path: Path,
model_dir: str,
llm_model: str,
output_dir: Path,
confidence_threshold: float,
max_chars: int,
num_ctx: int,
workers: int = 1,
):
"""Run classify → extract pipeline on one or more PDFs.

For each PDF:
1. Extract text via PyMuPDF / OCR (pdf_text_extraction.py)
2. Classify with XGBoost (pdf_classifier.py)
3. If 'useful': trim text to budget (llm_text.py), run LLM extraction
(llm_client.py), and save result JSON (llm_client.py)
4. Append a row to the summary CSV regardless of classification outcome

Args:
input_path: Path to a single PDF or a directory of PDFs.
model_dir: Directory containing classifier model artifacts.
llm_model: Ollama model name for extraction.
output_dir: Where to write JSON results and the summary CSV.
confidence_threshold: Classifier probability threshold for 'useful'.
max_chars: Max characters to send to the LLM.
num_ctx: Context window size for Ollama.
workers: Number of parallel worker processes (default: 1 = sequential).
"""
# ── Collect PDF paths ─────────────────────────────────────────────────
if input_path.is_dir():
pdf_paths = sorted(input_path.glob("*.pdf"))
if not pdf_paths:
print(f"[ERROR] No PDF files found in directory: {input_path}", file=sys.stderr)
sys.exit(1)
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {input_path}", file=sys.stderr)
elif input_path.is_file() and input_path.suffix.lower() == ".pdf":
pdf_paths = [input_path]
else:
print(f"[ERROR] Input must be a .pdf file or a directory of PDFs: {input_path}", file=sys.stderr)
sys.exit(1)

output_dir.mkdir(parents=True, exist_ok=True)
summary_rows = []

if workers > 1 and len(pdf_paths) > 1:
print(f"[INFO] Using {workers} worker processes.", file=sys.stderr)
with ProcessPoolExecutor(max_workers=workers) as executor:
futures = {
executor.submit(
_process_single_pdf,
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
): pdf_path
for pdf_path in pdf_paths
}
for future in as_completed(futures):
pdf_path = futures[future]
try:
row = future.result()
except Exception as exc:
print(f" [ERROR] Worker failed for {pdf_path.name}: {exc}", file=sys.stderr)
row = {"filename": pdf_path.name, "extraction_status": "worker_failed"}
summary_rows.append(row)
else:
for idx, pdf_path in enumerate(pdf_paths, start=1):
print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)
row = _process_single_pdf(
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
)
summary_rows.append(row)
continue

print(f" [INFO] Text size: {len(original_text)} chars", file=sys.stderr)

# ── Step 2: Classify ──────────────────────────────────────────────
label, confidence, pred_prob = classify_text(
text=original_text,
model=clf_model,
vectorizer=vectorizer,
encoder=encoder,
threshold=confidence_threshold,
)
print(f" [CLASSIFIER] → {label} ({confidence:.2%} confidence)", file=sys.stderr)

row["classification"] = label
row["confidence"] = f"{confidence:.4f}"
row["pred_prob"] = f"{pred_prob:.4f}"

# ── Step 3: Extract ───────────────────────────────────────────────
if label == "useful":
print(f" [INFO] Running LLM extraction...", file=sys.stderr)

text_for_llm = original_text
if len(text_for_llm) > max_chars:
text_for_llm = extract_key_sections(text_for_llm, max_chars)
print(f" [INFO] Text trimmed to {len(text_for_llm)} chars (budget {max_chars})", file=sys.stderr)

try:
metrics = extract_metrics_from_text(
text=text_for_llm,
model=llm_model,
num_ctx=num_ctx,
)
result = save_extraction_result(
metrics=metrics,
source_file=pdf_path,
original_text=original_text,
output_dir=output_dir,
)

m = result["metrics"]
row["extraction_status"] = "success"
row["species_name"] = m.get("species_name") or ""
row["study_location"] = m.get("study_location") or ""
row["study_date"] = m.get("study_date") or ""
row["sample_size"] = "" if m.get("sample_size") is None else m["sample_size"]
row["num_empty_stomachs"] = "" if m.get("num_empty_stomachs") is None else m["num_empty_stomachs"]
row["num_nonempty_stomachs"] = "" if m.get("num_nonempty_stomachs") is None else m["num_nonempty_stomachs"]
row["fraction_feeding"] = "" if m.get("fraction_feeding") is None else m["fraction_feeding"]

except Exception as e:
print(f" [ERROR] LLM extraction failed: {e}", file=sys.stderr)
log.error("LLM extraction failed for %s: %s", pdf_path.name, e)
row["extraction_status"] = "extraction_failed"

else:
print(f" [INFO] Not useful — skipping LLM extraction.", file=sys.stderr)
row["extraction_status"] = "skipped_not_useful"

summary_rows.append(row)

# ── Write summary CSV ─────────────────────────────────────────────────
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summaries_dir = output_dir / "summaries"
summaries_dir.mkdir(parents=True, exist_ok=True)
summary_path = summaries_dir / f"pipeline_summary_{timestamp}.csv"

fieldnames = [
"filename", "classification", "confidence", "pred_prob",
"extraction_status", "species_name", "study_location", "study_date",
"sample_size", "num_empty_stomachs", "num_nonempty_stomachs", "fraction_feeding",
"filename",
"classification",
"confidence",
"pred_prob",
"extraction_status",
"species_name",
"study_location",
"study_date",
"sample_size",
"num_empty_stomachs",
"num_nonempty_stomachs",
"fraction_feeding",
]
with open(summary_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
Expand Down Expand Up @@ -237,12 +333,10 @@ def run_pipeline(
# CLI entry point
# ---------------------------------------------------------------------------


def main():
parser = argparse.ArgumentParser(
description=(
"Classify PDFs as useful/not-useful, then extract structured diet "
"metrics from useful ones using an LLM."
),
description=("Classify PDFs as useful/not-useful, then extract structured diet " "metrics from useful ones using an LLM."),
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
Expand Down Expand Up @@ -301,6 +395,12 @@ def main():
default=4096,
help="Context window size for Ollama (default: 4096).",
)
parser.add_argument(
"--workers",
type=int,
default=1,
help="Number of parallel worker processes (default: 1 = sequential).",
)

args = parser.parse_args()

Expand All @@ -321,8 +421,9 @@ def main():
confidence_threshold=args.confidence_threshold,
max_chars=args.max_chars,
num_ctx=args.num_ctx,
workers=args.workers,
)


if __name__ == "__main__":
main()
main()
14 changes: 14 additions & 0 deletions data/results/metrics/Adams_1989_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"source_file": "Adams_1989.txt",
"file_type": ".txt",
"metrics": {
"species_name": null,
"study_location": "Marion Island, sub-Antarctic",
"study_date": null,
"num_empty_stomachs": null,
"num_nonempty_stomachs": null,
"sample_size": null,
"fraction_feeding": null,
"source_pages": null
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
filename,raw_chars,cleaned_chars,trimmed_chars,extraction_status,species_name,study_location,study_date,sample_size,num_empty_stomachs,num_nonempty_stomachs,fraction_feeding
Adams_1989.txt,27673,22739,4999,success,,"Marion Island, sub-Antarctic",,,,,
Loading
Loading