From fd042decd35df317d21e04e3d466be67bd03e358 Mon Sep 17 00:00:00 2001
From: SHANKAR_CHAVAN
Date: Wed, 25 Mar 2026 00:39:17 +0530
Subject: [PATCH 1/3] Fix: prevent crash when appending to non-list JSON field

---
 src/llm.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/llm.py b/src/llm.py
index 70937f9..a57e7bd 100644
--- a/src/llm.py
+++ b/src/llm.py
@@ -97,11 +97,13 @@ def add_response_to_json(self, field, value):
         if ";" in value:
             parsed_value = self.handle_plural_values(value)
 
-        if field in self._json.keys():
-            self._json[field].append(parsed_value)
-        else:
-            self._json[field] = parsed_value
-
+        if field in self._json:
+            if isinstance(self._json[field], list):
+                self._json[field].append(parsed_value)
+            else:
+                self._json[field] = [self._json[field], parsed_value]
+        else:
+            self._json[field] = parsed_value
         return
 
     def handle_plural_values(self, plural_value):

From 1ef7d685222a5ad2e69b9c08774d68e768f80976 Mon Sep 17 00:00:00 2001
From: SHANKAR_CHAVAN
Date: Sat, 28 Mar 2026 17:06:35 +0530
Subject: [PATCH 2/3] feat: add structured logging and replace print() diagnostics

---
 src/file_manipulator.py | 26 ++++++++---------
 src/llm.py              | 63 ++++++++++++++---------------------------
 src/logger.py           | 25 ++++++++++++++++
 3 files changed, 59 insertions(+), 55 deletions(-)
 create mode 100644 src/logger.py

diff --git a/src/file_manipulator.py b/src/file_manipulator.py
index b7815cc..fe38f21 100644
--- a/src/file_manipulator.py
+++ b/src/file_manipulator.py
@@ -1,8 +1,11 @@
 import os
 from src.filler import Filler
 from src.llm import LLM
+from src.logger import setup_logger
 from commonforms import prepare_form
 
+logger = setup_logger(__name__)
+
 
 class FileManipulator:
     def __init__(self):
@@ -22,26 +25,21 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
         It receives the raw data, runs the PDF filling logic, and returns the path to the newly created file.
         """
-        print("[1] Received request from frontend.")
-        print(f"[2] PDF template path: {pdf_form_path}")
+        logger.info("Received request from frontend.")
+        logger.info(f"PDF template path: {pdf_form_path}")
 
         if not os.path.exists(pdf_form_path):
-            print(f"Error: PDF template not found at {pdf_form_path}")
-            return None  # Or raise an exception
+            logger.error(f"PDF template not found at {pdf_form_path}")
+            return None
 
-        print("[3] Starting extraction and PDF filling process...")
+        logger.info("Starting extraction and PDF filling process...")
 
         try:
             self.llm._target_fields = fields
             self.llm._transcript_text = user_input
             output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)
-
-            print("\n----------------------------------")
-            print("✅ Process Complete.")
-            print(f"Output saved to: {output_name}")
-
+            logger.info("Process Complete.")
+            logger.info(f"Output saved to: {output_name}")
             return output_name
-
         except Exception as e:
-            print(f"An error occurred during PDF generation: {e}")
-            # Re-raise the exception so the frontend can handle it
-            raise e
+            logger.error(f"An error occurred during PDF generation: {e}")
+            raise e
\ No newline at end of file
diff --git a/src/llm.py b/src/llm.py
index a57e7bd..87ab743 100644
--- a/src/llm.py
+++ b/src/llm.py
@@ -2,14 +2,17 @@
 import os
 
 import requests
+from src.logger import setup_logger
+
+logger = setup_logger(__name__)
 
 class LLM:
     def __init__(self, transcript_text=None, target_fields=None, json=None):
         if json is None:
             json = {}
-        self._transcript_text = transcript_text  # str
-        self._target_fields = target_fields  # List, contains the template field.
-        self._json = json  # dictionary
+        self._transcript_text = transcript_text
+        self._target_fields = target_fields
+        self._json = json
 
     def type_check_all(self):
         if type(self._transcript_text) is not str:
@@ -24,10 +27,6 @@ def build_prompt(self, current_field):
-        """
-        This method is in charge of the prompt engineering. It creates a specific prompt for each target field.
-        @params: current_field -> represents the current element of the json that is being prompted.
-        """
         prompt = f"""
         SYSTEM PROMPT:
         You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings.
         You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return
         only a single string containing the identified value for the JSON field.
         If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";".
@@ -41,22 +40,18 @@ def build_prompt(self, current_field):
 
         TEXT: {self._transcript_text}
         """
-
         return prompt
 
     def main_loop(self):
-        # self.type_check_all()
         for field in self._target_fields.keys():
             prompt = self.build_prompt(field)
-            # print(prompt)
-            # ollama_url = "http://localhost:11434/api/generate"
             ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/")
             ollama_url = f"{ollama_host}/api/generate"
 
             payload = {
                 "model": "mistral",
                 "prompt": prompt,
-                "stream": False,  # don't really know why --> look into this later.
+                "stream": False,
             }
 
             try:
@@ -70,24 +65,18 @@ def main_loop(self):
             except requests.exceptions.HTTPError as e:
                 raise RuntimeError(f"Ollama returned an error: {e}")
 
-            # parse response
             json_data = response.json()
             parsed_response = json_data["response"]
-            # print(parsed_response)
             self.add_response_to_json(field, parsed_response)
 
-        print("----------------------------------")
-        print("\t[LOG] Resulting JSON created from the input text:")
-        print(json.dumps(self._json, indent=2))
-        print("--------- extracted data ---------")
+        logger.info("----------------------------------")
+        logger.info("Resulting JSON created from the input text:")
+        logger.info(json.dumps(self._json, indent=2))
+        logger.info("--------- extracted data ---------")
 
         return self
 
     def add_response_to_json(self, field, value):
-        """
-        this method adds the following value under the specified field,
-        or under a new field if the field doesn't exist, to the json dict
-        """
         value = value.strip().replace('"', "")
 
         parsed_value = None
@@ -97,40 +86,32 @@ def add_response_to_json(self, field, value):
         if ";" in value:
             parsed_value = self.handle_plural_values(value)
 
-        if field in self._json:
-            if isinstance(self._json[field], list):
-                self._json[field].append(parsed_value)
-            else:
-                self._json[field] = [self._json[field], parsed_value]
-        else:
-            self._json[field] = parsed_value
+        if field in self._json:
+            if isinstance(self._json[field], list):
+                self._json[field].append(parsed_value)
+            else:
+                self._json[field] = [self._json[field], parsed_value]
+        else:
+            self._json[field] = parsed_value
+
         return
 
     def handle_plural_values(self, plural_value):
-        """
-        This method handles plural values.
-        Takes in strings of the form 'value1; value2; value3; ...; valueN'
-        returns a list with the respective values -> [value1, value2, value3, ..., valueN]
-        """
         if ";" not in plural_value:
             raise ValueError(
                 f"Value is not plural, doesn't have ; separator, Value: {plural_value}"
             )
 
-        print(
-            f"\t[LOG]: Formating plural values for JSON, [For input {plural_value}]..."
-        )
+        logger.info(f"Formatting plural values for JSON, input: {plural_value}")
         values = plural_value.split(";")
 
-        # Remove trailing leading whitespace
         for i in range(len(values)):
             current = i + 1
             if current < len(values):
                 clean_value = values[current].lstrip()
                 values[current] = clean_value
-        print(f"\t[LOG]: Resulting formatted list of values: {values}")
-
+        logger.info(f"Resulting formatted list of values: {values}")
         return values
 
     def get_data(self):
-        return self._json
+        return self._json
\ No newline at end of file
diff --git a/src/logger.py b/src/logger.py
new file mode 100644
index 0000000..d8eab83
--- /dev/null
+++ b/src/logger.py
@@ -0,0 +1,25 @@
+import logging
+
+
+def setup_logger(name: str):
+    """
+    Sets up and returns a logger with the given name.
+    Avoids adding duplicate handlers if logger already exists.
+    """
+    logger = logging.getLogger(name)
+
+    if logger.handlers:
+        return logger
+
+    logger.setLevel(logging.INFO)
+
+    handler = logging.StreamHandler()
+
+    formatter = logging.Formatter(
+        "%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    return logger
\ No newline at end of file

From 9fdcd5f53dd53fc0cbcd0f3592858e49e180ff03 Mon Sep 17 00:00:00 2001
From: SHANKAR_CHAVAN
Date: Sun, 29 Mar 2026 18:54:18 +0530
Subject: [PATCH 3/3] feat: add per-field confidence scoring and requires_review flag to LLM extraction

---
 src/llm.py                  | 86 ++++++++++++++++++++++++++++++++++---
 src/test/test_confidence.py | 77 +++++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+), 7 deletions(-)
 create mode 100644 src/test/test_confidence.py

diff --git a/src/llm.py b/src/llm.py
index 87ab743..ab370e1 100644
--- a/src/llm.py
+++ b/src/llm.py
@@ -1,5 +1,6 @@
 import json
 import os
+import re
 
 import requests
 from src.logger import setup_logger
@@ -27,17 +28,17 @@ def type_check_all(self):
         )
 
     def build_prompt(self, current_field):
-        prompt = f""" 
+        prompt = f"""
         SYSTEM PROMPT:
-        You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings. 
-        You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return 
-        only a single string containing the identified value for the JSON field. 
+        You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings.
+        You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return
+        only a single string containing the identified value for the JSON field.
         If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";".
         If you don't identify the value in the provided text, return "-1".
         ---
         DATA:
         Target JSON field to find in text: {current_field}
-        
+
         TEXT: {self._transcript_text}
         """
         return prompt
@@ -68,14 +69,20 @@ def main_loop(self):
             json_data = response.json()
             parsed_response = json_data["response"]
             self.add_response_to_json(field, parsed_response)
 
+        # ── NEW: build confidence-scored result ──
+        extraction_result = self.build_extraction_result()
         logger.info("----------------------------------")
-        logger.info("Resulting JSON created from the input text:")
-        logger.info(json.dumps(self._json, indent=2))
+        logger.info("Resulting JSON with confidence scores:")
+        logger.info(json.dumps(extraction_result, indent=2))
         logger.info("--------- extracted data ---------")
 
         return self
 
     def add_response_to_json(self, field, value):
+        """
+        Adds the value under the specified field.
+        Existing PR #337 fix preserved — handles non-list fields safely.
+        """
         value = value.strip().replace('"', "")
 
         parsed_value = None
@@ -113,5 +120,70 @@ def handle_plural_values(self, plural_value):
         logger.info(f"Resulting formatted list of values: {values}")
         return values
 
+    def _compute_field_confidence(self, value) -> float:
+        """
+        Heuristic confidence scoring for an extracted field value.
+        Returns a float between 0.0 and 1.0.
+        """
+        if value is None or value == "" or value == "-1":
+            return 0.0
+        if isinstance(value, list):
+            return 0.85 if len(value) > 0 else 0.0
+        if isinstance(value, str):
+            vague_patterns = [
+                r"not (specified|mentioned|provided|found|available)",
+                r"^n/?a$",
+                r"^\?+$",
+                r"^unknown$"
+            ]
+            for pattern in vague_patterns:
+                if re.search(pattern, value.strip(), re.IGNORECASE):
+                    return 0.2
+            if len(value.strip()) < 3:
+                return 0.3
+            return 0.9
+        return 0.8
+
+    def build_extraction_result(self) -> dict:
+        """
+        Wraps each extracted field with a confidence score.
+        Adds top-level _meta block with requires_review flag
+        when any field confidence is below threshold.
+        """
+        CONFIDENCE_THRESHOLD = 0.5
+        result = {}
+        low_confidence_fields = []
+
+        for field, value in self._json.items():
+            score = self._compute_field_confidence(value)
+            result[field] = {
+                "value": value,
+                "confidence": round(score, 2)
+            }
+            if score < CONFIDENCE_THRESHOLD:
+                low_confidence_fields.append(field)
+
+        total_fields = len(self._json)
+        overall = (
+            round(sum(result[f]["confidence"] for f in result) / total_fields, 2)
+            if total_fields > 0 else 0.0
+        )
+
+        result["_meta"] = {
+            "requires_review": len(low_confidence_fields) > 0,
+            "low_confidence_fields": low_confidence_fields,
+            "overall_confidence": overall
+        }
+
+        if result["_meta"]["requires_review"]:
+            logger.warning(
+                "Extraction requires human review. Low confidence fields: %s",
+                low_confidence_fields
+            )
+        else:
+            logger.info("Extraction complete. All fields passed confidence threshold.")
+
+        return result
+
     def get_data(self):
         return self._json
\ No newline at end of file
diff --git a/src/test/test_confidence.py b/src/test/test_confidence.py
new file mode 100644
index 0000000..908d76f
--- /dev/null
+++ b/src/test/test_confidence.py
@@ -0,0 +1,77 @@
+import sys
+import os
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+import pytest
+from src.llm import LLM
+
+
+@pytest.fixture
+def llm():
+    fields = {"name": None, "date": None, "location": None}
+    return LLM(transcript_text="John went to Mumbai on 2024-01-15", target_fields=fields)
+
+
+# --- _compute_field_confidence tests ---
+
+def test_confidence_none_value(llm):
+    assert llm._compute_field_confidence(None) == 0.0
+
+def test_confidence_empty_string(llm):
+    assert llm._compute_field_confidence("") == 0.0
+
+def test_confidence_minus_one(llm):
+    assert llm._compute_field_confidence("-1") == 0.0
+
+def test_confidence_normal_string(llm):
+    assert llm._compute_field_confidence("John Smith") >= 0.8
+
+def test_confidence_vague_not_specified(llm):
+    assert llm._compute_field_confidence("not specified") < 0.5
+
+def test_confidence_vague_na(llm):
+    assert llm._compute_field_confidence("N/A") < 0.5
+
+def test_confidence_vague_unknown(llm):
+    assert llm._compute_field_confidence("unknown") < 0.5
+
+def test_confidence_short_string(llm):
+    assert llm._compute_field_confidence("ab") < 0.5
+
+def test_confidence_plural_list(llm):
+    assert llm._compute_field_confidence(["val1", "val2"]) == 0.85
+
+def test_confidence_empty_list(llm):
+    assert llm._compute_field_confidence([]) == 0.0
+
+
+# --- build_extraction_result tests ---
+
+def test_requires_review_true_when_low_confidence(llm):
+    llm._json = {"name": "John", "date": None, "location": "N/A"}
+    result = llm.build_extraction_result()
+    assert result["_meta"]["requires_review"] is True
+    assert "date" in result["_meta"]["low_confidence_fields"]
+
+def test_requires_review_false_when_all_confident(llm):
+    llm._json = {"name": "John Smith", "date": "2024-01-15", "location": "Mumbai"}
+    result = llm.build_extraction_result()
+    assert result["_meta"]["requires_review"] is False
+    assert result["_meta"]["low_confidence_fields"] == []
+
+def test_overall_confidence_is_average(llm):
+    llm._json = {"name": "Alice", "date": "2024-03-01"}
+    result = llm.build_extraction_result()
+    assert 0.0 <= result["_meta"]["overall_confidence"] <= 1.0
+
+def test_each_field_has_value_and_confidence(llm):
+    llm._json = {"name": "Bob"}
+    result = llm.build_extraction_result()
+    assert "value" in result["name"]
+    assert "confidence" in result["name"]
+
+def test_empty_json_returns_zero_confidence(llm):
+    llm._json = {}
+    result = llm.build_extraction_result()
+    assert result["_meta"]["overall_confidence"] == 0.0
+    assert result["_meta"]["requires_review"] is False
\ No newline at end of file