diff --git a/api/main.py b/api/main.py index d0b8c79..583e8c3 100644 --- a/api/main.py +++ b/api/main.py @@ -1,7 +1,13 @@ from fastapi import FastAPI from api.routes import templates, forms +from api.errors.handlers import register_exception_handlers app = FastAPI() +# Register custom exception handlers so AppError is turned into a proper +# JSON response (e.g. {"error": "Template not found"} with status 404) +# instead of crashing with an unhandled 500. +register_exception_handlers(app) + app.include_router(templates.router) app.include_router(forms.router) \ No newline at end of file diff --git a/api/routes/forms.py b/api/routes/forms.py index f3430ed..36a8262 100644 --- a/api/routes/forms.py +++ b/api/routes/forms.py @@ -9,17 +9,25 @@ router = APIRouter(prefix="/forms", tags=["forms"]) + @router.post("/fill", response_model=FormFillResponse) def fill_form(form: FormFill, db: Session = Depends(get_db)): - if not get_template(db, form.template_id): - raise AppError("Template not found", status_code=404) - + # Single DB hit — store the result and reuse it. + # Previously get_template() was called twice (once to check, once to fetch), + # which doubled the DB round-trips for every request. 
fetched_template = get_template(db, form.template_id) + if not fetched_template: + raise AppError("Template not found", status_code=404) - controller = Controller() - path = controller.fill_form(user_input=form.input_text, fields=fetched_template.fields, pdf_form_path=fetched_template.pdf_path) + try: + controller = Controller() + path = controller.fill_form( + user_input=form.input_text, + fields=fetched_template.fields, + pdf_form_path=fetched_template.pdf_path, + ) + except FileNotFoundError as e: + raise AppError(str(e), status_code=422) submission = FormSubmission(**form.model_dump(), output_pdf_path=path) return create_form(db, submission) - - diff --git a/src/file_manipulator.py b/src/file_manipulator.py index b7815cc..90e213c 100644 --- a/src/file_manipulator.py +++ b/src/file_manipulator.py @@ -7,7 +7,11 @@ class FileManipulator: def __init__(self): self.filler = Filler() - self.llm = LLM() + # NOTE: We intentionally do NOT store a shared LLM instance here. + # LLM holds per-request mutable state (_transcript_text, _target_fields, _json). + # Sharing one instance across concurrent requests would cause a race condition + # where two requests overwrite each other's data. A fresh LLM is created + # inside fill_form() so each request owns its own isolated instance. def create_template(self, pdf_path: str): """ @@ -17,31 +21,32 @@ def create_template(self, pdf_path: str): prepare_form(pdf_path, template_path) return template_path - def fill_form(self, user_input: str, fields: list, pdf_form_path: str): + def fill_form(self, user_input: str, fields: dict, pdf_form_path: str): """ - It receives the raw data, runs the PDF filling logic, - and returns the path to the newly created file. + Receives the raw transcript + template fields, runs the LLM extraction + + PDF filling pipeline, and returns the path to the newly created filled PDF. 
+ + A new LLM instance is created on every call to guarantee full isolation + between concurrent requests — no shared mutable state. """ print("[1] Received request from frontend.") print(f"[2] PDF template path: {pdf_form_path}") if not os.path.exists(pdf_form_path): - print(f"Error: PDF template not found at {pdf_form_path}") - return None # Or raise an exception + raise FileNotFoundError( + f"PDF template not found at '{pdf_form_path}'. " + "Please verify the template path stored in the database is correct." + ) print("[3] Starting extraction and PDF filling process...") - try: - self.llm._target_fields = fields - self.llm._transcript_text = user_input - output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm) - print("\n----------------------------------") - print("✅ Process Complete.") - print(f"Output saved to: {output_name}") + # Fresh LLM instance scoped to this request only. + llm = LLM(transcript_text=user_input, target_fields=fields) + + output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=llm) - return output_name + print("\n----------------------------------") + print("✅ Process Complete.") + print(f"Output saved to: {output_name}") - except Exception as e: - print(f"An error occurred during PDF generation: {e}") - # Re-raise the exception so the frontend can handle it - raise e + return output_name diff --git a/src/filler.py b/src/filler.py index e31e535..c9a427b 100644 --- a/src/filler.py +++ b/src/filler.py @@ -7,10 +7,31 @@ class Filler: def __init__(self): pass + @staticmethod + def _decode_pdf_field_name(annot_T) -> str: + """ + pdfrw stores field names as PDF string objects like b'(Employee Name)' or '(date)'. + This helper strips the surrounding parentheses to get the plain field name string. + """ + raw = str(annot_T) + if raw.startswith("(") and raw.endswith(")"): + return raw[1:-1] + return raw + def fill_form(self, pdf_form: str, llm: LLM): """ - Fill a PDF form with values from user_input using LLM. 
- Fields are filled in the visual order (top-to-bottom, left-to-right). + Fill a PDF form with values extracted by the LLM. + + Matching strategy: field-name-based (not positional). + + For every Widget annotation in the PDF we read annot.T (the PDF field name), + look that name up — case-insensitively, via a lowercased/stripped key — in the + LLM-produced answers dict, and write the + matched value. This is safe regardless of annotation order because we never + rely on position/index to pair a value with a field. + + If the PDF field name has no match in the LLM result we leave it blank + rather than silently writing a wrong value. Plural answers (lists) are + joined with '; ' so the field stays human-readable. """ output_pdf = ( pdf_form[:-4]
+ normalised_answers = {k.lower().strip(): v for k, v in answers.items()} - answers_list = list(textbox_answers.values()) + unmatched_pdf_fields = [] # Read PDF pdf = PdfReader(pdf_form) - # Loop through pages for page in pdf.pages: - if page.Annots: - sorted_annots = sorted( - page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0])) - ) - - i = 0 - for annot in sorted_annots: - if annot.Subtype == "/Widget" and annot.T: - if i < len(answers_list): - annot.V = f"{answers_list[i]}" - annot.AP = None - i += 1 - else: - # Stop if we run out of answers - break + if not page.Annots: + continue - PdfWriter().write(output_pdf, pdf) + for annot in page.Annots: + if annot.Subtype != "/Widget" or not annot.T: + continue + + pdf_field_name = self._decode_pdf_field_name(annot.T) + lookup_key = pdf_field_name.lower().strip() + + if lookup_key in normalised_answers: + raw_value = normalised_answers[lookup_key] + + if raw_value is None: + # LLM could not find the value — write empty string, not "None" + annot.V = "" + elif isinstance(raw_value, list): + # Plural values (e.g. multiple engines) → join for readability + annot.V = "; ".join(str(v) for v in raw_value if v is not None) + else: + annot.V = str(raw_value) - # Your main.py expects this function to return the path + # Clear the pre-rendered appearance stream so the viewer + # re-renders with the new value. 
+ annot.AP = None + else: + unmatched_pdf_fields.append(pdf_field_name) + + if unmatched_pdf_fields: + print( + f"\t[WARN] {len(unmatched_pdf_fields)} PDF field(s) had no matching " + f"LLM answer and were left blank: {unmatched_pdf_fields}" + ) + + PdfWriter().write(output_pdf, pdf) return output_pdf diff --git a/src/llm.py b/src/llm.py index 70937f9..ad18321 100644 --- a/src/llm.py +++ b/src/llm.py @@ -46,17 +46,23 @@ def build_prompt(self, current_field): def main_loop(self): # self.type_check_all() + + # Resolve host and model once outside the loop — they are constant + # for the entire request and there is no need to re-read env vars + # on every field iteration. + ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/") + ollama_url = f"{ollama_host}/api/generate" + # OLLAMA_MODEL lets operators swap models (llama3, phi3, etc.) via environment + # variable without touching source code. Defaults to "mistral" for compatibility. + ollama_model = os.getenv("OLLAMA_MODEL", "mistral") + for field in self._target_fields.keys(): prompt = self.build_prompt(field) - # print(prompt) - # ollama_url = "http://localhost:11434/api/generate" - ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/") - ollama_url = f"{ollama_host}/api/generate" payload = { - "model": "mistral", + "model": ollama_model, "prompt": prompt, - "stream": False, # don't really know why --> look into this later. + "stream": False, } try: