Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
"""Application entry point.

Builds the FastAPI app, installs the project's custom exception handling,
and mounts the template and form routers. ASGI servers (e.g. uvicorn)
import the module-level ``app`` object by name.
"""
from fastapi import FastAPI
from api.routes import templates, forms
from api.errors.handlers import register_exception_handlers

app = FastAPI()

# Register custom exception handlers so AppError is turned into a proper
# JSON response (e.g. {"error": "Template not found"} with status 404)
# instead of crashing with an unhandled 500.
register_exception_handlers(app)

# Mount the two route groups; each router defines its own prefix/tags.
app.include_router(templates.router)
app.include_router(forms.router)
22 changes: 15 additions & 7 deletions api/routes/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,25 @@

router = APIRouter(prefix="/forms", tags=["forms"])


@router.post("/fill", response_model=FormFillResponse)
def fill_form(form: FormFill, db: Session = Depends(get_db)):
if not get_template(db, form.template_id):
raise AppError("Template not found", status_code=404)

# Single DB hit — store the result and reuse it.
# Previously get_template() was called twice (once to check, once to fetch),
# which doubled the DB round-trips for every request.
fetched_template = get_template(db, form.template_id)
if not fetched_template:
raise AppError("Template not found", status_code=404)

controller = Controller()
path = controller.fill_form(user_input=form.input_text, fields=fetched_template.fields, pdf_form_path=fetched_template.pdf_path)
try:
controller = Controller()
path = controller.fill_form(
user_input=form.input_text,
fields=fetched_template.fields,
pdf_form_path=fetched_template.pdf_path,
)
except FileNotFoundError as e:
raise AppError(str(e), status_code=422)

submission = FormSubmission(**form.model_dump(), output_pdf_path=path)
return create_form(db, submission)


41 changes: 23 additions & 18 deletions src/file_manipulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
class FileManipulator:
def __init__(self):
self.filler = Filler()
self.llm = LLM()
# NOTE: We intentionally do NOT store a shared LLM instance here.
# LLM holds per-request mutable state (_transcript_text, _target_fields, _json).
# Sharing one instance across concurrent requests would cause a race condition
# where two requests overwrite each other's data. A fresh LLM is created
# inside fill_form() so each request owns its own isolated instance.

def create_template(self, pdf_path: str):
"""
Expand All @@ -17,31 +21,32 @@ def create_template(self, pdf_path: str):
prepare_form(pdf_path, template_path)
return template_path

def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
def fill_form(self, user_input: str, fields: dict, pdf_form_path: str):
"""
It receives the raw data, runs the PDF filling logic,
and returns the path to the newly created file.
Receives the raw transcript + template fields, runs the LLM extraction +
PDF filling pipeline, and returns the path to the newly created filled PDF.

A new LLM instance is created on every call to guarantee full isolation
between concurrent requests — no shared mutable state.
"""
print("[1] Received request from frontend.")
print(f"[2] PDF template path: {pdf_form_path}")

if not os.path.exists(pdf_form_path):
print(f"Error: PDF template not found at {pdf_form_path}")
return None # Or raise an exception
raise FileNotFoundError(
f"PDF template not found at '{pdf_form_path}'. "
"Please verify the template path stored in the database is correct."
)

print("[3] Starting extraction and PDF filling process...")
try:
self.llm._target_fields = fields
self.llm._transcript_text = user_input
output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)

print("\n----------------------------------")
print("✅ Process Complete.")
print(f"Output saved to: {output_name}")
# Fresh LLM instance scoped to this request only.
llm = LLM(transcript_text=user_input, target_fields=fields)

output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=llm)

return output_name
print("\n----------------------------------")
print("✅ Process Complete.")
print(f"Output saved to: {output_name}")

except Exception as e:
print(f"An error occurred during PDF generation: {e}")
# Re-raise the exception so the frontend can handle it
raise e
return output_name
89 changes: 65 additions & 24 deletions src/filler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,31 @@ class Filler:
def __init__(self):
pass

@staticmethod
def _decode_pdf_field_name(annot_T) -> str:
"""
pdfrw stores field names as PDF string objects like b'(Employee Name)' or '(date)'.
This helper strips the surrounding parentheses to get the plain field name string.
"""
raw = str(annot_T)
if raw.startswith("(") and raw.endswith(")"):
return raw[1:-1]
return raw

def fill_form(self, pdf_form: str, llm: LLM):
"""
Fill a PDF form with values from user_input using LLM.
Fields are filled in the visual order (top-to-bottom, left-to-right).
Fill a PDF form with values extracted by the LLM.

Matching strategy: field-name-based (not positional).

For every Widget annotation in the PDF we read annot.T (the PDF field name),
look that name up directly in the LLM-produced answers dict, and write the
matched value. This is safe regardless of annotation order because we never
rely on position/index to pair a value with a field.

If the PDF field name has no match in the LLM result we leave it blank
rather than silently writing a wrong value. Plural answers (lists) are
joined with '; ' so the field stays human-readable.
"""
output_pdf = (
pdf_form[:-4]
Expand All @@ -19,34 +40,54 @@ def fill_form(self, pdf_form: str, llm: LLM):
+ "_filled.pdf"
)

# Generate dictionary of answers from your original function
t2j = llm.main_loop()
textbox_answers = t2j.get_data() # This is a dictionary
# Run the LLM extraction pipeline → {field_name: value | list | None}
answers: dict = llm.main_loop().get_data()

print(f"\t[LOG] Filler received {len(answers)} answer(s) from LLM.")

# Build a lowercase-keyed lookup so minor capitalisation differences
# between the stored template and the PDF's own field labels don't block a match.
normalised_answers = {k.lower().strip(): v for k, v in answers.items()}

answers_list = list(textbox_answers.values())
unmatched_pdf_fields = []

# Read PDF
pdf = PdfReader(pdf_form)

# Loop through pages
for page in pdf.pages:
if page.Annots:
sorted_annots = sorted(
page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0]))
)

i = 0
for annot in sorted_annots:
if annot.Subtype == "/Widget" and annot.T:
if i < len(answers_list):
annot.V = f"{answers_list[i]}"
annot.AP = None
i += 1
else:
# Stop if we run out of answers
break
if not page.Annots:
continue

PdfWriter().write(output_pdf, pdf)
for annot in page.Annots:
if annot.Subtype != "/Widget" or not annot.T:
continue

pdf_field_name = self._decode_pdf_field_name(annot.T)
lookup_key = pdf_field_name.lower().strip()

if lookup_key in normalised_answers:
raw_value = normalised_answers[lookup_key]

if raw_value is None:
# LLM could not find the value — write empty string, not "None"
annot.V = ""
elif isinstance(raw_value, list):
# Plural values (e.g. multiple engines) → join for readability
annot.V = "; ".join(str(v) for v in raw_value if v is not None)
else:
annot.V = str(raw_value)

# Your main.py expects this function to return the path
# Clear the pre-rendered appearance stream so the viewer
# re-renders with the new value.
annot.AP = None
else:
unmatched_pdf_fields.append(pdf_field_name)

if unmatched_pdf_fields:
print(
f"\t[WARN] {len(unmatched_pdf_fields)} PDF field(s) had no matching "
f"LLM answer and were left blank: {unmatched_pdf_fields}"
)

PdfWriter().write(output_pdf, pdf)
return output_pdf
18 changes: 12 additions & 6 deletions src/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,23 @@ def build_prompt(self, current_field):

def main_loop(self):
# self.type_check_all()

# Resolve host and model once outside the loop — they are constant
# for the entire request and there is no need to re-read env vars
# on every field iteration.
ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/")
ollama_url = f"{ollama_host}/api/generate"
# OLLAMA_MODEL lets operators swap models (llama3, phi3, etc.) via environment
# variable without touching source code. Defaults to "mistral" for compatibility.
ollama_model = os.getenv("OLLAMA_MODEL", "mistral")

for field in self._target_fields.keys():
prompt = self.build_prompt(field)
# print(prompt)
# ollama_url = "http://localhost:11434/api/generate"
ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/")
ollama_url = f"{ollama_host}/api/generate"

payload = {
"model": "mistral",
"model": ollama_model,
"prompt": prompt,
"stream": False, # don't really know why --> look into this later.
"stream": False,
}

try:
Expand Down