Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions src/file_manipulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from src.filler import Filler
from src.llm import LLM
from commonforms import prepare_form

from src.privacy import PrivacyManager

class FileManipulator:
def __init__(self):
Expand Down Expand Up @@ -31,9 +31,23 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str):

print("[3] Starting extraction and PDF filling process...")
try:
# --- PRIVACY INTERCEPTION START ---
privacy = PrivacyManager()
safe_input = privacy.tokenize(user_input)

self.llm._target_fields = fields
self.llm._transcript_text = user_input
output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)
self.llm._transcript_text = safe_input

# Execute LLM here
self.llm.main_loop()
tokenized_dict = self.llm.get_data()

# Unmask data back to real values
real_data_dict = privacy.detokenize(tokenized_dict)
# --- PRIVACY INTERCEPTION END ---

# Pass the unmasked dictionary to the filler
output_name = self.filler.fill_form(pdf_form=pdf_form_path, manual_data=real_data_dict)

print("\n----------------------------------")
print("✅ Process Complete.")
Expand All @@ -43,5 +57,4 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str):

except Exception as e:
print(f"An error occurred during PDF generation: {e}")
# Re-raise the exception so the frontend can handle it
raise e
9 changes: 3 additions & 6 deletions src/filler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class Filler:
def __init__(self):
pass

def fill_form(self, pdf_form: str, llm: LLM):
def fill_form(self, pdf_form: str, manual_data: dict): # Changed parameter
"""
Fill a PDF form with values from user_input using LLM.
Fields are filled in the visual order (top-to-bottom, left-to-right).
Expand All @@ -19,11 +19,8 @@ def fill_form(self, pdf_form: str, llm: LLM):
+ "_filled.pdf"
)

# Generate dictionary of answers from your original function
t2j = llm.main_loop()
textbox_answers = t2j.get_data() # This is a dictionary

answers_list = list(textbox_answers.values())
# Generate list from the real_data dictionary passed in
answers_list = list(manual_data.values())

# Read PDF
pdf = PdfReader(pdf_form)
Expand Down
29 changes: 29 additions & 0 deletions src/privacy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import re
import uuid
import json

class PrivacyManager:
    """Mask e-mail addresses and phone numbers in text and restore them later.

    ``tokenize`` swaps every regex match for an opaque placeholder of the form
    ``TOKEN_<LABEL>_<6 HEX CHARS>`` and remembers the mapping on the instance;
    ``detokenize`` walks a data structure and swaps the placeholders back.
    The same instance must be used for both steps, because the mapping lives
    in ``self._pii_map``.
    """

    def __init__(self):
        # token -> original PII value; filled by tokenize(), read by detokenize().
        self._pii_map = {}
        # Simple regex for emails and phone numbers
        self.patterns = {
            "EMAIL": r'[\w\.-]+@[\w\.-]+\.\w+',
            "PHONE": r'\b(?:\+?\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b'
        }

    def _new_token(self, label: str) -> str:
        """Return a placeholder for *label* that is not yet in the map."""
        # Only 6 hex chars (24 bits) are kept, so retry on the rare collision
        # instead of silently overwriting an existing mapping.
        while True:
            token = f"TOKEN_{label}_{uuid.uuid4().hex[:6].upper()}"
            if token not in self._pii_map:
                return token

    def tokenize(self, text: str) -> str:
        """Return *text* with every pattern match replaced by a placeholder.

        Identical PII values (within this call or across earlier calls on the
        same instance) share one placeholder, so the mapping holds exactly one
        entry per distinct value.

        Args:
            text: Raw input that may contain e-mail addresses or phone numbers.

        Returns:
            The text with each match substituted by its placeholder token.
        """
        # Reverse view of the mapping so repeated values reuse their token.
        token_by_value = {value: token for token, value in self._pii_map.items()}

        def make_replacer(label):
            def replace(match):
                value = match.group(0)
                token = token_by_value.get(value)
                if token is None:
                    token = self._new_token(label)
                    self._pii_map[token] = value
                    token_by_value[value] = token
                return token
            return replace

        tokenized_text = text
        for label, pattern in self.patterns.items():
            # re.sub with a callback replaces exactly the matched spans, in one
            # pass, rather than doing literal substring replacement per match.
            tokenized_text = re.sub(pattern, make_replacer(label), tokenized_text)
        return tokenized_text

    def _restore(self, value):
        """Recursively replace placeholder tokens inside *value*."""
        if isinstance(value, str):
            for token, original_value in self._pii_map.items():
                value = value.replace(token, original_value)
            return value
        if isinstance(value, dict):
            # Keys are restored too: a placeholder may end up in a key.
            return {self._restore(k): self._restore(v) for k, v in value.items()}
        if isinstance(value, list):
            return [self._restore(item) for item in value]
        if isinstance(value, tuple):
            return tuple(self._restore(item) for item in value)
        # Numbers, None, booleans and any other objects pass through untouched.
        return value

    def detokenize(self, tokenized_data: dict) -> dict:
        """Return a copy of *tokenized_data* with placeholders restored.

        The structure is walked directly instead of being round-tripped
        through JSON text, so non-string keys keep their type, values that
        are not JSON-serializable do not raise, and restored values never
        need JSON escaping.

        Args:
            tokenized_data: Dictionary (possibly nested with dicts, lists and
                tuples) whose strings may contain placeholder tokens.

        Returns:
            A new dictionary with every known placeholder replaced by the
            original value recorded during ``tokenize``.
        """
        return self._restore(tokenized_data)