diff --git a/src/file_manipulator.py b/src/file_manipulator.py index b7815cc..da7c5c2 100644 --- a/src/file_manipulator.py +++ b/src/file_manipulator.py @@ -2,7 +2,7 @@ from src.filler import Filler from src.llm import LLM from commonforms import prepare_form - +from src.privacy import PrivacyManager class FileManipulator: def __init__(self): @@ -31,9 +31,23 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str): print("[3] Starting extraction and PDF filling process...") try: + # --- PRIVACY INTERCEPTION START --- + privacy = PrivacyManager() + safe_input = privacy.tokenize(user_input) + self.llm._target_fields = fields - self.llm._transcript_text = user_input - output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm) + self.llm._transcript_text = safe_input + + # Execute LLM here + self.llm.main_loop() + tokenized_dict = self.llm.get_data() + + # Unmask data back to real values + real_data_dict = privacy.detokenize(tokenized_dict) + # --- PRIVACY INTERCEPTION END --- + + # Pass the unmasked dictionary to the filler + output_name = self.filler.fill_form(pdf_form=pdf_form_path, manual_data=real_data_dict) print("\n----------------------------------") print("✅ Process Complete.") @@ -43,5 +57,4 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str): except Exception as e: print(f"An error occurred during PDF generation: {e}") - # Re-raise the exception so the frontend can handle it raise e diff --git a/src/filler.py b/src/filler.py index e31e535..d87bdd7 100644 --- a/src/filler.py +++ b/src/filler.py @@ -7,7 +7,7 @@ class Filler: def __init__(self): pass - def fill_form(self, pdf_form: str, llm: LLM): + def fill_form(self, pdf_form: str, manual_data: dict): # Changed parameter """ Fill a PDF form with values from user_input using LLM. Fields are filled in the visual order (top-to-bottom, left-to-right). 
import json
import re
import uuid


class PrivacyManager:
    """Swap PII found in free text for opaque tokens and restore it later.

    ``tokenize`` replaces every regex match from ``patterns`` with a
    ``TOKEN_<LABEL>_<6 hex chars>`` placeholder and remembers the original
    value; ``detokenize`` puts the remembered values back into a structure
    that still contains those placeholders.

    One instance holds one token map, so the same instance must be used for
    both directions.
    """

    def __init__(self):
        # token -> original value; written by tokenize(), read by detokenize().
        self._pii_map = {}
        # Simple regexes for emails and phone numbers, keyed by token label.
        self.patterns = {
            "EMAIL": r'[\w\.-]+@[\w\.-]+\.\w+',
            "PHONE": r'\b(?:\+?\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b',
        }

    def _new_token(self, label: str) -> str:
        """Return a token for *label* that is not yet present in the map."""
        while True:
            token = f"TOKEN_{label}_{uuid.uuid4().hex[:6].upper()}"
            # Only 6 hex characters are kept, so guard against a collision
            # silently overwriting an earlier mapping.
            if token not in self._pii_map:
                return token

    def tokenize(self, text: str) -> str:
        """Return *text* with every pattern match replaced by a token.

        Patterns are applied in the order of ``self.patterns``. A value that
        occurs several times (or was seen in an earlier call) always maps to
        the same token, so the map holds exactly one entry per distinct value.

        Args:
            text: Raw input that may contain PII.

        Returns:
            The masked text. The token -> value mapping is stored on the
            instance for ``detokenize``.
        """
        # Reverse view of the map so repeated values reuse their token.
        token_by_value = {value: token for token, value in self._pii_map.items()}

        tokenized_text = text
        for label, pattern in self.patterns.items():

            def mask(match, label=label):
                value = match.group(0)
                token = token_by_value.get(value)
                if token is None:
                    token = self._new_token(label)
                    self._pii_map[token] = value
                    token_by_value[value] = token
                return token

            # re.sub rewrites exactly the matched spans; a findall + str.replace
            # pass would also rewrite look-alike substrings that are not matches
            # and mint an unused token for every duplicate occurrence.
            tokenized_text = re.sub(pattern, mask, tokenized_text)
        return tokenized_text

    def detokenize(self, tokenized_data: dict) -> dict:
        """Return a copy of *tokenized_data* with tokens replaced by originals.

        The structure is walked recursively: strings (including string dict
        keys) have their tokens restored, dicts and lists/tuples are rebuilt
        (tuples come back as lists), and every other value is passed through
        untouched. Working on the structure instead of on a JSON dump means
        restored values containing quotes or backslashes cannot corrupt the
        result, and values that are not JSON-serialisable no longer fail.

        Args:
            tokenized_data: Data (normally a dict) that may contain tokens
                produced by ``tokenize`` on this instance.

        Returns:
            A new structure of the same shape with the real values restored.
        """
        if self._pii_map:
            # Longest token first so a token that happens to be a prefix of
            # another one can never shadow it in the alternation.
            ordered = sorted(self._pii_map, key=len, reverse=True)
            token_re = re.compile("|".join(re.escape(token) for token in ordered))
        else:
            token_re = None

        def restore_text(value: str) -> str:
            if token_re is None:
                return value
            # Single pass: text inserted by one replacement is never rescanned.
            return token_re.sub(lambda m: self._pii_map[m.group(0)], value)

        def restore(node):
            if isinstance(node, str):
                return restore_text(node)
            if isinstance(node, dict):
                return {
                    (restore_text(key) if isinstance(key, str) else key): restore(item)
                    for key, item in node.items()
                }
            if isinstance(node, (list, tuple)):
                return [restore(item) for item in node]
            return node

        return restore(tokenized_data)