Skip to content

Commit b2f35b6

Browse files
authored
♻️ Refactor history sanitizing (#12)
1 parent 529bcf5 commit b2f35b6

File tree

4 files changed

+50
-42
lines changed

4 files changed

+50
-42
lines changed

app/server/chat.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,14 +108,14 @@ async def create_chat_completion(
108108
logger.exception(f"Error generating content from Gemini API: {e}")
109109
raise
110110

111-
# Format and clean the output
111+
# Format the response from API
112112
model_output = GeminiClientWrapper.extract_output(response, include_thoughts=True)
113113
stored_output = GeminiClientWrapper.extract_output(response, include_thoughts=False)
114114

115-
# After cleaning, persist the conversation
115+
# After formatting, persist the conversation to LMDB
116116
try:
117117
last_message = Message(role="assistant", content=stored_output)
118-
cleaned_history = db.clean_assistant_messages(request.messages)
118+
cleaned_history = db.sanitize_assistant_messages(request.messages)
119119
conv = ConversationInStore(
120120
model=model.model_name,
121121
client_id=client.id,

app/services/client.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ def __init__(self, client_id: str, **kwargs):
1616
self.id = client_id
1717

1818
async def init(self, **kwargs):
19-
# Inject default configuration values
19+
"""
20+
Inject default configuration values.
21+
"""
2022
kwargs.setdefault("timeout", g_config.gemini.timeout)
2123
kwargs.setdefault("auto_refresh", g_config.gemini.auto_refresh)
2224
kwargs.setdefault("verbose", g_config.gemini.verbose)
@@ -67,7 +69,9 @@ async def process_message(
6769
return model_input, files
6870

6971
@staticmethod
70-
async def process_conversation(messages: list[Message], tempdir: Path | None = None):
72+
async def process_conversation(
73+
messages: list[Message], tempdir: Path | None = None
74+
) -> tuple[str, list[Path | str]]:
7175
"""
7276
Process the entire conversation and return a formatted string and list of
7377
files. The last message is assumed to be the assistant's response.
@@ -86,7 +90,7 @@ async def process_conversation(messages: list[Message], tempdir: Path | None = N
8690
return "\n".join(conversation), files
8791

8892
@staticmethod
89-
def extract_output(response: ModelOutput, include_thoughts: bool = True):
93+
def extract_output(response: ModelOutput, include_thoughts: bool = True) -> str:
9094
"""
9195
Extract and format the output text from the Gemini response.
9296
"""

app/services/lmdb.py

Lines changed: 38 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import hashlib
2+
import re
23
from contextlib import contextmanager
34
from datetime import datetime
45
from pathlib import Path
@@ -11,31 +12,24 @@
1112
from ..models import ConversationInStore, Message
1213
from ..utils import g_config
1314
from ..utils.singleton import Singleton
14-
import re
1515

16-
def _normalize_content(content: str) -> str:
17-
"""Remove <think>...</think> tags and strip whitespace from content."""
18-
# Remove think tags
19-
cleaned_content = re.sub(r"<think>.*?</think>\n?", "", content, flags=re.DOTALL)
20-
# Strip leading/trailing whitespace
21-
return cleaned_content.strip()
2216

23-
def hash_message(message: Message) -> str:
17+
def _hash_message(message: Message) -> str:
2418
"""Generate a hash for a single message."""
2519
# Convert message to dict and sort keys for consistent hashing
2620
message_dict = message.model_dump(mode="json")
2721
message_bytes = orjson.dumps(message_dict, option=orjson.OPT_SORT_KEYS)
2822
return hashlib.sha256(message_bytes).hexdigest()
2923

3024

31-
def hash_conversation(client_id: str, model: str, messages: List[Message]) -> str:
25+
def _hash_conversation(client_id: str, model: str, messages: List[Message]) -> str:
3226
"""Generate a hash for a list of messages and client id."""
3327
# Create a combined hash from all individual message hashes
3428
combined_hash = hashlib.sha256()
3529
combined_hash.update(client_id.encode("utf-8"))
3630
combined_hash.update(model.encode("utf-8"))
3731
for message in messages:
38-
message_hash = hash_message(message)
32+
message_hash = _hash_message(message)
3933
combined_hash.update(message_hash.encode("utf-8"))
4034
return combined_hash.hexdigest()
4135

@@ -123,7 +117,7 @@ def store(
123117
raise ValueError("Messages list cannot be empty")
124118

125119
# Generate hash for the message list
126-
message_hash = hash_conversation(conv.client_id, conv.model, conv.messages)
120+
message_hash = _hash_conversation(conv.client_id, conv.model, conv.messages)
127121
storage_key = custom_key or message_hash
128122

129123
# Prepare data for storage
@@ -178,23 +172,6 @@ def get(self, key: str) -> Optional[ConversationInStore]:
178172
logger.error(f"Failed to retrieve messages for key {key}: {e}")
179173
return None
180174

181-
def clean_assistant_messages(self, messages: List[Message]) -> List[Message]:
182-
"""Create a new list of messages with assistant content cleaned."""
183-
cleaned_messages = []
184-
for msg in messages:
185-
if msg.role == "assistant" and isinstance(msg.content, str):
186-
# Create a new Message object with cleaned content
187-
normalized_content = _normalize_content(msg.content)
188-
# Only create a new object if content actually changed
189-
if normalized_content != msg.content:
190-
cleaned_msg = Message(role=msg.role, content=normalized_content, name=msg.name)
191-
cleaned_messages.append(cleaned_msg)
192-
else:
193-
cleaned_messages.append(msg)
194-
else:
195-
cleaned_messages.append(msg)
196-
return cleaned_messages
197-
198175
def find(self, model: str, messages: List[Message]) -> Optional[ConversationInStore]:
199176
"""
200177
Search conversation data by message list.
@@ -215,7 +192,7 @@ def find(self, model: str, messages: List[Message]) -> Optional[ConversationInSt
215192
return conv
216193

217194
# --- Find with cleaned messages ---
218-
cleaned_messages = self.clean_assistant_messages(messages)
195+
cleaned_messages = self.sanitize_assistant_messages(messages)
219196
if conv := self._find_by_message_list(model, cleaned_messages):
220197
logger.debug("Found conversation with cleaned message history.")
221198
return conv
@@ -228,14 +205,12 @@ def _find_by_message_list(
228205
) -> Optional[ConversationInStore]:
229206
"""Internal find implementation based on a message list."""
230207
for c in g_config.gemini.clients:
231-
message_hash = hash_conversation(c.id, model, messages)
208+
message_hash = _hash_conversation(c.id, model, messages)
232209

233210
key = f"{self.HASH_LOOKUP_PREFIX}{message_hash}"
234211
try:
235212
with self._get_transaction(write=False) as txn:
236-
mapped = txn.get(key.encode("utf-8"))
237-
if mapped:
238-
logger.debug(f"Found mapped key '{mapped.decode('utf-8')}' for hash '{message_hash}'.")
213+
if mapped := txn.get(key.encode("utf-8")): # type: ignore
239214
return self.get(mapped.decode("utf-8")) # type: ignore
240215
except Exception as e:
241216
logger.error(
@@ -283,7 +258,7 @@ def delete(self, key: str) -> Optional[ConversationInStore]:
283258

284259
storage_data = orjson.loads(data) # type: ignore
285260
conv = ConversationInStore.model_validate(storage_data)
286-
message_hash = hash_conversation(conv.client_id, conv.model, conv.messages)
261+
message_hash = _hash_conversation(conv.client_id, conv.model, conv.messages)
287262

288263
# Delete main data
289264
txn.delete(key.encode("utf-8"))
@@ -362,3 +337,32 @@ def close(self) -> None:
362337
def __del__(self):
363338
"""Cleanup on destruction."""
364339
self.close()
340+
341+
@staticmethod
342+
def remove_think_tags(text: str) -> str:
343+
"""
344+
Remove <think>...</think> tags at the start of text and strip whitespace.
345+
"""
346+
cleaned_content = re.sub(r"^(\s*<think>.*?</think>\n?)", "", text, flags=re.DOTALL)
347+
return cleaned_content.strip()
348+
349+
@staticmethod
350+
def sanitize_assistant_messages(messages: list[Message]) -> list[Message]:
351+
"""
352+
Create a new list of messages with assistant content cleaned of <think> tags.
353+
This is useful for storing the chat history.
354+
"""
355+
cleaned_messages = []
356+
for msg in messages:
357+
if msg.role == "assistant" and isinstance(msg.content, str):
358+
normalized_content = LMDBConversationStore.remove_think_tags(msg.content)
359+
# Only create a new object if content actually changed
360+
if normalized_content != msg.content:
361+
cleaned_msg = Message(role=msg.role, content=normalized_content, name=msg.name)
362+
cleaned_messages.append(cleaned_msg)
363+
else:
364+
cleaned_messages.append(msg)
365+
else:
366+
cleaned_messages.append(msg)
367+
368+
return cleaned_messages

app/utils/helper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ def add_tag(role: str, content: str, unclose: bool = False) -> str:
1616

1717

1818
def estimate_tokens(text: str) -> int:
19-
# TODO: Refactor this function to use a proper tokenizer
20-
return len(text.split())
19+
"""Estimate the number of tokens heuristically based on character count"""
20+
return int(len(text) / 3)
2121

2222

2323
async def save_file_to_tempfile(

0 commit comments

Comments
 (0)