7 changes: 7 additions & 0 deletions docs/development/data-components.md
@@ -32,3 +32,10 @@ The data & data structure components include:

- ChromaVectorStore
- InMemoryVectorStore

### Document serialization

`Document.model_dump()` now accepts a `serialize` flag. When `serialize=True`,
NumPy arrays such as embeddings are converted to Python lists for safe external
serialization. Omitting the flag preserves the raw arrays, which is useful for
internal operations like copying models.
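
A minimal sketch of the difference (assuming `Document` is exported from
`kotaemon.base` and that metadata may carry NumPy values):

```python
import numpy as np
from kotaemon.base import Document  # assumed import path

doc = Document(text="hello", metadata={"vec": np.array([0.1, 0.2])})

safe = doc.model_dump(serialize=True)  # nested arrays become plain lists
assert safe["metadata"]["vec"] == [0.1, 0.2]

raw = doc.model_dump()  # raw NumPy array preserved for internal use
assert isinstance(raw["metadata"]["vec"], np.ndarray)
```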
144 changes: 121 additions & 23 deletions libs/kotaemon/kotaemon/base/schema.py
@@ -2,11 +2,13 @@

from typing import TYPE_CHECKING, Any, Literal, Optional, TypeVar

import numpy as np
from langchain.schema.messages import AIMessage as LCAIMessage
from langchain.schema.messages import HumanMessage as LCHumanMessage
from langchain.schema.messages import SystemMessage as LCSystemMessage
from llama_index.core.bridge.pydantic import Field
from llama_index.core.schema import Document as BaseDocument
from pydantic import ConfigDict

if TYPE_CHECKING:
from haystack.schema import Document as HaystackDocument
@@ -36,6 +38,8 @@ class Document(BaseDocument):
- debug: show in debug panel
"""

model_config = ConfigDict(arbitrary_types_allowed=True)

content: Any = None
source: Optional[str] = None
channel: Optional[Literal["chat", "info", "index", "debug", "plot"]] = None
@@ -45,15 +49,31 @@ def __init__(self, content: Optional[Any] = None, *args, **kwargs):
if kwargs.get("text", None) is not None:
kwargs["content"] = kwargs["text"]
elif kwargs.get("embedding", None) is not None:
kwargs["content"] = kwargs["embedding"]
emb = kwargs["embedding"]
if isinstance(emb, np.ndarray):
emb = emb.tolist()
kwargs["embedding"] = emb
kwargs["content"] = emb
# default text indicating this document only contains embedding
kwargs["text"] = "<EMBEDDING>"
elif isinstance(content, Document):
# TODO: simplify the Document class
temp_ = content.dict()
temp_.update(kwargs)
kwargs = temp_
if hasattr(content, "model_copy"):
obj = content.model_copy(update=kwargs)
else:
obj = content.copy(update=kwargs)
data = obj.__dict__.copy()
emb = content.embedding
if isinstance(emb, np.ndarray):
data["embedding"] = emb.tolist()
super().__init__(*args, **data)
object.__setattr__(self, "embedding", emb)
else:
super().__init__(*args, **data)
return
else:
if isinstance(content, np.ndarray):
content = content.tolist()
kwargs["content"] = content
if content:
kwargs["text"] = str(content)
@@ -83,16 +103,30 @@ def to_haystack_format(self) -> "HaystackDocument":
def __str__(self):
return str(self.content)

def model_dump(self, *, serialize: bool = False, **kwargs):
data = super().model_dump(**kwargs)
if serialize:

class DocumentWithEmbedding(Document):
"""Subclass of Document which must contains embedding
def _convert(obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
if isinstance(obj, dict):
return {k: _convert(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_convert(v) for v in obj]
return obj

Use this if you want to enforce component's IOs to must contain embedding.
"""
data = _convert(data)
return data


class DocumentWithEmbedding(Document):
"""Document subclass that enforces an embedding field."""

def __init__(self, embedding: list[float], *args, **kwargs):
def __init__(self, embedding: list[float] | np.ndarray, *args, **kwargs):
kwargs["embedding"] = embedding
super().__init__(*args, **kwargs)
object.__setattr__(self, "embedding", embedding)
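
For context, a usage sketch of what this constructor enables (inferred from the diff, not from separate docs): the raw array stays on the instance for internal use, while `model_dump(serialize=True)` produces list-typed output for export.

```python
import numpy as np

doc = DocumentWithEmbedding(embedding=np.array([0.1, 0.2, 0.3]))
assert isinstance(doc.embedding, np.ndarray)  # raw array kept on the instance
data = doc.model_dump(serialize=True)         # safe to JSON-encode
```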


class BaseMessage(Document):
@@ -103,19 +137,83 @@ def to_openai_format(self) -> "ChatCompletionMessageParam":
raise NotImplementedError


class SystemMessage(BaseMessage, LCSystemMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
return {"role": "system", "content": self.content}


class AIMessage(BaseMessage, LCAIMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
return {"role": "assistant", "content": self.content}

if TYPE_CHECKING:

class HumanMessage(BaseMessage, LCHumanMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
return {"role": "user", "content": self.content}
class SystemMessage(BaseMessage, LCSystemMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
...

class AIMessage(BaseMessage, LCAIMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
...

class HumanMessage(BaseMessage, LCHumanMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
...

else:
try:
SystemMessage = type(
"SystemMessage",
(BaseMessage, LCSystemMessage),
{
"to_openai_format": lambda self: {
"role": "system",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
AIMessage = type(
"AIMessage",
(BaseMessage, LCAIMessage),
{
"to_openai_format": lambda self: {
"role": "assistant",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
HumanMessage = type(
"HumanMessage",
(BaseMessage, LCHumanMessage),
{
"to_openai_format": lambda self: {
"role": "user",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
except TypeError:
SystemMessage = type(
"SystemMessage",
(BaseMessage,),
{
"to_openai_format": lambda self: {
"role": "system",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
AIMessage = type(
"AIMessage",
(BaseMessage,),
{
"to_openai_format": lambda self: {
"role": "assistant",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
HumanMessage = type(
"HumanMessage",
(BaseMessage,),
{
"to_openai_format": lambda self: {
"role": "user",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
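
The `type(...)` construction defers combining the pydantic-based `BaseMessage` with the langchain message classes to import time, so a metaclass `TypeError` can be caught and the langchain base dropped, while the `TYPE_CHECKING` stubs keep static typing intact. Either path yields the same runtime behavior; a quick sanity sketch:

```python
msg = HumanMessage(content="What is a vector store?")
assert msg.to_openai_format() == {
    "role": "user",
    "content": "What is a vector store?",
}
```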


class RetrievedDocument(Document):
@@ -132,14 +230,14 @@ class RetrievedDocument(Document):
retrieval_metadata: dict = Field(default={})


class LLMInterface(AIMessage):
class LLMInterface(AIMessage): # type: ignore[misc, valid-type]
candidates: list[str] = Field(default_factory=list)
completion_tokens: int = -1
total_tokens: int = -1
prompt_tokens: int = -1
total_cost: float = 0
logits: list[list[float]] = Field(default_factory=list)
messages: list[AIMessage] = Field(default_factory=list)
messages: list[AIMessage] = Field(default_factory=list) # type: ignore[valid-type]
logprobs: list[float] = []


36 changes: 23 additions & 13 deletions libs/kotaemon/kotaemon/loaders/excel_loader.py
@@ -3,6 +3,7 @@
Pandas parser for .xlsx files.

"""

from pathlib import Path
from typing import Any, List, Optional, Union

@@ -82,21 +83,27 @@ def load_data(
sheet = []
if include_sheetname:
sheet.append([key])
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key].fillna("", inplace=True)
sheet.extend(dfs[key].values.astype(str).tolist())
df = dfs[key].dropna(axis=0, how="all")
df = df.dropna(axis=0, how="all")
for col in df.select_dtypes(exclude=["number"]).columns:
df[col].fillna("", inplace=True)
sheet.extend(df.values.tolist())
df_sheets.append(sheet)

text_list = list(
itertools.chain.from_iterable(df_sheets)
) # flatten list of lists

output_text = self._row_joiner.join(
self._col_joiner.join(
"" if pd.isna(cell) else str(cell) for cell in sublist
)
for sublist in text_list
)

output = [
Document(
text=self._row_joiner.join(
self._col_joiner.join(sublist) for sublist in text_list
),
text=output_text,
metadata=extra_info or {},
)
]
@@ -175,14 +182,17 @@ def load_data(
output = []

for idx, key in enumerate(sheet_names):
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key] = dfs[key].astype("object")
dfs[key].fillna("", inplace=True)
df = dfs[key].dropna(axis=0, how="all")
df = df.dropna(axis=0, how="all")
for col in df.select_dtypes(exclude=["number"]).columns:
df[col].fillna("", inplace=True)

rows = dfs[key].values.astype(str).tolist()
rows = df.values.tolist()
content = self._row_joiner.join(
self._col_joiner.join(row).strip() for row in rows
self._col_joiner.join(
"" if pd.isna(cell) else str(cell) for cell in row
).strip()
for row in rows
).strip()
if include_sheetname:
content = f"(Sheet {key} of file {file.name})\n{content}"
21 changes: 14 additions & 7 deletions libs/kotaemon/kotaemon/storages/vectorstores/base.py
@@ -3,6 +3,7 @@
from abc import ABC, abstractmethod
from typing import Any, Optional

import numpy as np
from llama_index.core.schema import NodeRelationship, RelatedNodeInfo
from llama_index.core.vector_stores.types import BasePydanticVectorStore
from llama_index.core.vector_stores.types import VectorStore as LIVectorStore
@@ -19,14 +20,14 @@ def __init__(self, *args, **kwargs):
@abstractmethod
def add(
self,
embeddings: list[list[float]] | list[DocumentWithEmbedding],
embeddings: list[list[float]] | np.ndarray | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
) -> list[str]:
"""Add vector embeddings to vector stores

Args:
embeddings: List of embeddings
embeddings: List of embeddings or numpy array
metadatas: List of metadata of the embeddings
ids: List of ids of the embeddings
kwargs: meant for vectorstore-specific parameters
@@ -49,15 +50,15 @@ def delete(self, ids: list[str], **kwargs):
@abstractmethod
def query(
self,
embedding: list[float],
embedding: list[float] | np.ndarray,
top_k: int = 1,
ids: Optional[list[str]] = None,
**kwargs,
) -> tuple[list[list[float]], list[float], list[str]]:
"""Return the top k most similar vector embeddings

Args:
embedding: List of embeddings
embedding: List of embeddings or numpy array
top_k: Number of most similar embeddings to return
ids: List of ids of the embeddings to be queried

@@ -112,10 +113,13 @@ def __getattr__(self, name: str) -> Any:

def add(
self,
embeddings: list[list[float]] | list[DocumentWithEmbedding],
embeddings: list[list[float]] | np.ndarray | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
):
if isinstance(embeddings, np.ndarray):
embeddings = embeddings.tolist()

if isinstance(embeddings[0], list):
nodes: list[DocumentWithEmbedding] = [
DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
@@ -140,15 +144,15 @@ def delete(self, ids: list[str], **kwargs):

def query(
self,
embedding: list[float],
embedding: list[float] | np.ndarray,
top_k: int = 1,
ids: Optional[list[str]] = None,
**kwargs,
) -> tuple[list[list[float]], list[float], list[str]]:
"""Return the top k most similar vector embeddings

Args:
embedding: List of embeddings
embedding: List of embeddings or numpy array
top_k: Number of most similar embeddings to return
ids: List of ids of the embeddings to be queried
kwargs: extra query parameters. Depending on the name, these parameters
Expand All @@ -166,6 +170,9 @@ def query(
else:
vs_kwargs[kwkey] = kwvalue

if isinstance(embedding, np.ndarray):
embedding = embedding.tolist()

output = self._client.query(
query=VectorStoreQuery(
query_embedding=embedding,
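
Taken together, the vector-store changes let callers pass NumPy arrays directly; inputs are normalized to lists before reaching the wrapped client. A hedged usage sketch (`store` stands for any concrete subclass instance; names are illustrative):

```python
import numpy as np

embs = np.random.rand(4, 8)          # four 8-dimensional vectors
ids = store.add(embeddings=embs)     # ndarray accepted, converted internally
vecs, scores, hit_ids = store.query(embedding=embs[0], top_k=2)
```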