7 changes: 7 additions & 0 deletions docs/development/data-components.md
@@ -32,3 +32,10 @@ The data & data structure components include:

- ChromaVectorStore
- InMemoryVectorStore

### Document serialization

`Document.model_dump()` now accepts a `serialize` flag. When `serialize=True`,
NumPy arrays such as embeddings are converted to Python lists for safe external
serialization. Omitting the flag preserves the raw arrays, which is useful for
internal operations like copying models.
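
A minimal sketch of the difference (assuming `Document` is exported from
`kotaemon.base` and that metadata may carry NumPy values):

```python
import numpy as np
from kotaemon.base import Document  # assumed import path

doc = Document(text="hello", metadata={"vec": np.array([0.1, 0.2])})

safe = doc.model_dump(serialize=True)  # nested arrays become plain lists
assert safe["metadata"]["vec"] == [0.1, 0.2]

raw = doc.model_dump()  # raw NumPy array preserved for internal use
assert isinstance(raw["metadata"]["vec"], np.ndarray)
```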
144 changes: 121 additions & 23 deletions libs/kotaemon/kotaemon/base/schema.py
@@ -2,11 +2,13 @@

from typing import TYPE_CHECKING, Any, Literal, Optional, TypeVar

import numpy as np
from langchain.schema.messages import AIMessage as LCAIMessage
from langchain.schema.messages import HumanMessage as LCHumanMessage
from langchain.schema.messages import SystemMessage as LCSystemMessage
from llama_index.core.bridge.pydantic import Field
from llama_index.core.schema import Document as BaseDocument
from pydantic import ConfigDict

if TYPE_CHECKING:
from haystack.schema import Document as HaystackDocument
@@ -36,6 +38,8 @@ class Document(BaseDocument):
- debug: show in debug panel
"""

model_config = ConfigDict(arbitrary_types_allowed=True)

content: Any = None
source: Optional[str] = None
channel: Optional[Literal["chat", "info", "index", "debug", "plot"]] = None
@@ -45,15 +49,31 @@ def __init__(self, content: Optional[Any] = None, *args, **kwargs):
if kwargs.get("text", None) is not None:
kwargs["content"] = kwargs["text"]
elif kwargs.get("embedding", None) is not None:
kwargs["content"] = kwargs["embedding"]
emb = kwargs["embedding"]
if isinstance(emb, np.ndarray):
emb = emb.tolist()
kwargs["embedding"] = emb
kwargs["content"] = emb
# default text indicating this document only contains embedding
kwargs["text"] = "<EMBEDDING>"
elif isinstance(content, Document):
# TODO: simplify the Document class
temp_ = content.dict()
temp_.update(kwargs)
kwargs = temp_
if hasattr(content, "model_copy"):
obj = content.model_copy(update=kwargs)
else:
obj = content.copy(update=kwargs)
data = obj.__dict__.copy()
emb = content.embedding
if isinstance(emb, np.ndarray):
data["embedding"] = emb.tolist()
super().__init__(*args, **data)
object.__setattr__(self, "embedding", emb)
else:
super().__init__(*args, **data)
return
else:
if isinstance(content, np.ndarray):
content = content.tolist()
kwargs["content"] = content
if content:
kwargs["text"] = str(content)
@@ -83,16 +103,30 @@ def to_haystack_format(self) -> "HaystackDocument":
def __str__(self):
return str(self.content)

def model_dump(self, *, serialize: bool = False, **kwargs):
data = super().model_dump(**kwargs)
if serialize:

class DocumentWithEmbedding(Document):
"""Subclass of Document which must contains embedding
def _convert(obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
if isinstance(obj, dict):
return {k: _convert(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_convert(v) for v in obj]
return obj

Use this if you want to enforce component's IOs to must contain embedding.
"""
data = _convert(data)
return data


class DocumentWithEmbedding(Document):
"""Document subclass that enforces an embedding field."""

def __init__(self, embedding: list[float], *args, **kwargs):
def __init__(self, embedding: list[float] | np.ndarray, *args, **kwargs):
kwargs["embedding"] = embedding
super().__init__(*args, **kwargs)
object.__setattr__(self, "embedding", embedding)
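
For context, a usage sketch of what this constructor enables (inferred from the diff, not from separate docs): the raw array stays on the instance for internal use, while `model_dump(serialize=True)` produces list-typed output for export.

```python
import numpy as np

doc = DocumentWithEmbedding(embedding=np.array([0.1, 0.2, 0.3]))
assert isinstance(doc.embedding, np.ndarray)  # raw array kept on the instance
data = doc.model_dump(serialize=True)         # safe to JSON-encode
```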


class BaseMessage(Document):
@@ -103,19 +137,83 @@ def to_openai_format(self) -> "ChatCompletionMessageParam":
raise NotImplementedError


class SystemMessage(BaseMessage, LCSystemMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
return {"role": "system", "content": self.content}


class AIMessage(BaseMessage, LCAIMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
return {"role": "assistant", "content": self.content}

if TYPE_CHECKING:

class HumanMessage(BaseMessage, LCHumanMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
return {"role": "user", "content": self.content}
class SystemMessage(BaseMessage, LCSystemMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
...

class AIMessage(BaseMessage, LCAIMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
...

class HumanMessage(BaseMessage, LCHumanMessage):
def to_openai_format(self) -> "ChatCompletionMessageParam":
...

else:
try:
SystemMessage = type(
"SystemMessage",
(BaseMessage, LCSystemMessage),
{
"to_openai_format": lambda self: {
"role": "system",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
AIMessage = type(
"AIMessage",
(BaseMessage, LCAIMessage),
{
"to_openai_format": lambda self: {
"role": "assistant",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
HumanMessage = type(
"HumanMessage",
(BaseMessage, LCHumanMessage),
{
"to_openai_format": lambda self: {
"role": "user",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
except TypeError:
SystemMessage = type(
"SystemMessage",
(BaseMessage,),
{
"to_openai_format": lambda self: {
"role": "system",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
AIMessage = type(
"AIMessage",
(BaseMessage,),
{
"to_openai_format": lambda self: {
"role": "assistant",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
HumanMessage = type(
"HumanMessage",
(BaseMessage,),
{
"to_openai_format": lambda self: {
"role": "user",
"content": self.content,
}
},
) # type: ignore[misc, valid-type]
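
The `type(...)` construction defers combining the pydantic-based `BaseMessage` with the langchain message classes to import time, so a metaclass `TypeError` can be caught and the langchain base dropped, while the `TYPE_CHECKING` stubs keep static typing intact. Either path yields the same runtime behavior; a quick sanity sketch:

```python
msg = HumanMessage(content="What is a vector store?")
assert msg.to_openai_format() == {
    "role": "user",
    "content": "What is a vector store?",
}
```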


class RetrievedDocument(Document):
@@ -132,14 +230,14 @@ class RetrievedDocument(Document):
retrieval_metadata: dict = Field(default={})


class LLMInterface(AIMessage):
class LLMInterface(AIMessage): # type: ignore[misc, valid-type]
candidates: list[str] = Field(default_factory=list)
completion_tokens: int = -1
total_tokens: int = -1
prompt_tokens: int = -1
total_cost: float = 0
logits: list[list[float]] = Field(default_factory=list)
messages: list[AIMessage] = Field(default_factory=list)
messages: list[AIMessage] = Field(default_factory=list) # type: ignore[valid-type]
logprobs: list[float] = []


36 changes: 23 additions & 13 deletions libs/kotaemon/kotaemon/loaders/excel_loader.py
@@ -3,6 +3,7 @@
Pandas parser for .xlsx files.

"""

from pathlib import Path
from typing import Any, List, Optional, Union

@@ -82,21 +83,27 @@ def load_data(
sheet = []
if include_sheetname:
sheet.append([key])
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key].fillna("", inplace=True)
sheet.extend(dfs[key].values.astype(str).tolist())
df = dfs[key].dropna(axis=0, how="all")
df = df.dropna(axis=0, how="all")
for col in df.select_dtypes(exclude=["number"]).columns:
df[col].fillna("", inplace=True)
sheet.extend(df.values.tolist())
df_sheets.append(sheet)

text_list = list(
itertools.chain.from_iterable(df_sheets)
) # flatten list of lists

output_text = self._row_joiner.join(
self._col_joiner.join(
"" if pd.isna(cell) else str(cell) for cell in sublist
)
for sublist in text_list
)

output = [
Document(
text=self._row_joiner.join(
self._col_joiner.join(sublist) for sublist in text_list
),
text=output_text,
metadata=extra_info or {},
)
]
@@ -175,14 +182,17 @@ def load_data(
output = []

for idx, key in enumerate(sheet_names):
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key] = dfs[key].dropna(axis=0, how="all")
dfs[key] = dfs[key].astype("object")
dfs[key].fillna("", inplace=True)
df = dfs[key].dropna(axis=0, how="all")
df = df.dropna(axis=0, how="all")
for col in df.select_dtypes(exclude=["number"]).columns:
df[col].fillna("", inplace=True)

rows = dfs[key].values.astype(str).tolist()
rows = df.values.tolist()
content = self._row_joiner.join(
self._col_joiner.join(row).strip() for row in rows
self._col_joiner.join(
"" if pd.isna(cell) else str(cell) for cell in row
).strip()
for row in rows
).strip()
if include_sheetname:
content = f"(Sheet {key} of file {file.name})\n{content}"
21 changes: 14 additions & 7 deletions libs/kotaemon/kotaemon/storages/vectorstores/base.py
@@ -3,6 +3,7 @@
from abc import ABC, abstractmethod
from typing import Any, Optional

import numpy as np
from llama_index.core.schema import NodeRelationship, RelatedNodeInfo
from llama_index.core.vector_stores.types import BasePydanticVectorStore
from llama_index.core.vector_stores.types import VectorStore as LIVectorStore
@@ -19,14 +20,14 @@ def __init__(self, *args, **kwargs):
@abstractmethod
def add(
self,
embeddings: list[list[float]] | list[DocumentWithEmbedding],
embeddings: list[list[float]] | np.ndarray | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
) -> list[str]:
"""Add vector embeddings to vector stores

Args:
embeddings: List of embeddings
embeddings: List of embeddings or numpy array
metadatas: List of metadata of the embeddings
ids: List of ids of the embeddings
kwargs: meant for vectorstore-specific parameters
@@ -49,15 +50,15 @@ def delete(self, ids: list[str], **kwargs):
@abstractmethod
def query(
self,
embedding: list[float],
embedding: list[float] | np.ndarray,
top_k: int = 1,
ids: Optional[list[str]] = None,
**kwargs,
) -> tuple[list[list[float]], list[float], list[str]]:
"""Return the top k most similar vector embeddings

Args:
embedding: List of embeddings
embedding: List of embeddings or numpy array
top_k: Number of most similar embeddings to return
ids: List of ids of the embeddings to be queried

@@ -112,10 +113,13 @@ def __getattr__(self, name: str) -> Any:

def add(
self,
embeddings: list[list[float]] | list[DocumentWithEmbedding],
embeddings: list[list[float]] | np.ndarray | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
):
if isinstance(embeddings, np.ndarray):
embeddings = embeddings.tolist()

if isinstance(embeddings[0], list):
nodes: list[DocumentWithEmbedding] = [
DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
@@ -140,15 +144,15 @@ def delete(self, ids: list[str], **kwargs):

def query(
self,
embedding: list[float],
embedding: list[float] | np.ndarray,
top_k: int = 1,
ids: Optional[list[str]] = None,
**kwargs,
) -> tuple[list[list[float]], list[float], list[str]]:
"""Return the top k most similar vector embeddings

Args:
embedding: List of embeddings
embedding: List of embeddings or numpy array
top_k: Number of most similar embeddings to return
ids: List of ids of the embeddings to be queried
kwargs: extra query parameters. Depending on the name, these parameters
Expand All @@ -166,6 +170,9 @@ def query(
else:
vs_kwargs[kwkey] = kwvalue

if isinstance(embedding, np.ndarray):
embedding = embedding.tolist()

output = self._client.query(
query=VectorStoreQuery(
query_embedding=embedding,
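
Taken together, the vector-store changes let callers pass NumPy arrays directly; inputs are normalized to lists before reaching the wrapped client. A hedged usage sketch (`store` stands for any concrete subclass instance; names are illustrative):

```python
import numpy as np

embs = np.random.rand(4, 8)          # four 8-dimensional vectors
ids = store.add(embeddings=embs)     # ndarray accepted, converted internally
vecs, scores, hit_ids = store.query(embedding=embs[0], top_k=2)
```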