diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml new file mode 100644 index 000000000..3ad174752 --- /dev/null +++ b/.github/workflows/stale.yaml @@ -0,0 +1,25 @@ +name: Close stale issues +on: + schedule: + - cron: "0 0 * * *" # Runs at 00:00 UTC every day + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v9 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + stale-issue-message: | + Hi folks 😊, it looks like this issue has been inactive for a while. Is there any update or further action + needed? If not, we might consider closing it soon to keep our board clean and focused. + + But don't worry, you can reopen it anytime when needed. Thank you for your contributions to Kotaemon🪴 + + days-before-issue-stale: 30 + days-before-issue-close: 3 + days-before-pr-stale: 90 + days-before-pr-close: -1 + exempt-issue-labels: "documentation,tutorial,TODO" + operations-per-run: 300 # The maximum number of operations per run, used to control rate limiting. diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 000000000..d23cd4071 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,9 @@ +services: + kotaemon: + volumes: + - "./ktem_app_data:/app/ktem_app_data" + - "./libs/kotaemon:/app/kotaemon" + - "./libs/ktem:/app/ktem" + - "./flowsettings.py:/app/flowsettings.py" + ports: + - "7860:7860" diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..ca873dc22 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,30 @@ +services: + kotaemon: + build: + context: . 
+ target: lite + dockerfile: Dockerfile + env_file: .env + environment: + - GRADIO_SERVER_NAME=0.0.0.0 + - GRADIO_SERVER_PORT=7860 + ports: + - "7860:7860" + networks: + - backend + # gocr: + # image: ghcr.io/phv2312/got-ocr2.0:main + # ports: + # - "8881:8881" + # deploy: + # resources: + # reservations: + # devices: + # - driver: nvidia + # count: 1 + # capabilities: [gpu] + # networks: + # - backend +networks: + backend: + driver: bridge diff --git a/integration/got-ocr2.md b/integration/got-ocr2.md new file mode 100644 index 000000000..999cca714 --- /dev/null +++ b/integration/got-ocr2.md @@ -0,0 +1,30 @@ +## Extension Manager and GOT-OCR2.0 Loader + +## Key Features + +### 1. **GOCR2 as Image Reader** + +- **GOCR2ImageReader** is a new class designed to read images using the [**GOT-OCR2.0** OCR engine](https://github.com/Ucas-HaoranWei/GOT-OCR2.0). +- This reader is initialized with an endpoint that defaults to `http://localhost:8881/ai/infer/` for the OCR service, but can be configured through an environment variable `GOCR2_ENDPOINT` or passed explicitly. +- It uses exponential backoff retry mechanisms to ensure robustness during API calls. +- Supports loading image files and extracting their text content, returning structured document data. + +#### Setup + +- We provide the docker image, with fastapi for serving the GOT-OCR2.0. Pull and run the image with: + +```bash +docker run -d --gpus all -p 8881:8881 ghcr.io/phv2312/got-ocr2.0:main +``` + +- The detailed implementation is located at [ocr_loader.py](/libs/kotaemon/kotaemon/loaders/ocr_loader.py) + +### 2. **Extension Manager** + +- ExtensionManager allows users to dynamically manage multiple loaders for different file types. + +- Users can switch between multiple loaders for the same file extension, such as using the GOCR2ImageReader or a + different unstructured data parser for .png files. This provides the flexibility to choose the best-suited loader for the task at hand. 
+ +- To change the default loader, go to **Settings**, then **Loader settings**. It displays a grid of extensions and + its supported loaders. Any modification will be saved to DB as other settings do. diff --git a/libs/kotaemon/kotaemon/indices/ingests/__init__.py b/libs/kotaemon/kotaemon/indices/ingests/__init__.py deleted file mode 100644 index 064f20623..000000000 --- a/libs/kotaemon/kotaemon/indices/ingests/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .files import DocumentIngestor - -__all__ = ["DocumentIngestor"] diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py deleted file mode 100644 index 18db7ca86..000000000 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ /dev/null @@ -1,137 +0,0 @@ -from pathlib import Path -from typing import Type - -from decouple import config -from llama_index.core.readers.base import BaseReader -from llama_index.readers.file import PDFReader -from theflow.settings import settings as flowsettings - -from kotaemon.base import BaseComponent, Document, Param -from kotaemon.indices.extractors import BaseDocParser -from kotaemon.indices.splitters import BaseSplitter, TokenSplitter -from kotaemon.loaders import ( - AdobeReader, - AzureAIDocumentIntelligenceLoader, - DirectoryReader, - DoclingReader, - HtmlReader, - MathpixPDFReader, - MhtmlReader, - OCRReader, - PandasExcelReader, - PDFThumbnailReader, - TxtReader, - UnstructuredReader, - WebReader, -) - -web_reader = WebReader() -unstructured = UnstructuredReader() -adobe_reader = AdobeReader() -azure_reader = AzureAIDocumentIntelligenceLoader( - endpoint=str(config("AZURE_DI_ENDPOINT", default="")), - credential=str(config("AZURE_DI_CREDENTIAL", default="")), - cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None), -) -docling_reader = DoclingReader() -adobe_reader.vlm_endpoint = ( - azure_reader.vlm_endpoint -) = docling_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "") - - 
-KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = { - ".xlsx": PandasExcelReader(), - ".docx": unstructured, - ".pptx": unstructured, - ".xls": unstructured, - ".doc": unstructured, - ".html": HtmlReader(), - ".mhtml": MhtmlReader(), - ".png": unstructured, - ".jpeg": unstructured, - ".jpg": unstructured, - ".tiff": unstructured, - ".tif": unstructured, - ".pdf": PDFThumbnailReader(), - ".txt": TxtReader(), - ".md": TxtReader(), -} - - -class DocumentIngestor(BaseComponent): - """Ingest common office document types into Document for indexing - - Document types: - - pdf - - xlsx, xls - - docx, doc - - Args: - pdf_mode: mode for pdf extraction, one of "normal", "mathpix", "ocr" - - normal: parse pdf text - - mathpix: parse pdf text using mathpix - - ocr: parse pdf image using flax - doc_parsers: list of document parsers to parse the document - text_splitter: splitter to split the document into text nodes - override_file_extractors: override file extractors for specific file extensions - The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS` - """ - - pdf_mode: str = "normal" # "normal", "mathpix", "ocr", "multimodal" - doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: []) - text_splitter: BaseSplitter = TokenSplitter.withx( - chunk_size=1024, - chunk_overlap=256, - separator="\n\n", - backup_separators=["\n", ".", " ", "\u200B"], - ) - override_file_extractors: dict[str, Type[BaseReader]] = {} - - def _get_reader(self, input_files: list[str | Path]): - """Get appropriate readers for the input files based on file extension""" - file_extractors: dict[str, BaseReader] = { - ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items() - } - for ext, cls in self.override_file_extractors.items(): - file_extractors[ext] = cls() - - if self.pdf_mode == "normal": - file_extractors[".pdf"] = PDFReader() - elif self.pdf_mode == "ocr": - file_extractors[".pdf"] = OCRReader() - elif self.pdf_mode == "multimodal": - file_extractors[".pdf"] 
= AdobeReader() - else: - file_extractors[".pdf"] = MathpixPDFReader() - - main_reader = DirectoryReader( - input_files=input_files, - file_extractor=file_extractors, - ) - - return main_reader - - def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]: - """Ingest the file paths into Document - - Args: - file_paths: list of file paths or a single file path - - Returns: - list of parsed Documents - """ - if not isinstance(file_paths, list): - file_paths = [file_paths] - - documents = self._get_reader(input_files=file_paths)() - print(f"Read {len(file_paths)} files into {len(documents)} documents.") - nodes = self.text_splitter(documents) - print(f"Transform {len(documents)} documents into {len(nodes)} nodes.") - self.log_progress(".num_docs", num_docs=len(nodes)) - - # document parsers call - if self.doc_parsers: - for parser in self.doc_parsers: - nodes = parser(nodes) - - return nodes diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py index f498da806..05edbf663 100644 --- a/libs/kotaemon/kotaemon/loaders/__init__.py +++ b/libs/kotaemon/kotaemon/loaders/__init__.py @@ -7,7 +7,7 @@ from .excel_loader import ExcelReader, PandasExcelReader from .html_loader import HtmlReader, MhtmlReader from .mathpix_loader import MathpixPDFReader -from .ocr_loader import ImageReader, OCRReader +from .ocr_loader import GOCR2ImageReader, ImageReader, OCRReader from .pdf_loader import PDFThumbnailReader from .txt_loader import TxtReader from .unstructured_loader import UnstructuredReader @@ -32,4 +32,5 @@ "PDFThumbnailReader", "WebReader", "DoclingReader", + "GOCR2ImageReader", ] diff --git a/libs/kotaemon/kotaemon/loaders/ocr_loader.py b/libs/kotaemon/kotaemon/loaders/ocr_loader.py index 4e009deef..da10b8500 100644 --- a/libs/kotaemon/kotaemon/loaders/ocr_loader.py +++ b/libs/kotaemon/kotaemon/loaders/ocr_loader.py @@ -191,3 +191,74 @@ def load_data( ) return result + + +class GOCR2ImageReader(BaseReader): + 
default_endpoint = "http://localhost:8881/ai/infer/" + """Read Image using GOCR-2.0 + + Args: + endpoint: URL to GOCR endpoint. If not provided, will look for + environment variable `GOCR2_ENDPOINT` or use the default + (http://localhost:8881/ai/infer/) + """ + + def __init__(self, endpoint: Optional[str] = None): + """Init the OCR reader with OCR endpoint (FullOCR pipeline)""" + super().__init__() + self.endpoint = endpoint or os.getenv("GOCR2_ENDPOINT", self.default_endpoint) + + def load_data( + self, file_path: Path, extra_info: dict | None = None, **kwargs + ) -> List[Document]: + """Load data using OCR reader + + Args: + file_path (Path): Path to PDF file + extra_info (Path): Extra information while inference + + Returns: + List[Document]: list of documents extracted from the PDF file + """ + + @retry( + stop=stop_after_attempt(6), + wait=wait_exponential(multiplier=20, exp_base=2, min=1, max=1000), + after=after_log(logger, logging.WARNING), + ) + def _tenacious_api_post( + url: str, file_path: Path, ocr_type: str = "ocr", **kwargs + ): + with file_path.open("rb") as content: + files = {"file": content} + data = {"ocr_type": ocr_type} + resp = requests.post(url=url, files=files, data=data, **kwargs) + resp.raise_for_status() + return resp + + file_path = Path(file_path).resolve() + + # call the API from GOCR endpoint + if "response_content" in kwargs: + # overriding response content if specified + ocr_results = kwargs["response_content"] + else: + # call original API + resp = _tenacious_api_post(url=self.endpoint, file_path=file_path) + ocr_results = [resp.json()["result"]] + + extra_info = extra_info or {} + result = [] + for ocr_result in ocr_results: + metadata = {"file_name": file_path.name, "page_label": 1} + if extra_info is not None: + metadata.update(extra_info) + + result.append( + Document( + content=ocr_result, + metadata=metadata, + ) + ) + + return result diff --git a/libs/kotaemon/tests/test_ingestor.py b/libs/kotaemon/tests/test_ingestor.py 
deleted file mode 100644 index 33fa5a235..000000000 --- a/libs/kotaemon/tests/test_ingestor.py +++ /dev/null @@ -1,15 +0,0 @@ -from pathlib import Path - -from kotaemon.indices.ingests import DocumentIngestor -from kotaemon.indices.splitters import TokenSplitter - - -def test_ingestor_include_src(): - dirpath = Path(__file__).parent - ingestor = DocumentIngestor( - pdf_mode="normal", - text_splitter=TokenSplitter(chunk_size=200, chunk_overlap=10), - ) - nodes = ingestor(dirpath / "resources" / "table.pdf") - assert type(nodes) is list - assert nodes[0].relationships diff --git a/libs/ktem/ktem/app.py b/libs/ktem/ktem/app.py index 7142377e1..4734372c9 100644 --- a/libs/ktem/ktem/app.py +++ b/libs/ktem/ktem/app.py @@ -8,6 +8,7 @@ from ktem.components import reasonings from ktem.exceptions import HookAlreadyDeclared, HookNotDeclared from ktem.index import IndexManager +from ktem.loaders.extensions import extension_manager from ktem.settings import BaseSettingGroup, SettingGroup, SettingReasoningGroup from theflow.settings import settings from theflow.utils.modules import import_dotted_string @@ -63,6 +64,9 @@ def __init__(self): self.default_settings = SettingGroup( application=BaseSettingGroup(settings=settings.SETTINGS_APP), reasoning=SettingReasoningGroup(settings=settings.SETTINGS_REASONING), + extension=BaseSettingGroup( + settings=extension_manager.generate_gradio_settings() + ), ) self._callbacks: dict[str, list] = {} diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py index 4d53e6538..651fabdd8 100644 --- a/libs/ktem/ktem/index/file/pipelines.py +++ b/libs/ktem/ktem/index/file/pipelines.py @@ -7,7 +7,6 @@ import time import warnings from collections import defaultdict -from copy import deepcopy from functools import lru_cache from hashlib import sha256 from pathlib import Path @@ -17,6 +16,7 @@ from ktem.db.models import engine from ktem.embeddings.manager import embedding_models_manager from ktem.llms.manager import 
llms +from ktem.loaders.extensions import extension_manager from ktem.rerankings.manager import reranking_models_manager from llama_index.core.readers.base import BaseReader from llama_index.core.readers.file.base import default_file_metadata_func @@ -35,14 +35,6 @@ from kotaemon.base import BaseComponent, Document, Node, Param, RetrievedDocument from kotaemon.embeddings import BaseEmbeddings from kotaemon.indices import VectorIndexing, VectorRetrieval -from kotaemon.indices.ingests.files import ( - KH_DEFAULT_FILE_EXTRACTORS, - adobe_reader, - azure_reader, - docling_reader, - unstructured, - web_reader, -) from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring from kotaemon.indices.splitters import BaseSplitter, TokenSplitter @@ -665,45 +657,17 @@ class IndexDocumentPipeline(BaseFileIndexIndexing): decide which pipeline should be used. """ - reader_mode: str = Param("default", help="The reader mode") embedding: BaseEmbeddings run_embedding_in_thread: bool = False - @Param.auto(depends_on="reader_mode") + @Param.auto() def readers(self): - readers = deepcopy(KH_DEFAULT_FILE_EXTRACTORS) - print("reader_mode", self.reader_mode) - if self.reader_mode == "adobe": - readers[".pdf"] = adobe_reader - elif self.reader_mode == "azure-di": - readers[".pdf"] = azure_reader - elif self.reader_mode == "docling": - readers[".pdf"] = docling_reader - + readers: dict[str, BaseReader] = extension_manager.get_current_loader() dev_readers, _, _ = dev_settings() readers.update(dev_readers) return readers - @classmethod - def get_user_settings(cls): - return { - "reader_mode": { - "name": "File loader", - "value": "default", - "choices": [ - ("Default (open-source)", "default"), - ("Adobe API (figure+table extraction)", "adobe"), - ( - "Azure AI Document Intelligence (figure+table extraction)", - "azure-di", - ), - ("Docling (figure+table extraction)", "docling"), - ], - "component": "dropdown", - }, - } - @classmethod def get_pipeline(cls, user_settings, 
index_settings) -> BaseFileIndexIndexing: use_quick_index_mode = user_settings.get("quick_index_mode", False) @@ -715,7 +679,6 @@ def get_pipeline(cls, user_settings, index_settings) -> BaseFileIndexIndexing: ) ], run_embedding_in_thread=use_quick_index_mode, - reader_mode=user_settings.get("reader_mode", "default"), ) return obj @@ -737,11 +700,11 @@ def route(self, file_path: str | Path) -> IndexPipeline: # check if file_path is a URL if self.is_url(file_path): - reader = web_reader + reader = extension_manager.factory.web else: assert isinstance(file_path, Path) ext = file_path.suffix.lower() - reader = self.readers.get(ext, unstructured) + reader = self.readers.get(ext, extension_manager.factory.unstructured) if reader is None: raise NotImplementedError( f"No supported pipeline to index {file_path.name}. Please specify " diff --git a/libs/ktem/ktem/loaders/extensions.py b/libs/ktem/ktem/loaders/extensions.py new file mode 100644 index 000000000..d05f8255b --- /dev/null +++ b/libs/ktem/ktem/loaders/extensions.py @@ -0,0 +1,130 @@ +from copy import deepcopy +from typing import Any + +from llama_index.core.readers.base import BaseReader + +from .factory import ReaderFactory + + +class ExtensionManager: + """Pool of loaders for extensions""" + + def __init__(self, factory: ReaderFactory | None = None): + self.factory = factory or ReaderFactory() + self._supported, self._default_index = self._init_supported() + + def get_current_loader(self) -> dict[str, BaseReader]: + return deepcopy( + { + k: self.get_selected_loader_by_extension(k)[0] + for k, _ in self._supported.items() + } + ) + + def _init_supported(self) -> tuple[dict[str, list[BaseReader]], dict[str, str]]: + supported: dict[str, list[BaseReader]] = { + ".xlsx": [self.factory.pandas_excel], + ".docx": [self.factory.unstructured], + ".pptx": [self.factory.unstructured], + ".xls": [self.factory.unstructured], + ".doc": [self.factory.unstructured], + ".html": [self.factory.html], + ".mhtml": 
[self.factory.mhtml], + ".png": [ + self.factory.unstructured, + self.factory.gocr, + self.factory.docling, + ], + ".jpeg": [ + self.factory.unstructured, + self.factory.gocr, + self.factory.docling, + ], + ".jpg": [ + self.factory.unstructured, + self.factory.gocr, + self.factory.docling, + ], + ".tiff": [self.factory.unstructured, self.factory.docling], + ".tif": [self.factory.unstructured, self.factory.docling], + ".pdf": [ + self.factory.pdf_thumbnail, + self.factory.adobe, + self.factory.azuredi, + self.factory.docling, + ], + ".txt": [self.factory.txt], + ".md": [self.factory.txt], + } + + default_index = { + k: ExtensionManager.get_loader_name(vs[0]) for k, vs in supported.items() + } + + return supported, default_index + + def load(self, settings: dict, prefix="extension"): + for key, value in settings.items(): + if not key.startswith(prefix): + continue + extension = key.replace("extension.", "") + if extension in self._supported: + # Update the default index + # Only if it's in supported list + supported_loader_names = self.get_loaders_by_extension(extension)[1] + if value in supported_loader_names: + self._default_index[extension] = value + else: + print( + f"[{extension}]Can not find loader: {value} from list of " + f"supported extensions: {supported_loader_names}" + ) + + @staticmethod + def get_loader_name(loader: BaseReader) -> str: + return loader.__class__.__name__ + + def get_supported_extensions(self): + return list(self._supported.keys()) + + def get_loaders_by_extension( + self, extension: str + ) -> tuple[list[BaseReader], list[str]]: + loaders = self._supported[extension] + loaders_name = [self.get_loader_name(loader) for loader in loaders] + return loaders, loaders_name + + def get_selected_loader_by_extension( + self, extension: str + ) -> tuple[BaseReader, str]: + supported_loaders: list[BaseReader] = self._supported[extension] + + for loader in supported_loaders: + loader_name = self.get_loader_name(loader) + + if loader_name == 
self._default_index[extension]: + return loader, loader_name + + raise Exception(f"can not find the selected loader for extension: {extension}") + + def generate_gradio_settings(self) -> dict[str, Any]: + """Generates the settings dictionary for use in Gradio.""" + settings = {} + + for extension, loaders in self._supported.items(): + current_loader: str = self._default_index[extension] + loaders_choices: list[str] = [ + self.get_loader_name(loader) for loader in loaders + ] + + settings[extension] = { + "name": f"Loader {extension}", + "value": current_loader, + "choices": loaders_choices, + "component": "dropdown", # You can customize this to "radio" if needed + } + + return settings + + +extension_manager = ExtensionManager() diff --git a/libs/ktem/ktem/loaders/factory.py b/libs/ktem/ktem/loaders/factory.py new file mode 100644 index 000000000..477338adc --- /dev/null +++ b/libs/ktem/ktem/loaders/factory.py @@ -0,0 +1,77 @@ +from functools import cached_property + +from decouple import config +from theflow.settings import settings as flowsettings + +from kotaemon.loaders import ( + AdobeReader, + AzureAIDocumentIntelligenceLoader, + DoclingReader, + GOCR2ImageReader, + HtmlReader, + MathpixPDFReader, + MhtmlReader, + PandasExcelReader, + PDFThumbnailReader, + TxtReader, + UnstructuredReader, + WebReader, +) + + +class ReaderFactory: + @cached_property + def mathpix_pdf(self) -> MathpixPDFReader: + return MathpixPDFReader() + + @cached_property + def web(self) -> WebReader: + return WebReader() + + @cached_property + def unstructured(self) -> UnstructuredReader: + return UnstructuredReader() + + @cached_property + def adobe(self) -> AdobeReader: + adobe_reader = AdobeReader() + adobe_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "") + return adobe_reader + + @cached_property + def azuredi(self) -> AzureAIDocumentIntelligenceLoader: + azuredi_reader = AzureAIDocumentIntelligenceLoader( + endpoint=str(config("AZURE_DI_ENDPOINT", default="")), + 
credential=str(config("AZURE_DI_CREDENTIAL", default="")), + cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None), + ) + azuredi_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "") + return azuredi_reader + + @cached_property + def pandas_excel(self) -> PandasExcelReader: + return PandasExcelReader() + + @cached_property + def html(self) -> HtmlReader: + return HtmlReader() + + @cached_property + def mhtml(self) -> MhtmlReader: + return MhtmlReader() + + @cached_property + def gocr(self) -> GOCR2ImageReader: + return GOCR2ImageReader() + + @cached_property + def txt(self) -> TxtReader: + return TxtReader() + + @cached_property + def docling(self) -> DoclingReader: + return DoclingReader() + + @cached_property + def pdf_thumbnail(self) -> PDFThumbnailReader: + return PDFThumbnailReader() diff --git a/libs/ktem/ktem/pages/chat/__init__.py b/libs/ktem/ktem/pages/chat/__init__.py index 86ed46fa3..1bdf8269e 100644 --- a/libs/ktem/ktem/pages/chat/__init__.py +++ b/libs/ktem/ktem/pages/chat/__init__.py @@ -9,6 +9,7 @@ from ktem.components import reasonings from ktem.db.models import Conversation, engine from ktem.index.file.ui import File +from ktem.loaders.extensions import extension_manager from ktem.reasoning.prompt_optimization.suggest_conversation_name import ( SuggestConvNamePipeline, ) @@ -20,7 +21,6 @@ from theflow.settings import settings as flowsettings from kotaemon.base import Document -from kotaemon.indices.ingests.files import KH_DEFAULT_FILE_EXTRACTORS from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex from .chat_panel import ChatPanel @@ -160,7 +160,8 @@ def on_building_ui(self): if len(self._app.index_manager.indices) > 0: with gr.Accordion(label="Quick Upload") as _: self.quick_file_upload = File( - file_types=list(KH_DEFAULT_FILE_EXTRACTORS.keys()), + # file_types=list(KH_DEFAULT_FILE_EXTRACTORS.keys()), + file_types=extension_manager.get_supported_extensions(), file_count="multiple", container=True, 
show_label=False, diff --git a/libs/ktem/ktem/pages/settings.py b/libs/ktem/ktem/pages/settings.py index b74d641f0..1f8a0e595 100644 --- a/libs/ktem/ktem/pages/settings.py +++ b/libs/ktem/ktem/pages/settings.py @@ -4,6 +4,7 @@ from ktem.app import BasePage from ktem.components import reasonings from ktem.db.models import Settings, User, engine +from ktem.loaders.extensions import extension_manager from sqlmodel import Session, select signout_js = """ @@ -113,6 +114,7 @@ def on_building_ui(self): self.app_tab() self.index_tab() self.reasoning_tab() + self.extension_tab() self.setting_save_btn = gr.Button( "Save changes", variant="primary", scale=1, elem_classes=["right-button"] @@ -177,7 +179,12 @@ def on_register_events(self): self.save_setting, inputs=[self._user_id] + self.components(), outputs=self._settings_state, + ).then( + fn=lambda state: extension_manager.load(state), + inputs=[self._settings_state], + outputs=None, ) + self._components["reasoning.use"].change( self.change_reasoning_mode, inputs=[self._components["reasoning.use"]], @@ -282,6 +289,41 @@ def index_tab(self): if si.special_type == "embedding": self._embeddings.append(obj) + def extension_tab(self): + extensions: list[str] = list(self._default_settings.extension.settings.keys()) + + lefts = extensions[::2] + rights = extensions[1::2] + + if len(lefts) > len(rights): + rights += [""] + + assert len(lefts) == len(rights) + + with gr.Tab("Loader settings"): + for left, right in zip(lefts, rights): + left_setting = self._default_settings.extension.settings.get(left, None) + right_setting = self._default_settings.extension.settings.get( + right, None + ) + + with gr.Row(): + with gr.Column(1): + if left_setting: + left_gradio_obj = render_setting_item( + left_setting, left_setting.value + ) + self._components[f"extension.{left}"] = left_gradio_obj + + with gr.Column(1): + if right_setting: + right_gradio_obj = render_setting_item( + right_setting, right_setting.value + ) + 
self._components[f"extension.{right}"] = right_gradio_obj + else: + gr.TextArea(value="", visible=False) + def reasoning_tab(self): with gr.Tab("Reasoning settings", visible=self._render_reasoning_tab): with gr.Group(): @@ -339,6 +381,8 @@ def load_setting(self, user_id=None): if result: settings = result[0].setting + extension_manager.load(settings) + output = [settings] output += tuple(settings[name] for name in self.component_names()) return output @@ -416,6 +460,7 @@ def update_embeddings(): outputs=[llm], show_progress="hidden", ) + for emb in self._embeddings: self._app.app.load( update_embeddings, diff --git a/libs/ktem/ktem/settings.py b/libs/ktem/ktem/settings.py index 89f5c6518..b9dea27d5 100644 --- a/libs/ktem/ktem/settings.py +++ b/libs/ktem/ktem/settings.py @@ -124,6 +124,7 @@ class SettingGroup(BaseModel): application: BaseSettingGroup = Field(default_factory=BaseSettingGroup) index: SettingIndexGroup = Field(default_factory=SettingIndexGroup) reasoning: SettingReasoningGroup = Field(default_factory=SettingReasoningGroup) + extension: BaseSettingGroup = Field(default_factory=BaseSettingGroup) def flatten(self) -> dict: """Render the setting group into value""" @@ -137,6 +138,9 @@ def flatten(self) -> dict: for key, value in self.reasoning.flatten().items(): output[f"reasoning.{key}"] = value + for key, value in self.extension.flatten().items(): + output[f"extension.{key}"] = value + return output def get_setting_item(self, path: str) -> SettingItem: