|
1 | | -# Contextual PDF Search |
| 1 | +# DocAI Toolkit |
2 | 2 |
|
3 | | -This scripta enable you to ask natural questions about PDF document(s) and get answers generated by a (S)LLM of your choice. It leverages the model's natural language processing capabilities to understand your queries and provide relevant information from the PDF, building a RAG and responds to natural questions. |
| 3 | +Local OCR + Markdown + RAG with optional Hugging Face/custom endpoints. The project was renamed to avoid a PyPI name collision: the package is published as `docai-toolkit` and imported as `docai_toolkit`.
4 | 4 |
|
5 | | -## Features |
6 | | - |
7 | | -* **Question-Answering:** Ask questions in natural language about the content of your PDF. |
8 | | -* **Hugging Face Integration:** Leverages the Hugging Face Transformers library to access a wide range of state-of-the-art LLM models. |
9 | | -* **Sentence Embeddings:** Uses sentence embeddings to efficiently find the most relevant parts of the PDF to answer your questions. |
10 | | -* **Automatic Dependency Management:** Checks and installs required libraries to ensure a smooth setup. |
| 5 | +- `pdf_viewer_app.py`: Tkinter UI to open PDFs, run OCR → Markdown, and “chat” via retrieval + generation. |
| 6 | +- `docai_toolkit/`: library for OCR (local Tesseract or remote endpoint), embedding/indexing (local or remote), and simple chat over FAISS. |
| 7 | +- Status: under active development; APIs and defaults may change as the AI ecosystem moves quickly. |
11 | 8 |
|
12 | 9 | ## Requirements |
13 | 10 |
|
14 | | -* **Python 3.9 or higher:** Please ensure you have a compatible version of Python installed. |
15 | | -* **Hugging Face Account:** You'll need a Hugging Face account to access their models. You can create one for free at [https://huggingface.co/](https://huggingface.co/). |
16 | | -* **Libraries:** The following Python libraries are required and will be installed automatically if not present: |
17 | | - * `langchain` |
18 | | - * `transformers` |
19 | | - * `accelerate` |
20 | | - * `bitsandbytes` |
21 | | - * `sentence_transformers` |
22 | | - |
23 | | -## Usage |
| 11 | +- Python 3.9+ |
| 12 | +- Runtime deps vary by script: |
| 13 | + - Viewer: `PyPDF2`, `reportlab` (for saving) |
| 14 | + - RAG scripts: `langchain`, `langchain-community`, `transformers`, `accelerate`, `bitsandbytes`, `sentence_transformers` |
24 | 15 |
|
25 | | -1. **Save the Script:** Download this script and save it as `pdf_qa.py`. |
| 16 | +Install everything: |
26 | 17 |
|
27 | | -2. **Install Dependencies:** Although the script installs and updates all needed libraries, it sometimes fails to do so. In that case open your terminal or command prompt and run: |
28 | | - ```bash |
29 | | - pip install -r requirements.txt |
30 | | - ``` |
| 18 | +```bash |
| 19 | +pip install -r requirements.txt |
| 20 | +# or editable install |
| 21 | +pip install -e . |
| 22 | +``` |
31 | 23 |
|
32 | | -3. **Run the Script:** |
33 | | - ``` |
34 | | - python3 pdf_qa.py [model_id] [pdf_file_path] |
35 | | - ``` |
36 | | - Replace `[model_id]` with the Hugging Face model ID you want to use (e.g., `mistralai/Mistral-7B-Instruct-v0.1`). You can find a list of available models at [https://huggingface.co/models](https://huggingface.co/models). |
37 | | - Replace `[pdf_file_path]` with the path to your PDF file(s). |
38 | | - |
39 | | -4. **Ask Questions:** |
40 | | - You'll be prompted to enter questions. Type your questions in natural language and press Enter. The script will provide answers based on the content of the PDF. |
| 24 | +## Usage |
41 | 25 |
|
42 | | -5. **Exit:** |
43 | | - Type `exit` and press Enter to quit the script. |
| 26 | +### GUI Viewer |
| 27 | + |
| 28 | +```bash |
| 29 | +python pdf_viewer_app.py |
| 30 | +``` |
| 31 | + |
| 32 | +- Open: loads all pages of a PDF into the text area. |
| 33 | +- Save As: renders the text area content into a new PDF (requires `reportlab`). |
| 34 | +- OCR → Markdown: run OCR on a PDF and save Markdown to the configured output directory (local Tesseract or remote OCR endpoint via HF/custom). |
| 35 | +- Chat: build a quick FAISS index over a chosen Markdown file and query it with a selected HF model (remote endpoint or local HF pipeline). |
| 36 | +- Settings: set the HF token, optional custom endpoints (OCR/embeddings/LLM), model choices, and the output directory. Settings persist to `~/.docai/config.json`. Env vars (`HF_TOKEN`, `HUGGINGFACEHUB_API_TOKEN`, `DOC_AI_HF_TOKEN`, `DOC_AI_OUTPUT_DIR`) are auto-read.
| 37 | + |
| 38 | +### Hugging Face onboarding (fast path) |
| 39 | + |
| 40 | +1. Create a Hugging Face access token: https://huggingface.co/settings/tokens (choose “Read” or “Write” as needed). |
| 41 | +2. Export it so the app can auto-load it: |
| 42 | + ```bash |
| 43 | + export HF_TOKEN=your_token_here |
| 44 | + # or HUGGINGFACEHUB_API_TOKEN=your_token_here |
| 45 | + ``` |
| 46 | +3. Pick models (examples): |
| 47 | + - OCR: point the OCR endpoint at a hosted OCR model (HF Inference API URL). |
| 48 | + - Embeddings: e.g., `sentence-transformers/all-mpnet-base-v2` via Inference Endpoints (text-embeddings task) or local. |
| 49 | + - LLM: e.g., `mistralai/Mistral-7B-Instruct-v0.1` via Inference Endpoints or local HF pipeline. |
| 50 | +4. Start the app, open Settings, and paste endpoints/models if you didn’t set env vars. Output dir can be set there as well. |
| 51 | + |
| 52 | +Environment variables: |
| 53 | +- `HF_TOKEN` / `HUGGINGFACEHUB_API_TOKEN` / `DOC_AI_HF_TOKEN`: auth token (auto-loads into LLM + embeddings). |
| 54 | +- `DOC_AI_OUTPUT_DIR`: default output directory for OCR/Markdown. |
| 55 | + |
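A minimal sketch of how this env-var precedence could be resolved in code. The lookup order shown here is an assumption; only the variable names themselves come from this README:

```python
import os

# Candidate token variables named in this README; first non-empty value wins.
# (The actual precedence inside docai_toolkit may differ.)
TOKEN_VARS = ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "DOC_AI_HF_TOKEN")


def resolve_hf_token(env=os.environ):
    """Return the first non-empty token variable, or None if unset."""
    for name in TOKEN_VARS:
        value = env.get(name)
        if value:
            return value
    return None


def resolve_output_dir(env=os.environ, default="outputs"):
    """Return DOC_AI_OUTPUT_DIR if set, else a default directory."""
    return env.get("DOC_AI_OUTPUT_DIR", default)
```

For example, `resolve_hf_token({"HUGGINGFACEHUB_API_TOKEN": "tok"})` picks up the token even when `HF_TOKEN` is unset.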
| 56 | +### Docker |
| 57 | + |
| 58 | +Build: |
| 59 | +```bash |
| 60 | +docker build -t docai-toolkit . |
| 61 | +``` |
| 62 | + |
| 63 | +Run (GUI requires X/Wayland forwarding; for headless tasks, override CMD): |
| 64 | +```bash |
| 65 | +docker run --rm -v "$PWD":/data docai-toolkit python -m pytest -q
| 66 | +# or override CMD to run batch OCR once you add a CLI entry point
| 67 | +``` |
| 68 | + |
| 69 | +macOS GUI via XQuartz: |
| 70 | +1) Install/start XQuartz (`brew install --cask xquartz`; enable “Allow connections from network clients” in prefs and restart). |
| 71 | +2) Allow local clients: `xhost +localhost` |
| 72 | +3) Run: |
| 73 | +```bash |
| 74 | +docker run --rm -it \ |
| 75 | + -e DISPLAY=host.docker.internal:0 \ |
| 76 | + -v /tmp/.X11-unix:/tmp/.X11-unix \ |
| 77 | + docai-toolkit |
| 78 | +``` |
| 79 | +For day-to-day use, running natively is simpler; use the container when you need an isolated, reproducible environment. |
| 80 | + |
| 81 | +## Tests |
| 82 | + |
| 83 | +Basic round-trip test for the viewer’s PDF writer: |
| 84 | + |
| 85 | +```bash |
| 86 | +pytest |
| 87 | +``` |
| 88 | + |
| 89 | +`reportlab` must be installed for the test to run. |
| 90 | + |
| 91 | +## OCR + RAG (docai_toolkit/) |
| 92 | + |
| 93 | +- OCR: pluggable clients (`RemoteOcrClient` for HF/custom endpoints, `TesseractOcrClient` local fallback) that turn PDFs into Markdown (`ocr/pipeline.py`). |
| 94 | +- RAG: build a FAISS index from Markdown (`rag/index.py`), then chat using a chosen HF model (`rag/chat.py`). |
| 95 | +- Config: lightweight dataclasses in `docai_toolkit/config.py` for selecting providers/models; saved at `~/.docai/config.json`. |
| 96 | +- Remote-friendly: use HF token + model ids by default; configs allow custom OCR/embedding/generation endpoints. FAISS runs locally for fast retrieval. |
| 97 | + |
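As an illustration of the config round trip, a dataclass persisted as JSON could look like this. The field names below are hypothetical, not the actual `docai_toolkit/config.py` schema, and the real file lives at `~/.docai/config.json`:

```python
import json
from dataclasses import asdict, dataclass, field
from pathlib import Path


@dataclass
class AppConfig:
    # Illustrative fields only; see docai_toolkit/config.py for the real ones.
    hf_token: str = ""
    ocr_endpoint: str = ""  # empty means fall back to local Tesseract
    embedding_model: str = "sentence-transformers/all-mpnet-base-v2"
    llm_model: str = "mistralai/Mistral-7B-Instruct-v0.1"
    output_dir: str = "outputs"


def save_config(cfg: AppConfig, path: Path) -> None:
    """Serialize the config to JSON, creating parent dirs as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(asdict(cfg), indent=2))


def load_config(path: Path) -> AppConfig:
    """Rebuild the dataclass from a previously saved JSON file."""
    return AppConfig(**json.loads(path.read_text()))
```

Keeping the on-disk format as plain JSON makes the settings easy to inspect and edit by hand.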
| 98 | +To experiment locally: |
| 99 | + |
| 100 | +```bash |
| 101 | +# OCR to Markdown (Tesseract fallback requires pytesseract + pdf2image installed) |
| 102 | +python - <<'PY' |
| 103 | +from pathlib import Path |
| 104 | +from docai_toolkit.ocr import TesseractOcrClient, run_ocr_to_markdown |
| 105 | +client = TesseractOcrClient() |
| 106 | +md_path = run_ocr_to_markdown(Path("your.pdf"), Path("outputs"), client) |
| 107 | +print("Saved:", md_path) |
| 108 | +PY |
| 109 | + |
| 110 | +# Build index + chat (requires sentence_transformers + transformers) |
| 111 | +python - <<'PY' |
| 112 | +from pathlib import Path |
| 113 | +from docai_toolkit.rag import build_index_from_markdown, chat_over_corpus, load_index |
| 114 | +index_path = Path("outputs/faiss_index") |
| 115 | +db = build_index_from_markdown([Path("outputs/your.md")], persist_path=index_path) |
| 116 | +print(chat_over_corpus(db, "What is this document about?", model_id="mistralai/Mistral-7B-Instruct-v0.1")) |
| 117 | +# Later: db = load_index(index_path) |
| 118 | +PY |
| 119 | +``` |
44 | 120 |
|
45 | 121 | ## License |
46 | 122 |
|
47 | | -This code is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) license. See `LICENSE.md` for details. |
48 | | -
|
49 | | -## Contributing |
50 | | -
|
51 | | -Contributions are welcome! Please feel free to fork this repository and submit pull requests. |
52 | | -
|
53 | | -## Disclaimer |
54 | | -
|
55 | | -This script is provided as-is for educational and personal use. It is not intended for production or commercial applications. The author assumes no liability for any consequences arising from the use of this script. |
56 | | -
|
| 123 | +CC BY-NC-SA 4.0 (see `LICENSE`). |