From db25caa7062706751c5f2dbdfb9d014a832c992c Mon Sep 17 00:00:00 2001 From: Denis Jannot Date: Thu, 15 Jan 2026 10:29:58 +0100 Subject: [PATCH 1/5] Adding topics and multiple parsing improvements Signed-off-by: Denis Jannot --- Dockerfile | 12 +- content-processor.ts | 293 +++++++++++++++++++++++++++++++++---------- doc2vec.ts | 38 +++++- package-lock.json | 4 +- 4 files changed, 275 insertions(+), 72 deletions(-) diff --git a/Dockerfile b/Dockerfile index 06f3dc6..ffec336 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,7 @@ RUN apt-get update && apt-get install -y \ curl \ ca-certificates \ chromium \ + chromium-sandbox \ fonts-freefont-ttf \ fonts-ipafont-gothic \ fonts-kacst \ @@ -26,4 +27,13 @@ RUN apt-get update && apt-get install -y \ libxrandr2 \ libxshmfence1 \ libxtst6 \ - && apt-get clean \ No newline at end of file + && apt-get clean \ + && ln -s /usr/bin/chromium /usr/bin/chromium-browser || true + +COPY package*.json ./ +RUN npm install --ignore-scripts +# Install Chrome via Puppeteer as fallback (system Chromium will be used first) +RUN npx puppeteer browsers install chrome || true +COPY . . + +RUN npm run build \ No newline at end of file diff --git a/content-processor.ts b/content-processor.ts index 0d8652e..51df2b2 100644 --- a/content-processor.ts +++ b/content-processor.ts @@ -364,7 +364,18 @@ export class ContentProcessor { // Original HTML page processing logic let browser: Browser | null = null; try { + // Use system Chromium if available (for Docker environments) + let executablePath: string | undefined = process.env.PUPPETEER_EXECUTABLE_PATH; + if (!executablePath) { + if (fs.existsSync('/usr/bin/chromium')) { + executablePath = '/usr/bin/chromium'; + } else if (fs.existsSync('/usr/bin/chromium-browser')) { + executablePath = '/usr/bin/chromium-browser'; + } + } + browser = await puppeteer.launch({ + executablePath, args: ['--no-sandbox', '--disable-setuid-sandbox'], }); const page: Page = await browser.newPage(); @@ -372,7 +383,15 @@ export class ContentProcessor { await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 }); const htmlContent: string = await page.evaluate(() => { - const mainContentElement = document.querySelector('div[role="main"].document') || document.querySelector('main') || document.body; + // 💡 Try specific content selectors first, then fall back to broader ones + const mainContentElement = + document.querySelector('.docs-content') || // Common docs pattern + document.querySelector('.doc-content') || // Alternative docs pattern + document.querySelector('.markdown-body') || // GitHub-style + document.querySelector('article') || // Semantic article + document.querySelector('div[role="main"].document') || + document.querySelector('main') || + document.body; return mainContentElement.innerHTML; }); @@ -392,10 +411,25 @@ export class ContentProcessor { this.markCodeParents(pre.parentElement); }); + // 💡 Extract H1s BEFORE Readability - it often strips them as "chrome" + // We'll inject them back after Readability processing + const h1Elements = document.querySelectorAll('h1'); + const extractedH1s: string[] = []; + logger.debug(`[Readability Debug] Found ${h1Elements.length} H1 elements before Readability`); + h1Elements.forEach((h1: Element, index: number) => { + const h1Text = h1.textContent?.trim() || ''; + // Skip empty H1s or icon-only H1s (like "link" anchors) + if (h1Text && h1Text.length > 3 && !h1Text.match(/^(link|#|menu|close)$/i)) { + extractedH1s.push(h1Text); + logger.debug(`[Readability Debug] Extracted H1[${index}]: 
"${h1Text.substring(0, 50)}..."`); + } + h1.classList.add('original-h1'); + }); + logger.debug(`Applying Readability to extract main content`); const reader = new Readability(document, { charThreshold: 20, - classesToPreserve: ['article-content'], + classesToPreserve: ['article-content', 'original-h1'], }); const article = reader.parse(); @@ -404,9 +438,38 @@ export class ContentProcessor { await browser.close(); return null; } + + // Debug: Log what Readability extracted + logger.debug(`[Readability Debug] article.title: "${article.title}"`); + logger.debug(`[Readability Debug] article.content length: ${article.content?.length}`); + logger.debug(`[Readability Debug] article.content starts with: "${article.content?.substring(0, 200)}..."`); + logger.debug(`[Readability Debug] Contains H1 tag: ${article.content?.includes(' { + logger.debug(`[Readability Debug] Restoring[${index}]: tagName=${heading.tagName}, text="${heading.textContent?.trim().substring(0, 50)}..."`); + // Create a new H1 element with the same content + const h1 = articleDoc.createElement('h1'); + h1.innerHTML = heading.innerHTML; + // Copy other attributes except class + Array.from(heading.attributes).forEach(attr => { + if (attr.name !== 'class') { + h1.setAttribute(attr.name, attr.value); + } + }); + heading.replaceWith(h1); + }); + const restoredContent = articleDoc.body.innerHTML; + logger.debug(`[Readability Debug] Restored content contains H1: ${restoredContent.includes(' 0 ? extractedH1s[0] : (article.title?.trim() || ''); + if (pageTitle) { + // Check if markdown already starts with this exact H1 (allowing for leading whitespace) + const normalizedTitle = pageTitle.replace(/\s+/g, ' '); + const markdownFirstLine = markdown.trimStart().split('\n')[0] || ''; + const existingH1Match = markdownFirstLine.match(/^#\s+(.+)$/); + const existingH1Text = existingH1Match ? 
existingH1Match[1].replace(/\s+/g, ' ').trim() : ''; + + // Only inject if markdown doesn't already start with this H1 + if (!existingH1Match || existingH1Text !== normalizedTitle) { + markdown = `# ${pageTitle}\n\n${markdown}`; + logger.debug(`[Readability Debug] Injected page title as H1: "${pageTitle}"`); + } else { + logger.debug(`[Readability Debug] H1 "${pageTitle}" already present in markdown`); + } + } + logger.debug(`Markdown conversion complete (${markdown.length} chars)`); return markdown; } catch (error) { @@ -743,91 +827,174 @@ export class ContentProcessor { async chunkMarkdown(markdown: string, sourceConfig: SourceConfig, url: string): Promise { const logger = this.logger.child('chunker'); - logger.debug(`Chunking markdown from ${url} (${markdown.length} chars)`); + // --- Configuration --- const MAX_TOKENS = 1000; + const MIN_TOKENS = 150; // 💡 Merges "OpenAI-compatible" sentence into the next block + const OVERLAP_PERCENT = 0.1; // 10% overlap for large splits + const chunks: DocumentChunk[] = []; const lines = markdown.split("\n"); - let currentChunk = ""; + + let buffer = ""; let headingHierarchy: string[] = []; - - const processChunk = () => { - if (currentChunk.trim()) { - const tokens = Utils.tokenize(currentChunk); - if (tokens.length > MAX_TOKENS) { - logger.debug(`Chunk exceeds max token count (${tokens.length}), splitting into smaller chunks`); - let subChunk = ""; - let tokenCount = 0; - const overlapSize = Math.floor(MAX_TOKENS * 0.05); - let lastTokens: string[] = []; - - for (const token of tokens) { - if (tokenCount + 1 > MAX_TOKENS) { - chunks.push(createDocumentChunk(subChunk, headingHierarchy)); - subChunk = lastTokens.join("") + token; - tokenCount = lastTokens.length + 1; - lastTokens = []; - } else { - subChunk += token; - tokenCount++; - lastTokens.push(token); - if (lastTokens.length > overlapSize) { - lastTokens.shift(); - } - } - } - if (subChunk) { - chunks.push(createDocumentChunk(subChunk, headingHierarchy)); - } - } else { - chunks.push(createDocumentChunk(currentChunk, headingHierarchy)); - } + let bufferHeadings: Array<{ level: number; text: string }> = []; // Track headings in current buffer + + /** + * Computes the topic hierarchy for merged content. + * When merging sibling sections (same level), uses their parent heading. + * Otherwise uses the current hierarchy. + */ + const computeTopicHierarchy = (): string[] => { + if (bufferHeadings.length === 0) { + return headingHierarchy; + } + + // Find the deepest level (most recent headings) + const deepestLevel = Math.max(...bufferHeadings.map(h => h.level)); + + // Get all headings at the deepest level + const deepestHeadings = bufferHeadings.filter(h => h.level === deepestLevel); + + // If we have multiple sibling headings at the deepest level, use their parent + if (deepestHeadings.length > 1 && deepestLevel > 1) { + // Use parent heading (one level up from the sibling headings) + // headingHierarchy still contains the parent at index (deepestLevel - 2) + // We want everything up to (but not including) the deepest level + return headingHierarchy.slice(0, deepestLevel - 1); } - currentChunk = ""; + + // Single heading or different levels: use the current hierarchy + // This reflects the most recent heading which is appropriate + return headingHierarchy; }; - + + /** + * Internal helper to create the final chunk object with injected context. 
+ */ const createDocumentChunk = (content: string, hierarchy: string[]): DocumentChunk => { - const chunkId = Utils.generateHash(content); - logger.debug(`Created chunk ${chunkId.substring(0, 8)}... with ${content.length} chars`); + // 💡 BREADCRUMB INJECTION + // We prepend the hierarchy to the text. This makes the vector highly relevant + // to searches for parent topics even if the body doesn't mention them. + const breadcrumbs = hierarchy.filter(h => h).join(" > "); + const contextPrefix = breadcrumbs ? `[Topic: ${breadcrumbs}]\n` : ""; + const searchableText = contextPrefix + content.trim(); + console.log(searchableText); + console.log(hierarchy); + const chunkId = Utils.generateHash(searchableText); return { - content, + content: searchableText, metadata: { product_name: sourceConfig.product_name, version: sourceConfig.version, - heading_hierarchy: [...hierarchy], + heading_hierarchy: hierarchy.filter(h => h), section: hierarchy[hierarchy.length - 1] || "Introduction", chunk_id: chunkId, url: url, - hash: Utils.generateHash(content) + hash: chunkId } }; }; - + + /** + * Flushes the current buffer into the chunks array. + * Uses sub-splitting logic if the buffer exceeds MAX_TOKENS. + */ + const flushBuffer = (force = false) => { + const trimmedBuffer = buffer.trim(); + if (!trimmedBuffer) return; + + const tokenCount = Utils.tokenize(trimmedBuffer).length; + + // 💡 SEMANTIC MERGING + // If the current section is too short (like just a title or a one-liner), + // we don't flush yet unless it's the end of the file (force=true). + if (tokenCount < MIN_TOKENS && !force) { + return; + } + + // Compute the appropriate topic hierarchy for merged content + const topicHierarchy = computeTopicHierarchy(); + + if (tokenCount > MAX_TOKENS) { + // 💡 RECURSIVE OVERLAP SPLITTING + // If the section is a massive guide, split it but keep headers on every sub-piece. + const tokens = Utils.tokenize(trimmedBuffer); + const overlapSize = Math.floor(MAX_TOKENS * OVERLAP_PERCENT); + + for (let i = 0; i < tokens.length; i += (MAX_TOKENS - overlapSize)) { + const subTokens = tokens.slice(i, i + MAX_TOKENS); + const subContent = subTokens.join(""); + chunks.push(createDocumentChunk(subContent, topicHierarchy)); + } + } else { + chunks.push(createDocumentChunk(trimmedBuffer, topicHierarchy)); + } + + buffer = ""; // Reset buffer after successful flush + bufferHeadings = []; // Reset tracked headings + }; + + // --- Main Processing Loop --- for (const line of lines) { - if (line.startsWith("#")) { - processChunk(); + const isHeading = line.startsWith("#"); + + if (isHeading) { + // Update Hierarchy Stack for the new heading const levelMatch = line.match(/^(#+)/); - let level = levelMatch ? levelMatch[1].length : 1; - const heading = line.replace(/^#+\s*/, "").trim(); - - logger.debug(`Found heading (level ${level}): ${heading}`); + const level = levelMatch ? 
levelMatch[1].length : 1; + // Clean heading: remove markdown prefix and anchor links like [](#anchor-id) + const headingText = line + .replace(/^#+\s*/, "") // Remove ## prefix + .replace(/\[.*?\]\(#[^)]*\)/g, "") // Remove [text](#anchor) patterns + .replace(/\[\]\(#[^)]*\)/g, "") // Remove [](#anchor) patterns + .trim(); - while (headingHierarchy.length < level - 1) { - headingHierarchy.push(""); - } - - if (level <= headingHierarchy.length) { - headingHierarchy = headingHierarchy.slice(0, level - 1); + // Check if we should merge with previous content + const currentTokenCount = Utils.tokenize(buffer.trim()).length; + const hasBufferContent = currentTokenCount > 0; + const bufferIsSmall = currentTokenCount < MIN_TOKENS; + + // Only merge if: + // 1. Buffer has content and is small + // 2. Buffer has tracked headings (we're merging sections, not just content) + // 3. New heading is at same or deeper level than the deepest heading in buffer (siblings or children) + // If new heading is shallower (e.g., H2 after H3), it's a new section - flush first + const deepestBufferLevel = bufferHeadings.length > 0 + ? Math.max(...bufferHeadings.map(h => h.level)) + : 0; + const shouldMerge = hasBufferContent && bufferIsSmall && bufferHeadings.length > 0 && + level >= deepestBufferLevel; + + if (!shouldMerge && hasBufferContent) { + // Buffer is large enough OR new heading starts a new section - flush first + flushBuffer(); } - headingHierarchy[level - 1] = heading; + // If shouldMerge is true, we keep the buffer and merge the sections + + // Reset hierarchy below this level (e.g., H2 reset should clear previous H3s) + headingHierarchy = headingHierarchy.slice(0, level - 1); + headingHierarchy[level - 1] = headingText; + + // Track this heading in the buffer + bufferHeadings.push({ level, text: headingText }); + + buffer += `${line}\n`; } else { - currentChunk += `${line}\n`; + buffer += `${line}\n`; + + // Safety valve: if a single section is huge, flush it periodically + if (Utils.tokenize(buffer).length >= MAX_TOKENS) { + flushBuffer(); + } } } - processChunk(); + + // Final sweep + flushBuffer(true); - logger.debug(`Chunking complete, created ${chunks.length} chunks`); + logger.debug(`Chunking complete: ${chunks.length} rich context chunks created.`); return chunks; } } \ No newline at end of file diff --git a/doc2vec.ts b/doc2vec.ts index 4a9d8bc..be3e53b 100644 --- a/doc2vec.ts +++ b/doc2vec.ts @@ -124,14 +124,40 @@ class Doc2Vec { return response.data; } catch (error: any) { if (error.response && error.response.status === 403) { + // Check if this is actually a rate limit error + const rateLimitRemaining = error.response.headers['x-ratelimit-remaining']; const resetTime = error.response.headers['x-ratelimit-reset']; - const currentTime = Math.floor(Date.now() / 1000); - const waitTime = resetTime ? (resetTime - currentTime) * 1000 : delay * 2; - logger.warn(`GitHub rate limit exceeded. Waiting ${waitTime / 1000}s`); - await new Promise(res => setTimeout(res, waitTime)); + + if (rateLimitRemaining === '0' && resetTime) { + const currentTime = Math.floor(Date.now() / 1000); + const resetTimestamp = parseInt(resetTime, 10); + let waitTime = (resetTimestamp - currentTime) * 1000; + + // Ensure waitTime is at least 1 second (in case resetTime is in the past) + if (waitTime < 1000) { + waitTime = 1000; + } + + logger.warn(`GitHub rate limit exceeded. 
Waiting ${Math.ceil(waitTime / 1000)}s (attempt ${attempt + 1}/${retries})`); + await new Promise(res => setTimeout(res, waitTime)); + + // Retry the request after waiting + continue; + } else { + // Other 403 errors (e.g., forbidden access) + logger.error(`GitHub API returned 403 (not rate limit): ${error.message}`); + throw error; + } } else { - logger.error(`GitHub fetch failed: ${error.message}`); - throw error; + // For non-403 errors, wait before retrying (exponential backoff) + if (attempt < retries - 1) { + const backoffDelay = delay * Math.pow(2, attempt); + logger.warn(`GitHub fetch failed (attempt ${attempt + 1}/${retries}): ${error.message}. Retrying in ${backoffDelay}ms`); + await new Promise(res => setTimeout(res, backoffDelay)); + } else { + logger.error(`GitHub fetch failed: ${error.message}`); + throw error; + } } } } diff --git a/package-lock.json b/package-lock.json index 4afa738..aeb3d1a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "doc2vec", - "version": "1.1.1", + "version": "1.3.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "doc2vec", - "version": "1.1.1", + "version": "1.3.0", "license": "ISC", "dependencies": { "@mozilla/readability": "^0.4.4", From a57817a667d8f49301316c764571e99eb0c6fbad Mon Sep 17 00:00:00 2001 From: Denis Jannot Date: Thu, 15 Jan 2026 15:02:26 +0100 Subject: [PATCH 2/5] Adding chunk index and total Signed-off-by: Denis Jannot --- README.md | 80 +++++++++++++++++++++++++++++++++++++++++++- content-processor.ts | 17 +++++++--- database.ts | 24 +++++++++---- doc2vec.ts | 22 +++++++++++- types.ts | 2 ++ 5 files changed, 133 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index da49910..c21bdae 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,10 @@ The primary goal is to prepare documentation content for Retrieval-Augmented Gen * **Local Directory Processing:** Scans local directories for files, converts content to searchable chunks. * **PDF Support:** Automatically extracts text from PDF files and converts them to Markdown format using Mozilla's PDF.js. * **Content Extraction:** Uses Puppeteer for rendering JavaScript-heavy pages and `@mozilla/readability` to extract the main article content. + * **Smart H1 Preservation:** Automatically extracts and preserves page titles (H1 headings) that Readability might strip as "page chrome", ensuring proper heading hierarchy. + * **Flexible Content Selectors:** Supports multiple content container patterns (`.docs-content`, `.doc-content`, `.markdown-body`, `article`, etc.) for better compatibility with various documentation sites. * **HTML to Markdown:** Converts extracted HTML to clean Markdown using `turndown`, preserving code blocks and basic formatting. + * **Clean Heading Text:** Automatically removes anchor links (like `[](#section-id)`) from heading text for cleaner hierarchy display. * **Intelligent Chunking:** Splits Markdown content into manageable chunks based on headings and token limits, preserving context. * **Vector Embeddings:** Generates embeddings for each chunk using OpenAI's `text-embedding-3-large` model. * **Vector Storage:** Supports storing chunks, metadata, and embeddings in: @@ -32,6 +35,58 @@ The primary goal is to prepare documentation content for Retrieval-Augmented Gen * **Configuration:** Driven by a YAML configuration file (`config.yaml`) specifying sites, repositories, local directories, Zendesk instances, database types, metadata, and other parameters. 
* **Structured Logging:** Uses a custom logger (`logger.ts`) with levels, timestamps, colors, progress bars, and child loggers for clear execution monitoring. +## Chunk Metadata & Page Reconstruction + +Each chunk stored in the database includes rich metadata that enables powerful retrieval and page reconstruction capabilities. + +### Metadata Fields + +| Field | Type | Description | +|-------|------|-------------| +| `product_name` | string | Product identifier from config | +| `version` | string | Version identifier from config | +| `heading_hierarchy` | string[] | Hierarchical breadcrumb trail (e.g., `["Installation", "Prerequisites", "Docker"]`) | +| `section` | string | Current section heading | +| `chunk_id` | string | Unique hash identifier for the chunk | +| `url` | string | Source URL/path of the original document | +| `hash` | string | Content hash for change detection | +| `chunk_index` | number | Position of this chunk within the page (0-based) | +| `total_chunks` | number | Total number of chunks for this page | + +### Page Reconstruction + +The `chunk_index` and `total_chunks` fields enable you to reconstruct full pages from chunks: + +```typescript +// Example: Retrieve all chunks for a URL and reconstruct the page +const chunks = await db.query({ + filter: { url: "https://docs.example.com/guide" }, + sort: { chunk_index: "asc" } +}); + +// Check if there are more chunks after the current one +if (currentChunk.chunk_index < currentChunk.total_chunks - 1) { + // More chunks available - fetch the next one + const nextChunkIndex = currentChunk.chunk_index + 1; +} + +// Reconstruct full page content +const fullPageContent = chunks + .sort((a, b) => a.chunk_index - b.chunk_index) + .map(c => c.content) + .join("\n\n"); +``` + +### Heading Hierarchy (Breadcrumbs) + +Each chunk includes a `heading_hierarchy` array that provides context about where the content appears in the document structure. This is injected as a `[Topic: ...]` prefix in the chunk content to improve vector search relevance. + +For example, a chunk under "Installation > Prerequisites > Docker" will have: +- `heading_hierarchy`: `["Installation", "Prerequisites", "Docker"]` +- Content prefix: `[Topic: Installation > Prerequisites > Docker]` + +This ensures that searches for parent topics (like "Installation") will also match relevant child content. + ## Prerequisites * **Node.js:** Version 18 or higher recommended (check `.nvmrc` if available). @@ -345,4 +400,27 @@ If you don't specify a config path, it will look for config.yaml in the current * **Embed (if needed):** If the chunk is new or changed, call the OpenAI API (`createEmbeddings`) to get the vector embedding. * **Store:** Insert or update the chunk, metadata, hash, and embedding in the database (SQLite `vec_items` table or Qdrant collection). 4. **Cleanup:** After processing, remove any obsolete chunks from the database. -4. **Complete:** Log completion status. \ No newline at end of file +4. **Complete:** Log completion status. 
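+
+The **Embed (if needed)** and **Store** steps above boil down to a per-chunk hash comparison. A minimal sketch, assuming the `checkHashStmt` prepared statement from `database.ts` and the `createEmbeddings` helper; the storage call name is illustrative, and the real flow also supports Qdrant and handles errors:
+
+```typescript
+for (const chunk of chunks) {
+  const hash = Utils.generateHash(chunk.content);
+  const existing = checkHashStmt.get(chunk.metadata.chunk_id) as { hash: string } | undefined;
+
+  if (existing && existing.hash === hash) {
+    continue; // unchanged since the last run: skip the embedding call
+  }
+
+  const embedding = await createEmbeddings(chunk.content); // OpenAI text-embedding-3-large
+  storeChunk(chunk, embedding, hash); // illustrative: insert-or-update via the prepared statements in database.ts
+}
+```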
+ +## Recent Changes + +### Page Reconstruction Support +- Added `chunk_index` field to track each chunk's position within a page (0-based) +- Added `total_chunks` field to indicate the total number of chunks per page +- Enables AI agents and applications to fetch additional context or reconstruct full pages +- Works consistently across all content types: websites, GitHub, Zendesk, and local directories + +### Improved H1/Title Handling +- Smart H1 preservation ensures page titles aren't stripped by Readability +- Falls back to `article.title` when H1 extraction fails +- Proper heading hierarchy starting from H1 through the document structure + +### Enhanced Content Extraction +- Added support for multiple content container selectors (`.docs-content`, `.doc-content`, `.markdown-body`, `article`) +- Cleaner heading text by removing anchor links like `[](#section-id)` +- Better handling of pages where H1 is outside the main content container + +### Heading Hierarchy Improvements +- Fixed sparse array issues that caused `NULL` values in heading hierarchy +- Proper breadcrumb generation for nested sections +- Hierarchical context preserved across chunk boundaries \ No newline at end of file diff --git a/content-processor.ts b/content-processor.ts index 51df2b2..82c02c0 100644 --- a/content-processor.ts +++ b/content-processor.ts @@ -839,6 +839,7 @@ export class ContentProcessor { let buffer = ""; let headingHierarchy: string[] = []; let bufferHeadings: Array<{ level: number; text: string }> = []; // Track headings in current buffer + let chunkCounter = 0; // Tracks chunk position within this page for ordering /** * Computes the topic hierarchy for merged content. @@ -879,11 +880,9 @@ export class ContentProcessor { const breadcrumbs = hierarchy.filter(h => h).join(" > "); const contextPrefix = breadcrumbs ? `[Topic: ${breadcrumbs}]\n` : ""; const searchableText = contextPrefix + content.trim(); - console.log(searchableText); - console.log(hierarchy); const chunkId = Utils.generateHash(searchableText); - return { + const chunk: DocumentChunk = { content: searchableText, metadata: { product_name: sourceConfig.product_name, @@ -892,9 +891,13 @@ export class ContentProcessor { section: hierarchy[hierarchy.length - 1] || "Introduction", chunk_id: chunkId, url: url, - hash: chunkId + hash: chunkId, + chunk_index: chunkCounter, + total_chunks: 0 // Placeholder, will be updated after all chunks are created } }; + chunkCounter++; // Increment for next chunk + return chunk; }; /** @@ -994,6 +997,12 @@ export class ContentProcessor { // Final sweep flushBuffer(true); + // Update all chunks with the final total count + const totalChunks = chunks.length; + for (const chunk of chunks) { + chunk.metadata.total_chunks = totalChunks; + } + logger.debug(`Chunking complete: ${chunks.length} rich context chunks created.`); return chunks; } diff --git a/database.ts b/database.ts index eb9f810..4991544 100644 --- a/database.ts +++ b/database.ts @@ -41,7 +41,9 @@ export class DatabaseManager { chunk_id TEXT UNIQUE, content TEXT, url TEXT, - hash TEXT + hash TEXT, + chunk_index INTEGER, + total_chunks INTEGER ); `); logger.info(`SQLite database initialized successfully`); @@ -220,12 +222,12 @@ export class DatabaseManager { static prepareSQLiteStatements(db: Database) { return { insertStmt: db.prepare(` - INSERT INTO vec_items (embedding, product_name, version, heading_hierarchy, section, chunk_id, content, url, hash) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ INSERT INTO vec_items (embedding, product_name, version, heading_hierarchy, section, chunk_id, content, url, hash, chunk_index, total_chunks) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) `), checkHashStmt: db.prepare(`SELECT hash FROM vec_items WHERE chunk_id = ?`), updateStmt: db.prepare(` - UPDATE vec_items SET embedding = ?, product_name = ?, version = ?, heading_hierarchy = ?, section = ?, content = ?, url = ?, hash = ? + UPDATE vec_items SET embedding = ?, product_name = ?, version = ?, heading_hierarchy = ?, section = ?, content = ?, url = ?, hash = ?, chunk_index = ?, total_chunks = ? WHERE chunk_id = ? `), getAllChunkIdsStmt: db.prepare(`SELECT chunk_id FROM vec_items`), @@ -247,13 +249,21 @@ export class DatabaseManager { chunk.metadata.chunk_id, chunk.content, chunk.metadata.url, - hash + hash, + chunk.metadata.chunk_index, + chunk.metadata.total_chunks ]; try { insertStmt.run(params); } catch (error) { - updateStmt.run([...params.slice(0, 8), chunk.metadata.chunk_id]); + // Update params: all fields except chunk_id (which is the WHERE clause), then chunk_id at end for WHERE + const updateParams = [ + ...params.slice(0, 5), // embedding through section + ...params.slice(6), // content through total_chunks (skip chunk_id) + chunk.metadata.chunk_id // WHERE clause + ]; + updateStmt.run(updateParams); } }); @@ -287,6 +297,8 @@ export class DatabaseManager { url: chunk.metadata.url, hash: hash, original_chunk_id: chunk.metadata.chunk_id, + chunk_index: chunk.metadata.chunk_index, + total_chunks: chunk.metadata.total_chunks, }, }; diff --git a/doc2vec.ts b/doc2vec.ts index be3e53b..940123a 100644 --- a/doc2vec.ts +++ b/doc2vec.ts @@ -114,15 +114,30 @@ class Doc2Vec { const fetchWithRetry = async (url: string, params = {}, retries = 5, delay = 5000): Promise => { for (let attempt = 0; attempt < retries; attempt++) { try { + // Only log on retries to reduce noise during pagination + if (attempt > 0) { + logger.debug(`GitHub API retry: ${url} (attempt ${attempt + 1}/${retries})`); + } const response = await axios.get(url, { headers: { Authorization: `token ${GITHUB_TOKEN}`, Accept: 'application/vnd.github.v3+json', }, params, + timeout: 30000, // 30 second timeout }); return response.data; } catch (error: any) { + // Enhanced error logging for debugging + const errorDetails = { + code: error.code, + message: error.message, + status: error.response?.status, + isTimeout: error.code === 'ECONNABORTED' || error.message?.includes('timeout'), + isNetworkError: !error.response && error.code, + }; + logger.debug(`GitHub API error details: ${JSON.stringify(errorDetails)}`); + if (error.response && error.response.status === 403) { // Check if this is actually a rate limit error const rateLimitRemaining = error.response.headers['x-ratelimit-remaining']; @@ -155,7 +170,7 @@ class Doc2Vec { logger.warn(`GitHub fetch failed (attempt ${attempt + 1}/${retries}): ${error.message}. Retrying in ${backoffDelay}ms`); await new Promise(res => setTimeout(res, backoffDelay)); } else { - logger.error(`GitHub fetch failed: ${error.message}`); + logger.error(`GitHub fetch failed after ${retries} attempts: ${error.message} (code: ${error.code || 'unknown'})`); throw error; } } @@ -171,6 +186,11 @@ class Doc2Vec { const sinceTimestamp = new Date(sinceDate); while (true) { + // Log progress every 10 pages to reduce noise + if (page === 1 || page % 10 === 0) { + logger.debug(`Fetching issues page ${page}... 
(${issues.length} issues so far)`); + } + const data = await fetchWithRetry(GITHUB_API_URL, { per_page: perPage, page, diff --git a/types.ts b/types.ts index db61f39..68b5c8e 100644 --- a/types.ts +++ b/types.ts @@ -78,6 +78,8 @@ export interface DocumentChunk { chunk_id: string; url: string; hash?: string; + chunk_index: number; // Position of this chunk within the page (0-based) + total_chunks: number; // Total number of chunks for this page, allows knowing if more chunks exist }; } From e1d60147c1b511c7cc2007fab944f981f3423b32 Mon Sep 17 00:00:00 2001 From: Denis Jannot Date: Sun, 18 Jan 2026 16:57:39 +0100 Subject: [PATCH 3/5] Fixing sqlite Signed-off-by: Denis Jannot --- content-processor.ts | 4 ++-- database.ts | 50 +++++++++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/content-processor.ts b/content-processor.ts index 82c02c0..811d0a0 100644 --- a/content-processor.ts +++ b/content-processor.ts @@ -892,7 +892,7 @@ export class ContentProcessor { chunk_id: chunkId, url: url, hash: chunkId, - chunk_index: chunkCounter, + chunk_index: Math.floor(chunkCounter), total_chunks: 0 // Placeholder, will be updated after all chunks are created } }; @@ -998,7 +998,7 @@ export class ContentProcessor { flushBuffer(true); // Update all chunks with the final total count - const totalChunks = chunks.length; + const totalChunks = Math.floor(chunks.length); for (const chunk of chunks) { chunk.metadata.total_chunks = totalChunks; } diff --git a/database.ts b/database.ts index 4991544..c75d2aa 100644 --- a/database.ts +++ b/database.ts @@ -240,30 +240,38 @@ export class DatabaseManager { const hash = chunkHash || Utils.generateHash(chunk.content); const transaction = db.transaction(() => { - const params = [ - new Float32Array(embedding), - chunk.metadata.product_name, - chunk.metadata.version, - JSON.stringify(chunk.metadata.heading_hierarchy), - chunk.metadata.section, - chunk.metadata.chunk_id, - chunk.content, - chunk.metadata.url, - hash, - chunk.metadata.chunk_index, - chunk.metadata.total_chunks - ]; + // Use BigInt for true integer representation in SQLite vec0 + const chunkIndex = BigInt(chunk.metadata.chunk_index | 0); + const totalChunks = BigInt(chunk.metadata.total_chunks | 0); try { - insertStmt.run(params); + insertStmt.run( + new Float32Array(embedding), + chunk.metadata.product_name, + chunk.metadata.version, + JSON.stringify(chunk.metadata.heading_hierarchy), + chunk.metadata.section, + chunk.metadata.chunk_id, + chunk.content, + chunk.metadata.url, + hash, + chunkIndex, + totalChunks + ); } catch (error) { - // Update params: all fields except chunk_id (which is the WHERE clause), then chunk_id at end for WHERE - const updateParams = [ - ...params.slice(0, 5), // embedding through section - ...params.slice(6), // content through total_chunks (skip chunk_id) - chunk.metadata.chunk_id // WHERE clause - ]; - updateStmt.run(updateParams); + updateStmt.run( + new Float32Array(embedding), + chunk.metadata.product_name, + chunk.metadata.version, + JSON.stringify(chunk.metadata.heading_hierarchy), + chunk.metadata.section, + chunk.content, + chunk.metadata.url, + hash, + chunkIndex, + totalChunks, + chunk.metadata.chunk_id + ); } }); From 670f85de70d31f1896688c6f60afb7dc894305c7 Mon Sep 17 00:00:00 2001 From: Denis Jannot Date: Sun, 18 Jan 2026 17:38:35 +0100 Subject: [PATCH 4/5] Adding doc and docx support Signed-off-by: Denis Jannot --- README.md | 75 ++++++++++++++- content-processor.ts | 86 +++++++++++++++++ package-lock.json | 223 
++++++++++++++++++++++++++++++++++++++++++- package.json | 4 +- 4 files changed, 383 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c21bdae..c9235cd 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ The primary goal is to prepare documentation content for Retrieval-Augmented Gen * **Flexible Filtering:** Filter tickets by status and priority. * **Local Directory Processing:** Scans local directories for files, converts content to searchable chunks. * **PDF Support:** Automatically extracts text from PDF files and converts them to Markdown format using Mozilla's PDF.js. + * **Word Document Support:** Processes both legacy `.doc` and modern `.docx` files, extracting text and formatting. * **Content Extraction:** Uses Puppeteer for rendering JavaScript-heavy pages and `@mozilla/readability` to extract the main article content. * **Smart H1 Preservation:** Automatically extracts and preserves page titles (H1 headings) that Readability might strip as "page chrome", ensuring proper heading hierarchy. * **Flexible Content Selectors:** Supports multiple content container patterns (`.docs-content`, `.doc-content`, `.markdown-body`, `article`, etc.) for better compatibility with various documentation sites. @@ -154,7 +155,7 @@ Configuration is managed through two files: For local directories (`type: 'local_directory'`): * `path`: Path to the local directory to process. - * `include_extensions`: (Optional) Array of file extensions to include (e.g., `['.md', '.txt', '.pdf']`). Defaults to `['.md', '.txt', '.html', '.htm', '.pdf']`. + * `include_extensions`: (Optional) Array of file extensions to include (e.g., `['.md', '.txt', '.pdf', '.doc', '.docx']`). Defaults to `['.md', '.txt', '.html', '.htm', '.pdf']`. * `exclude_extensions`: (Optional) Array of file extensions to exclude. * `recursive`: (Optional) Whether to traverse subdirectories (defaults to `true`). * `url_rewrite_prefix` (Optional) URL prefix to rewrite `file://` URLs (e.g., `https://mydomain.com`) @@ -216,9 +217,9 @@ Configuration is managed through two files: product_name: 'project-docs' version: 'current' path: './docs' - include_extensions: ['.md', '.txt', '.pdf'] + include_extensions: ['.md', '.txt', '.pdf', '.doc', '.docx'] recursive: true - max_size: 10485760 # 10MB recommended for PDF files + max_size: 10485760 # 10MB recommended for PDF/Word files database_config: type: 'sqlite' params: @@ -346,6 +347,67 @@ A PDF file named "user-guide.pdf" will be converted to Markdown format like: The resulting Markdown is then chunked and embedded using the same process as other text content. +## Word Document Processing + +Doc2Vec supports processing Microsoft Word documents in both legacy `.doc` format and modern `.docx` format. 
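+
+The two formats take different paths to Markdown. The sketch below condenses the flow implemented by `convertDocToMarkdown` and `convertDocxToMarkdown` in `content-processor.ts`; the helper name here is illustrative, and the real code also sanitizes the intermediate HTML, logs mammoth warnings, and handles errors:
+
+```typescript
+import path from 'path';
+import mammoth from 'mammoth';
+import WordExtractor from 'word-extractor';
+import TurndownService from 'turndown';
+
+const turndown = new TurndownService();
+
+async function wordFileToMarkdown(filePath: string): Promise<string> {
+  // The filename becomes an H1 so the chunker gets a proper heading hierarchy.
+  const title = `# ${path.basename(filePath, path.extname(filePath))}\n\n`;
+
+  if (filePath.toLowerCase().endsWith('.docx')) {
+    // Modern format: DOCX -> HTML (formatting preserved) -> Markdown via turndown.
+    const { value: html } = await mammoth.convertToHtml({ path: filePath });
+    return title + turndown.turndown(html).replace(/\n{3,}/g, '\n\n').trim();
+  }
+
+  // Legacy format: word-extractor returns plain text only.
+  const extractor = new WordExtractor();
+  const doc = await extractor.extract(filePath);
+  return title + doc.getBody().replace(/\r\n/g, '\n').replace(/\n{3,}/g, '\n\n').trim();
+}
+```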
+ +### Supported Formats + +| Extension | Format | Library Used | +|-----------|--------|--------------| +| `.doc` | Legacy Word (97-2003) | [word-extractor](https://github.com/morungos/node-word-extractor) | +| `.docx` | Modern Word (2007+) | [mammoth](https://github.com/mwilliamson/mammoth.js) | + +### Features + +* **Legacy .doc Support:** Extracts plain text from older Word documents using binary parsing +* **Modern .docx Support:** Converts DOCX files to HTML first (preserving formatting), then to clean Markdown +* **Formatting Preservation:** For `.docx` files, headings, lists, bold, italic, and links are preserved +* **Automatic Title:** Uses the filename as an H1 heading for proper document structure +* **Local File Support:** Processes Word files found in local directories alongside other documents + +### Configuration + +Include `.doc` and/or `.docx` in your `include_extensions` array: + +```yaml +- type: 'local_directory' + product_name: 'company-docs' + version: 'current' + path: './documents' + include_extensions: ['.doc', '.docx', '.pdf', '.md'] + recursive: true + max_size: 10485760 # 10MB recommended + database_config: + type: 'sqlite' + params: + db_path: './company-docs.db' +``` + +### Example Output + +A Word document named "meeting-notes.docx" will be converted to Markdown like: + +```markdown +# meeting-notes + +## Agenda + +1. Review Q4 results +2. Discuss roadmap + +## Action Items + +- **John:** Prepare budget report +- **Sarah:** Schedule follow-up meeting +``` + +### Notes + +* **`.doc` files:** Only plain text is extracted. Formatting like bold/italic is not preserved in legacy Word format. +* **`.docx` files:** Full formatting is preserved including headings, lists, bold, italic, links, and tables. +* **Embedded Images:** Images embedded in Word documents are not extracted (text-only). + ## Now Available via npx You can run `doc2vec` without cloning the repo or installing it globally. Just use: @@ -388,6 +450,7 @@ If you don't specify a config path, it will look for config.yaml in the current * Recursively scan directories for files matching the configured extensions. * Read file content, converting HTML to Markdown if needed. * For PDF files, extract text using Mozilla's PDF.js and convert to Markdown format with proper page structure. + * For Word documents, extract text from `.doc` files or convert `.docx` files to Markdown with formatting. * Process each file's content. - **For Zendesk:** * Fetch tickets and articles using the Zendesk API. 
@@ -404,6 +467,12 @@ If you don't specify a config path, it will look for config.yaml in the current ## Recent Changes +### Word Document Support +- Added support for legacy `.doc` files using the `word-extractor` library +- Added support for modern `.docx` files using the `mammoth` library +- DOCX files preserve formatting (headings, lists, bold, italic, links) +- Both formats are converted to clean Markdown for embedding + ### Page Reconstruction Support - Added `chunk_index` field to track each chunk's position within a page (0-based) - Added `total_chunks` field to indicate the total number of chunks per page diff --git a/content-processor.ts b/content-processor.ts index 811d0a0..d263584 100644 --- a/content-processor.ts +++ b/content-processor.ts @@ -530,6 +530,84 @@ export class ContentProcessor { this.markCodeParents(node.parentElement); } + private async convertDocToMarkdown(filePath: string, logger: Logger): Promise { + logger.debug(`Converting DOC to markdown: ${filePath}`); + + try { + // Dynamic import for word-extractor + const WordExtractor = (await import('word-extractor')).default; + const extractor = new WordExtractor(); + + const extracted = await extractor.extract(filePath); + const text = extracted.getBody(); + + // Create markdown with filename as title + let markdown = `# ${path.basename(filePath, '.doc')}\n\n`; + + // Clean up the text and add to markdown + const cleanedText = text + .replace(/\r\n/g, '\n') // Normalize line endings + .replace(/\n{3,}/g, '\n\n') // Remove excessive line breaks + .trim(); + + markdown += cleanedText; + + logger.debug(`Converted DOC to ${markdown.length} characters of markdown`); + return markdown; + + } catch (error) { + logger.error(`Failed to convert DOC ${filePath}:`, error); + throw error; + } + } + + private async convertDocxToMarkdown(filePath: string, logger: Logger): Promise { + logger.debug(`Converting DOCX to markdown: ${filePath}`); + + try { + // Dynamic import for mammoth + const mammoth = await import('mammoth'); + + const result = await mammoth.convertToHtml({ path: filePath }); + const html = result.value; + + // Log any warnings from mammoth + if (result.messages.length > 0) { + logger.debug(`Mammoth warnings: ${result.messages.map(m => m.message).join(', ')}`); + } + + // Create markdown with filename as title + let markdown = `# ${path.basename(filePath, '.docx')}\n\n`; + + // Convert HTML to Markdown using turndown + const cleanHtml = sanitizeHtml(html, { + allowedTags: [ + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'ul', 'ol', + 'li', 'b', 'i', 'strong', 'em', 'code', 'pre', + 'div', 'span', 'table', 'thead', 'tbody', 'tr', 'th', 'td', 'br' + ], + allowedAttributes: { + 'a': ['href'], + 'pre': ['class'], + 'code': ['class'] + } + }); + + const convertedContent = this.turndownService.turndown(cleanHtml); + markdown += convertedContent; + + // Clean up excessive line breaks + markdown = markdown.replace(/\n{3,}/g, '\n\n').trim(); + + logger.debug(`Converted DOCX to ${markdown.length} characters of markdown`); + return markdown; + + } catch (error) { + logger.error(`Failed to convert DOCX ${filePath}:`, error); + throw error; + } + } + private async convertPdfToMarkdown(filePath: string, logger: Logger): Promise { logger.debug(`Converting PDF to markdown: ${filePath}`); @@ -771,6 +849,14 @@ export class ContentProcessor { // Handle PDF files logger.debug(`Processing PDF file: ${filePath}`); processedContent = await this.convertPdfToMarkdown(filePath, logger); + } else if (extension === '.doc') { + // Handle legacy 
Word DOC files + logger.debug(`Processing DOC file: ${filePath}`); + processedContent = await this.convertDocToMarkdown(filePath, logger); + } else if (extension === '.docx') { + // Handle modern Word DOCX files + logger.debug(`Processing DOCX file: ${filePath}`); + processedContent = await this.convertDocxToMarkdown(filePath, logger); } else { // Handle text-based files content = fs.readFileSync(filePath, { encoding: encoding as BufferEncoding }); diff --git a/package-lock.json b/package-lock.json index aeb3d1a..07a2e77 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,12 +19,14 @@ "dotenv": "^16.3.1", "js-yaml": "^4.1.0", "jsdom": "^26.0.0", + "mammoth": "^1.11.0", "openai": "^4.20.1", "pdfjs-dist": "^5.3.31", "puppeteer": "^24.1.1", "sanitize-html": "^2.11.0", "sqlite-vec": "0.1.7-alpha.2", - "turndown": "^7.1.2" + "turndown": "^7.1.2", + "word-extractor": "^1.0.4" }, "bin": { "doc2vec": "dist/doc2vec.js" @@ -662,6 +664,15 @@ "@types/node": "*" } }, + "node_modules/@xmldom/xmldom": { + "version": "0.8.11", + "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.11.tgz", + "integrity": "sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw==", + "license": "MIT", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/abort-controller": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", @@ -903,6 +914,12 @@ "readable-stream": "^3.4.0" } }, + "node_modules/bluebird": { + "version": "3.4.7", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", + "integrity": "sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==", + "license": "MIT" + }, "node_modules/boolbase": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", @@ -1093,6 +1110,12 @@ "node": ">= 0.8" } }, + "node_modules/core-util-is": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", + "license": "MIT" + }, "node_modules/cosmiconfig": { "version": "9.0.0", "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", @@ -1307,6 +1330,12 @@ "node": ">=0.3.1" } }, + "node_modules/dingbat-to-unicode": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", + "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==", + "license": "BSD-2-Clause" + }, "node_modules/dom-serializer": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", @@ -1369,6 +1398,15 @@ "url": "https://dotenvx.com" } }, + "node_modules/duck": { + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", + "integrity": "sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==", + "license": "BSD", + "dependencies": { + "underscore": "^1.13.1" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -1876,6 +1914,12 @@ } ] }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": 
"sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "license": "MIT" + }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -1939,6 +1983,12 @@ "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==" }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "license": "MIT" + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -2035,11 +2085,73 @@ "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==" }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, + "node_modules/jszip/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/jszip/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, + "node_modules/jszip/node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lines-and-columns": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==" }, + "node_modules/lop": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/lop/-/lop-0.4.2.tgz", + "integrity": "sha512-RefILVDQ4DKoRZsJ4Pj22TxE3omDO47yFpkIBoDKzkqPRISs5U1cnAdg/5583YPkWPaLIYHOKRMQSvjFsO26cw==", + "license": "BSD-2-Clause", + "dependencies": { + "duck": 
"^0.1.12", + "option": "~0.2.1", + "underscore": "^1.13.1" + } + }, "node_modules/lru-cache": { "version": "7.18.3", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz", @@ -2054,6 +2166,45 @@ "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", "dev": true }, + "node_modules/mammoth": { + "version": "1.11.0", + "resolved": "https://registry.npmjs.org/mammoth/-/mammoth-1.11.0.tgz", + "integrity": "sha512-BcEqqY/BOwIcI1iR5tqyVlqc3KIaMRa4egSoK83YAVrBf6+yqdAAbtUcFDCWX8Zef8/fgNZ6rl4VUv+vVX8ddQ==", + "license": "BSD-2-Clause", + "dependencies": { + "@xmldom/xmldom": "^0.8.6", + "argparse": "~1.0.3", + "base64-js": "^1.5.1", + "bluebird": "~3.4.0", + "dingbat-to-unicode": "^1.0.1", + "jszip": "^3.7.1", + "lop": "^0.4.2", + "path-is-absolute": "^1.0.0", + "underscore": "^1.13.1", + "xmlbuilder": "^10.0.0" + }, + "bin": { + "mammoth": "bin/mammoth" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/mammoth/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "license": "MIT", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/mammoth/node_modules/sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==", + "license": "BSD-3-Clause" + }, "node_modules/math-intrinsics": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", @@ -2259,6 +2410,12 @@ "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" }, + "node_modules/option": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", + "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==", + "license": "BSD-2-Clause" + }, "node_modules/pac-proxy-agent": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz", @@ -2289,6 +2446,12 @@ "node": ">= 14" } }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", + "license": "(MIT AND Zlib)" + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -2356,6 +2519,15 @@ "url": "https://github.com/inikulin/parse5?sponsor=1" } }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/pdfjs-dist": { "version": "5.3.31", "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.3.31.tgz", @@ -2430,6 +2602,12 @@ "node": ">=10" } }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": 
"https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", + "license": "MIT" + }, "node_modules/progress": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", @@ -2621,6 +2799,12 @@ "node": ">=10" } }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", + "license": "MIT" + }, "node_modules/simple-concat": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", @@ -3000,6 +3184,12 @@ "node": ">=14.17" } }, + "node_modules/underscore": { + "version": "1.13.7", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", + "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==", + "license": "MIT" + }, "node_modules/undici": { "version": "5.28.5", "resolved": "https://registry.npmjs.org/undici/-/undici-5.28.5.tgz", @@ -3079,6 +3269,28 @@ "webidl-conversions": "^3.0.0" } }, + "node_modules/word-extractor": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/word-extractor/-/word-extractor-1.0.4.tgz", + "integrity": "sha512-PyAGZQ2gjnVA5kcZAOAxoYciCMaAvu0dbVlw/zxHphhy+3be8cDeYKHJPO8iedIM3Sx0arA/ugKTJyXhZNgo6g==", + "license": "MIT", + "dependencies": { + "saxes": "^5.0.1", + "yauzl": "^2.10.0" + } + }, + "node_modules/word-extractor/node_modules/saxes": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz", + "integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==", + "license": "ISC", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/wrap-ansi": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", @@ -3128,6 +3340,15 @@ "node": ">=18" } }, + "node_modules/xmlbuilder": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-10.1.1.tgz", + "integrity": "sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==", + "license": "MIT", + "engines": { + "node": ">=4.0" + } + }, "node_modules/xmlchars": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", diff --git a/package.json b/package.json index 58cffbb..2a96ecc 100644 --- a/package.json +++ b/package.json @@ -36,12 +36,14 @@ "dotenv": "^16.3.1", "js-yaml": "^4.1.0", "jsdom": "^26.0.0", + "mammoth": "^1.11.0", "openai": "^4.20.1", "pdfjs-dist": "^5.3.31", "puppeteer": "^24.1.1", "sanitize-html": "^2.11.0", "sqlite-vec": "0.1.7-alpha.2", - "turndown": "^7.1.2" + "turndown": "^7.1.2", + "word-extractor": "^1.0.4" }, "devDependencies": { "@types/better-sqlite3": "^7.6.12", From 9eee0a629c4efdfe8c2434b15cd826bfbcf8a426 Mon Sep 17 00:00:00 2001 From: Denis Jannot Date: Thu, 29 Jan 2026 14:30:03 +0100 Subject: [PATCH 5/5] Updating the MCP server Signed-off-by: Denis Jannot --- README.md | 2 + mcp/README.md | 42 ++++++++- mcp/package.json | 2 +- mcp/src/index.ts | 225 +++++++++++++++++++++++++++++++++++++++++++++- package-lock.json | 4 +- package.json | 2 +- 6 files changed, 268 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 
c9235cd..28f8de6 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ This project provides a configurable tool (`doc2vec`) to crawl specified website The primary goal is to prepare documentation content for Retrieval-Augmented Generation (RAG) systems or semantic search applications. +> **⚠️ Version 2.0.0 Breaking Change:** Version 2.0.0 introduced enhanced chunking with new metadata fields (`chunk_index` and `total_chunks`) that enable page reconstruction and improved chunk ordering. The database schema has changed, and databases created with versions prior to 2.0.0 use a different format. **If you're upgrading to version 2.0.0 or later, you should start with fresh databases** to take advantage of the new features. While the MCP server maintains backward compatibility for querying old databases, doc2vec itself will create databases in the new format. If you need to migrate existing data, consider re-running doc2vec on your sources to regenerate the databases with the enhanced chunking format. + ## Key Features * **Website Crawling:** Recursively crawls websites starting from a given base URL. diff --git a/mcp/README.md b/mcp/README.md index 4e7c8da..f208e5b 100644 --- a/mcp/README.md +++ b/mcp/README.md @@ -18,6 +18,25 @@ This is a Model Context Protocol (MCP) server that enables querying documentatio - OpenAI API key - Documentation stored in SQLite vector databases (using `sqlite-vec`) +## Backward Compatibility + +The MCP server is **backward compatible** with databases created using older versions of doc2vec that don't include the `chunk_index` and `total_chunks` columns. + +### Database Format Support + +**New Format (with `chunk_index` and `total_chunks`):** +- Full functionality including range filtering with `startIndex` and `endIndex` +- Chunk ordering and pagination support +- Metadata available for page reconstruction + +**Old Format (without `chunk_index` and `total_chunks`):** +- `query_documentation` tool works fully (uses `SELECT *` and handles optional fields) +- `get_chunks` tool works but returns all chunks for a document +- Range filtering (`startIndex`/`endIndex`) is gracefully ignored with a warning if requested +- No errors or failures - the server automatically detects missing columns and adapts + +The server automatically detects the database schema and adapts its queries accordingly. No migration or database updates are required to use older databases. + ## Environment Variables | Variable | Description | Default | @@ -239,15 +258,34 @@ kubectl apply -f service.yaml ## Using the MCP Server -The server implements a tool called `query-documentation` that can be used to query documentation. +The server implements two tools: +- `query_documentation` to search documentation +- `get_chunks` to retrieve specific chunks by file path and chunk index -### Tool Parameters +### query_documentation +**Parameters** - `queryText` (string, required): The natural language query to search for - `productName` (string, required): The name of the product documentation database to search within - `version` (string, optional): The specific version of the product documentation - `limit` (number, optional, default: 4): Maximum number of results to return +**Notes** +- Results include `chunk_index` and `total_chunks` when available, so clients can request neighboring chunks. 
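+
+For example, a client can use those two fields on a search hit to pull the surrounding chunks of the same page via `get_chunks` (described below). A minimal sketch, where `callTool` stands in for whatever invocation mechanism your MCP client provides:
+
+```typescript
+interface DocHit {
+  content: string;
+  url?: string;
+  chunk_index?: number;
+  total_chunks?: number;
+}
+
+async function expandHit(
+  callTool: (name: string, args: Record<string, unknown>) => Promise<unknown>,
+  productName: string,
+  hit: DocHit
+) {
+  // Old-format databases omit chunk_index/total_chunks; nothing to expand.
+  if (typeof hit.chunk_index !== 'number' || typeof hit.total_chunks !== 'number' || !hit.url) {
+    return hit.content;
+  }
+  // Fetch the previous and next chunk, clamped to the page bounds (indexes are 0-based, inclusive).
+  return callTool('get_chunks', {
+    productName,
+    filePath: hit.url,
+    startIndex: Math.max(0, hit.chunk_index - 1),
+    endIndex: Math.min(hit.total_chunks - 1, hit.chunk_index + 1),
+  });
+}
+```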
+
+### get_chunks
+
+**Parameters**
+- `productName` (string, required): The name of the product documentation database to search within
+- `filePath` (string, required): The document path (stored as `url` in the DB)
+- `startIndex` (number, optional): Start index of the chunk range to retrieve (0-based). If not provided, returns all chunks from the beginning
+- `endIndex` (number, optional): End index of the chunk range to retrieve (0-based, inclusive). If not provided, returns all chunks to the end
+- `version` (string, optional): The specific version of the product documentation
+
+**Notes**
+- Range filtering (`startIndex`/`endIndex`) requires the `chunk_index` column in the database. For older databases without this column, the tool will return all chunks and log a warning if range parameters are provided.
+- Results include `chunk_index` and `total_chunks` metadata when available (new format databases only).
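+
+**Example**
+
+A minimal sketch that retrieves chunks 2 through 5 of a document returned by `query_documentation`, assuming the same connected `client` as in the example above. The product name and file path are placeholders.
+
+```typescript
+// filePath must match the url stored for the document in the database
+const chunks = await client.callTool({
+  name: "get_chunks",
+  arguments: {
+    productName: "my-product",
+    filePath: "https://docs.example.com/guide/install",
+    startIndex: 1, // 0-based, so this is the second chunk
+    endIndex: 4,   // inclusive
+  },
+});
+console.log(chunks.content);
+```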
+
 
 ## Integration Examples
 
 ### Claude Desktop Configuration
diff --git a/mcp/package.json b/mcp/package.json
index 55a8ad7..c0ee757 100644
--- a/mcp/package.json
+++ b/mcp/package.json
@@ -1,6 +1,6 @@
 {
   "name": "sqlite-vec-mcp-server",
-  "version": "1.0.0",
+  "version": "2.0.0",
   "description": "MCP Server for querying documentation with sqlite-vec",
   "main": "build/index.js",
   "type": "module",
diff --git a/mcp/src/index.ts b/mcp/src/index.ts
index ecc12df..bb9dac5 100644
--- a/mcp/src/index.ts
+++ b/mcp/src/index.ts
@@ -83,6 +83,10 @@ export interface QueryResult {
     distance: number;
     content: string;
     url?: string;
+    section?: string;
+    heading_hierarchy?: string;
+    chunk_index?: number;
+    total_chunks?: number;
     embedding?: Float32Array | number[];
     [key: string]: unknown;
 }
@@ -196,16 +200,147 @@
 }
 
-async function queryDocumentation(queryText: string, productName: string, version?: string, limit: number = 4): Promise<{ distance: number, content: string, url?: string }[]> {
+async function queryDocumentation(
+    queryText: string,
+    productName: string,
+    version?: string,
+    limit: number = 4
+): Promise<{
+    distance: number;
+    content: string;
+    url?: string;
+    section?: string;
+    chunk_index?: number;
+    total_chunks?: number;
+}[]> {
     const queryEmbedding = await createEmbeddings(queryText);
     const results = queryCollection(queryEmbedding, { product_name: productName, version: version }, limit);
     return results.map((qr: QueryResult) => ({
         distance: qr.distance,
         content: qr.content,
         ...(qr.url && { url: qr.url }),
+        ...(qr.section && { section: qr.section }),
+        ...(typeof qr.chunk_index === 'number' && { chunk_index: qr.chunk_index }),
+        ...(typeof qr.total_chunks === 'number' && { total_chunks: qr.total_chunks }),
     }));
 }
 
+function getChunksForDocument(
+    productName: string,
+    filePath: string,
+    startIndex?: number,
+    endIndex?: number,
+    version?: string
+): QueryResult[] {
+    const dbPath = path.join(dbDir, `${productName}.db`);
+
+    if (!fs.existsSync(dbPath)) {
+        throw new Error(`Database file not found at ${dbPath}`);
+    }
+
+    let db: DatabaseType | null = null;
+    try {
+        db = new Database(dbPath);
+        sqliteVec.load(db);
+
+        const hasRange = typeof startIndex === 'number' && typeof endIndex === 'number';
+
+        // Try to build and execute query with chunk_index/total_chunks first
+        // If it fails, retry without those columns (backward compatibility)
+        let selectColumns = [
+            'chunk_id',
+            'content',
+            'url',
+            'section',
+            'heading_hierarchy',
+            'chunk_index',
+            'total_chunks'
+        ];
+
+        let query = `
+            SELECT
+                ${selectColumns.join(', ')}
+            FROM vec_items
+            WHERE url = ?`;
+
+        if (version) query += ` AND version = ?`;
+        if (hasRange) {
+            query += ` AND chunk_index >= ? AND chunk_index <= ?`;
+        }
+
+        query += `
+            ORDER BY chunk_index;`;
+
+        let stmt;
+        let params: (string | number)[] = [filePath];
+        if (version) params.push(version);
+        if (hasRange) {
+            params.push(startIndex);
+            params.push(endIndex);
+        }
+
+        try {
+            stmt = db.prepare(query);
+            const rows = stmt.all(...params) as QueryResult[];
+            return rows;
+        } catch (error: any) {
+            // If query fails due to missing chunk_index column, retry without it
+            const errorMessage = error?.message || String(error);
+            const errorStr = String(error);
+            const isChunkIndexError = (errorMessage.includes('no such column') && errorMessage.includes('chunk_index')) ||
+                (errorStr.includes('no such column') && errorStr.includes('chunk_index'));
+
+            if (isChunkIndexError) {
+                console.error(`Warning: chunk_index column doesn't exist in database. Using backward compatible query.`);
+
+                if (hasRange) {
+                    console.error(`Warning: startIndex/endIndex provided but chunk_index column doesn't exist. Ignoring range filter.`);
+                }
+
+                // Build query without chunk_index/total_chunks
+                selectColumns = [
+                    'chunk_id',
+                    'content',
+                    'url',
+                    'section',
+                    'heading_hierarchy'
+                ];
+
+                query = `
+                    SELECT
+                        ${selectColumns.join(', ')}
+                    FROM vec_items
+                    WHERE url = ?`;
+
+                if (version) query += ` AND version = ?`;
+                query += `;`;
+
+                params = [filePath];
+                if (version) params.push(version);
+
+                try {
+                    stmt = db.prepare(query);
+                    const rows = stmt.all(...params) as QueryResult[];
+                    return rows;
+                } catch (retryError: any) {
+                    // If retry also fails, throw the original error
+                    throw error;
+                }
+            } else {
+                // Re-throw if it's a different error
+                throw error;
+            }
+        }
+    } catch (error) {
+        console.error(`Error retrieving chunks in ${dbPath}:`, error);
+        throw new Error(`Chunk retrieval failed: ${error instanceof Error ? error.message : String(error)}`);
+    } finally {
+        if (db) {
+            db.close();
+        }
+    }
+}
+
 // --- MCP Server Setup ---
 const serverName = "sqlite-vec-doc-query"; // Store name for logging
 const serverVersion = "1.0.0"; // Store version for logging
@@ -217,7 +352,17 @@ const server = new McpServer({
 });
 
 // --- Define the MCP Tool Logic ---
-const queryDocumentationToolHandler = async ({ queryText, productName, version, limit }: { queryText: string; productName: string; version?: string; limit: number }) => {
+const queryDocumentationToolHandler = async ({
+    queryText,
+    productName,
+    version,
+    limit,
+}: {
+    queryText: string;
+    productName: string;
+    version?: string;
+    limit: number;
+}) => {
     console.error(`Received query: text="${queryText}", product="${productName}", version="${version || 'any'}", limit=${limit}`);
 
     try {
@@ -235,6 +380,9 @@ const queryDocumentationToolHandler = async ({ queryText, productName, version,
                 ` Content: ${r.content}`,
                 ` Distance: ${r.distance.toFixed(4)}`,
                 r.url ? ` URL: ${r.url}` : null,
+                typeof r.chunk_index === 'number' && typeof r.total_chunks === 'number'
+                    ? ` Chunk: ${r.chunk_index + 1} of ${r.total_chunks}`
+                    : null,
                 "---"
             ].filter(line => line !== null).join("\n")
         ).join("\n");
@@ -253,6 +401,51 @@
     }
 };
 
+const getChunksToolHandler = async ({
+    productName,
+    filePath,
+    startIndex,
+    endIndex,
+    version,
+}: {
+    productName: string;
+    filePath: string;
+    startIndex?: number;
+    endIndex?: number;
+    version?: string;
+}) => {
+    console.error(`Received get_chunks: filePath="${filePath}", product="${productName}", version="${version || 'any'}", startIndex=${startIndex}, endIndex=${endIndex}`);
+
+    try {
+        const results = getChunksForDocument(productName, filePath, startIndex, endIndex, version);
+
+        if (results.length === 0) {
+            return {
+                content: [{ type: "text" as const, text: `No chunks found for "${filePath}" in product "${productName}" ${version ? `(version ${version})` : ''}.` }],
+            };
+        }
+
+        const formattedResults = results.map((r) =>
+            [
+                `Chunk ${typeof r.chunk_index === 'number' && typeof r.total_chunks === 'number' ? `${r.chunk_index + 1} of ${r.total_chunks}` : ''}`.trim(),
+                ` Content: ${r.content}`,
+                r.section ? ` Section: ${r.section}` : null,
+                r.url ? ` URL: ${r.url}` : null,
+                "---"
+            ].filter(line => line !== null).join("\n")
+        ).join("\n");
+
+        return {
+            content: [{ type: "text" as const, text: `Retrieved ${results.length} chunk(s) for "${filePath}":\n\n${formattedResults}` }],
+        };
+    } catch (error: any) {
+        console.error("Error processing 'get_chunks' tool:", error);
+        return {
+            content: [{ type: "text" as const, text: `Error retrieving chunks: ${error.message}` }],
+        };
+    }
+};
+
 // --- Define the MCP Tool ---
 server.tool(
     "query_documentation",
     "Query documentation stored in a sqlite-vec database using vector search.",
     {
@@ -266,6 +459,19 @@
     queryDocumentationToolHandler
 );
 
+server.tool(
+    "get_chunks",
+    "Retrieve specific chunks from a document by file path.",
+    {
+        productName: z.string().min(1).describe("The name of the product documentation database to search within (e.g., 'my-product'). Corresponds to the DB filename without .db."),
+        filePath: z.string().min(1).describe("The file path (url) of the document to retrieve chunks from."),
+        startIndex: z.number().int().nonnegative().optional().describe("Start index of the chunk range to retrieve (0-based). If not provided, returns all chunks from the beginning."),
+        endIndex: z.number().int().nonnegative().optional().describe("End index of the chunk range to retrieve (0-based, inclusive). If not provided, returns all chunks to the end."),
+        version: z.string().optional().describe("The specific version of the product documentation (e.g., '1.2.0'). Optional."),
+    },
+    getChunksToolHandler
+);
+
 // --- Transport Setup ---
 async function main() {
     const transport_type = process.env.TRANSPORT_TYPE || 'http';
@@ -428,7 +634,7 @@
                 },
             });
-            // Add the query_documentation tool to this server instance using the shared handler
+            // Add tools to this server instance using shared handlers
             sessionServer.tool(
                 "query_documentation",
                 "Query documentation stored in a sqlite-vec database using vector search.",
                 {
@@ -440,6 +646,19 @@
                 },
                 queryDocumentationToolHandler
             );
+
+            sessionServer.tool(
+                "get_chunks",
+                "Retrieve specific chunks from a document by file path.",
+                {
+                    productName: z.string().min(1).describe("The name of the product documentation database to search within (e.g., 'my-product'). Corresponds to the DB filename without .db."),
+                    filePath: z.string().min(1).describe("The file path (url) of the document to retrieve chunks from."),
+                    startIndex: z.number().int().nonnegative().optional().describe("Start index of the chunk range to retrieve (0-based). If not provided, returns all chunks from the beginning."),
+                    endIndex: z.number().int().nonnegative().optional().describe("End index of the chunk range to retrieve (0-based, inclusive). If not provided, returns all chunks to the end."),
+                    version: z.string().optional().describe("The specific version of the product documentation (e.g., '1.2.0'). Optional."),
+                },
+                getChunksToolHandler
+            );
 
             transport = new StreamableHTTPServerTransport({
                 sessionIdGenerator: () => randomUUID(),
diff --git a/package-lock.json b/package-lock.json
index 07a2e77..0249887 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "doc2vec",
-  "version": "1.3.0",
+  "version": "2.0.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "doc2vec",
-      "version": "1.3.0",
+      "version": "2.0.0",
       "license": "ISC",
       "dependencies": {
         "@mozilla/readability": "^0.4.4",
diff --git a/package.json b/package.json
index 2a96ecc..b796ed5 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "doc2vec",
-  "version": "1.3.0",
+  "version": "2.0.0",
   "type": "commonjs",
   "description": "",
   "main": "dist/doc2vec.js",