From cc2ae4205cdd404d89b593a258bb3ecedb9ca10d Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 12:55:36 -0700 Subject: [PATCH 1/3] init --- README.md | 4 +- src/index.ts | 2 +- src/types.ts | 8 +- tests/integration/processedContent.test.ts | 86 ++++++++++++++++++++++ 4 files changed, 95 insertions(+), 5 deletions(-) create mode 100644 tests/integration/processedContent.test.ts diff --git a/README.md b/README.md index 7f1908f..6785251 100644 --- a/README.md +++ b/README.md @@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult` object: ```typescript interface ExtractorResult { data: T; // Extracted structured data - markdown: string; // The markdown content that was processed usage: { // Token usage statistics inputTokens?: number; outputTokens?: number; - } + }; + processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion) } ``` diff --git a/src/index.ts b/src/index.ts index 190763f..76b24b5 100644 --- a/src/index.ts +++ b/src/index.ts @@ -100,7 +100,7 @@ export async function extract( // Return the full result return { data, - markdown: content, + processedContent: content, usage, }; } diff --git a/src/types.ts b/src/types.ts index 64ed4b2..7634b8c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -100,8 +100,12 @@ export interface ExtractorResult { /** Extracted data according to the schema */ data: T; - /** Raw markdown content that was processed */ - markdown: string; + /** + * Processed content that was sent to the LLM. + * This will be markdown if the input was HTML (after conversion), + * or the original content if the input was already markdown or plain text. 
+ */ + processedContent: string; /** Usage statistics */ usage: Usage; diff --git a/tests/integration/processedContent.test.ts b/tests/integration/processedContent.test.ts new file mode 100644 index 0000000..5817454 --- /dev/null +++ b/tests/integration/processedContent.test.ts @@ -0,0 +1,86 @@ +import { z } from "zod"; +import { extract, ContentFormat, LLMProvider } from "../../src"; + +describe("ProcessedContent Integration Tests", () => { + const simpleSchema = z.object({ + title: z.string(), + content: z.string().nullable(), + }); + + // Skip tests if API keys are not available + const skipIfNoKeys = () => { + if (!process.env.OPENAI_API_KEY) { + return true; + } + return false; + }; + + it("should return original content as processedContent for TXT format", async () => { + if (skipIfNoKeys()) { + console.log("Skipping test: No API keys available"); + return; + } + + const plainTextContent = + "Title: Simple Test\n\nThis is a test of plain text extraction."; + + const result = await extract({ + content: plainTextContent, + format: ContentFormat.TXT, + schema: simpleSchema, + provider: LLMProvider.OPENAI, + openaiApiKey: process.env.OPENAI_API_KEY, + }); + + // Verify the processedContent is the same as the original content + expect(result.processedContent).toBe(plainTextContent); + }, 30000); + + it("should return original content as processedContent for MARKDOWN format", async () => { + if (skipIfNoKeys()) { + console.log("Skipping test: No API keys available"); + return; + } + + const markdownContent = + "# Simple Test\n\nThis is a test of markdown extraction."; + + const result = await extract({ + content: markdownContent, + format: ContentFormat.MARKDOWN, + schema: simpleSchema, + provider: LLMProvider.OPENAI, + openaiApiKey: process.env.OPENAI_API_KEY, + }); + + // Verify the processedContent is the same as the original content + expect(result.processedContent).toBe(markdownContent); + }, 30000); + + it("should return converted markdown as processedContent 
for HTML format", async () => { + if (skipIfNoKeys()) { + console.log("Skipping test: No API keys available"); + return; + } + + const htmlContent = + "

Simple Test

This is a test of HTML extraction.

"; + + const result = await extract({ + content: htmlContent, + format: ContentFormat.HTML, + schema: simpleSchema, + provider: LLMProvider.OPENAI, + openaiApiKey: process.env.OPENAI_API_KEY, + sourceUrl: "https://example.com", + }); + + // For HTML, processedContent should be the converted markdown + expect(result.processedContent).toContain("Simple Test"); + expect(result.processedContent).toContain( + "This is a test of HTML extraction." + ); + expect(result.processedContent).not.toContain("

"); + expect(result.processedContent).not.toContain("

"); + }, 30000); +}); From 8fc32b6547c7445ddb72f9d6eb927d5ee4522438 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 12:57:08 -0700 Subject: [PATCH 2/3] minor --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6785251..b0c5d9f 100644 --- a/README.md +++ b/README.md @@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult` object: ```typescript interface ExtractorResult { data: T; // Extracted structured data + processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion) usage: { // Token usage statistics inputTokens?: number; outputTokens?: number; }; - processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion) } ``` From a9e8124ca4ebdc4845c328c854e47daa1aece117 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 12:59:21 -0700 Subject: [PATCH 3/3] fix example test --- src/example.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/example.ts b/src/example.ts index 7f8da99..420ff6b 100644 --- a/src/example.ts +++ b/src/example.ts @@ -71,7 +71,7 @@ async function example() { console.log(JSON.stringify(result.data, null, 2)); console.log("\nMarkdown Content:"); - console.log(result.markdown); + console.log(result.processedContent); console.log("\nToken Usage:"); console.log(result.usage);