From cc2ae4205cdd404d89b593a258bb3ecedb9ca10d Mon Sep 17 00:00:00 2001
From: Andrew Zhong <axzhong3@gmail.com>
Date: Tue, 13 May 2025 12:55:36 -0700
Subject: [PATCH 1/3] Rename ExtractorResult.markdown to processedContent

---
 README.md                                  |  4 +-
 src/index.ts                               |  2 +-
 src/types.ts                               |  8 +-
 tests/integration/processedContent.test.ts | 86 ++++++++++++++++++++++
 4 files changed, 95 insertions(+), 5 deletions(-)
 create mode 100644 tests/integration/processedContent.test.ts
diff --git a/README.md b/README.md
index 7f1908f..6785251 100644
--- a/README.md
+++ b/README.md
@@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult<T>` object:
 ```typescript
 interface ExtractorResult<T> {
   data: T;             // Extracted structured data
-  markdown: string;    // The markdown content that was processed
   usage: {             // Token usage statistics
     inputTokens?: number;
     outputTokens?: number;
-  }
+  };
+  processedContent: string;    // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
 }
 ```
 
diff --git a/src/index.ts b/src/index.ts
index 190763f..76b24b5 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -100,7 +100,7 @@ export async function extract<T extends z.ZodTypeAny>(
   // Return the full result
   return {
     data,
-    markdown: content,
+    processedContent: content,
     usage,
   };
 }
diff --git a/src/types.ts b/src/types.ts
index 64ed4b2..7634b8c 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -100,8 +100,12 @@ export interface ExtractorResult<T> {
   /** Extracted data according to the schema */
   data: T;
 
-  /** Raw markdown content that was processed */
-  markdown: string;
+  /**
+   * Processed content that was sent to the LLM.
+   * This will be markdown if the input was HTML (after conversion),
+   * or the original content if the input was already markdown or plain text.
+   */
+  processedContent: string;
 
   /** Usage statistics */
   usage: Usage;
diff --git a/tests/integration/processedContent.test.ts b/tests/integration/processedContent.test.ts
new file mode 100644
index 0000000..5817454
--- /dev/null
+++ b/tests/integration/processedContent.test.ts
@@ -0,0 +1,86 @@
+import { z } from "zod";
+import { extract, ContentFormat, LLMProvider } from "../../src";
+
+describe("ProcessedContent Integration Tests", () => {
+  const simpleSchema = z.object({
+    title: z.string(),
+    content: z.string().nullable(),
+  });
+
+  // Skip tests if API keys are not available
+  const skipIfNoKeys = () => {
+    if (!process.env.OPENAI_API_KEY) {
+      return true;
+    }
+    return false;
+  };
+
+  it("should return original content as processedContent for TXT format", async () => {
+    if (skipIfNoKeys()) {
+      console.log("Skipping test: No API keys available");
+      return;
+    }
+
+    const plainTextContent =
+      "Title: Simple Test\n\nThis is a test of plain text extraction.";
+
+    const result = await extract({
+      content: plainTextContent,
+      format: ContentFormat.TXT,
+      schema: simpleSchema,
+      provider: LLMProvider.OPENAI,
+      openaiApiKey: process.env.OPENAI_API_KEY,
+    });
+
+    // Verify the processedContent is the same as the original content
+    expect(result.processedContent).toBe(plainTextContent);
+  }, 30000);
+
+  it("should return original content as processedContent for MARKDOWN format", async () => {
+    if (skipIfNoKeys()) {
+      console.log("Skipping test: No API keys available");
+      return;
+    }
+
+    const markdownContent =
+      "# Simple Test\n\nThis is a test of markdown extraction.";
+
+    const result = await extract({
+      content: markdownContent,
+      format: ContentFormat.MARKDOWN,
+      schema: simpleSchema,
+      provider: LLMProvider.OPENAI,
+      openaiApiKey: process.env.OPENAI_API_KEY,
+    });
+
+    // Verify the processedContent is the same as the original content
+    expect(result.processedContent).toBe(markdownContent);
+  }, 30000);
+
+  it("should return converted markdown as processedContent for HTML format", async () => {
+    if (skipIfNoKeys()) {
+      console.log("Skipping test: No API keys available");
+      return;
+    }
+
+    const htmlContent =
+      "<h1>Simple Test</h1><p>This is a test of HTML extraction.</p>";
+
+    const result = await extract({
+      content: htmlContent,
+      format: ContentFormat.HTML,
+      schema: simpleSchema,
+      provider: LLMProvider.OPENAI,
+      openaiApiKey: process.env.OPENAI_API_KEY,
+      sourceUrl: "https://example.com",
+    });
+
+    // For HTML, processedContent should be the converted markdown
+    expect(result.processedContent).toContain("Simple Test");
+    expect(result.processedContent).toContain(
+      "This is a test of HTML extraction."
+    );
+    expect(result.processedContent).not.toContain("<h1>");
+    expect(result.processedContent).not.toContain("</p>");
+  }, 30000);
+});

From 8fc32b6547c7445ddb72f9d6eb927d5ee4522438 Mon Sep 17 00:00:00 2001
From: Andrew Zhong <axzhong3@gmail.com>
Date: Tue, 13 May 2025 12:57:08 -0700
Subject: [PATCH 2/3] Move processedContent above usage in README example

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6785251..b0c5d9f 100644
--- a/README.md
+++ b/README.md
@@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult<T>` object:
 ```typescript
 interface ExtractorResult<T> {
   data: T;             // Extracted structured data
+  processedContent: string;    // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
   usage: {             // Token usage statistics
     inputTokens?: number;
     outputTokens?: number;
   };
-  processedContent: string;    // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
 }
 ```
 

From a9e8124ca4ebdc4845c328c854e47daa1aece117 Mon Sep 17 00:00:00 2001
From: Andrew Zhong <axzhong3@gmail.com>
Date: Tue, 13 May 2025 12:59:21 -0700
Subject: [PATCH 3/3] fix example test

---
 src/example.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/example.ts b/src/example.ts
index 7f8da99..420ff6b 100644
--- a/src/example.ts
+++ b/src/example.ts
@@ -71,7 +71,7 @@ async function example() {
     console.log(JSON.stringify(result.data, null, 2));
 
     console.log("\nMarkdown Content:");
-    console.log(result.markdown);
+    console.log(result.processedContent);
 
     console.log("\nToken Usage:");
     console.log(result.usage);