From cc2ae4205cdd404d89b593a258bb3ecedb9ca10d Mon Sep 17 00:00:00 2001
From: Andrew Zhong
Date: Tue, 13 May 2025 12:55:36 -0700
Subject: [PATCH 1/3] init
---
README.md | 4 +-
src/index.ts | 2 +-
src/types.ts | 8 +-
tests/integration/processedContent.test.ts | 86 ++++++++++++++++++++++
4 files changed, 95 insertions(+), 5 deletions(-)
create mode 100644 tests/integration/processedContent.test.ts
diff --git a/README.md b/README.md
index 7f1908f..6785251 100644
--- a/README.md
+++ b/README.md
@@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult` object:
```typescript
interface ExtractorResult {
data: T; // Extracted structured data
- markdown: string; // The markdown content that was processed
usage: { // Token usage statistics
inputTokens?: number;
outputTokens?: number;
- }
+ };
+ processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
}
```
diff --git a/src/index.ts b/src/index.ts
index 190763f..76b24b5 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -100,7 +100,7 @@ export async function extract(
// Return the full result
return {
data,
- markdown: content,
+ processedContent: content,
usage,
};
}
diff --git a/src/types.ts b/src/types.ts
index 64ed4b2..7634b8c 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -100,8 +100,12 @@ export interface ExtractorResult {
/** Extracted data according to the schema */
data: T;
- /** Raw markdown content that was processed */
- markdown: string;
+ /**
+ * Processed content that was sent to the LLM.
+ * This will be markdown if the input was HTML (after conversion),
+ * or the original content if the input was already markdown or plain text.
+ */
+ processedContent: string;
/** Usage statistics */
usage: Usage;
diff --git a/tests/integration/processedContent.test.ts b/tests/integration/processedContent.test.ts
new file mode 100644
index 0000000..5817454
--- /dev/null
+++ b/tests/integration/processedContent.test.ts
@@ -0,0 +1,86 @@
+import { z } from "zod";
+import { extract, ContentFormat, LLMProvider } from "../../src";
+
+describe("ProcessedContent Integration Tests", () => {
+ const simpleSchema = z.object({
+ title: z.string(),
+ content: z.string().nullable(),
+ });
+
+ // Skip tests if API keys are not available
+ const skipIfNoKeys = () => {
+ if (!process.env.OPENAI_API_KEY) {
+ return true;
+ }
+ return false;
+ };
+
+ it("should return original content as processedContent for TXT format", async () => {
+ if (skipIfNoKeys()) {
+ console.log("Skipping test: No API keys available");
+ return;
+ }
+
+ const plainTextContent =
+ "Title: Simple Test\n\nThis is a test of plain text extraction.";
+
+ const result = await extract({
+ content: plainTextContent,
+ format: ContentFormat.TXT,
+ schema: simpleSchema,
+ provider: LLMProvider.OPENAI,
+ openaiApiKey: process.env.OPENAI_API_KEY,
+ });
+
+ // Verify the processedContent is the same as the original content
+ expect(result.processedContent).toBe(plainTextContent);
+ }, 30000);
+
+ it("should return original content as processedContent for MARKDOWN format", async () => {
+ if (skipIfNoKeys()) {
+ console.log("Skipping test: No API keys available");
+ return;
+ }
+
+ const markdownContent =
+ "# Simple Test\n\nThis is a test of markdown extraction.";
+
+ const result = await extract({
+ content: markdownContent,
+ format: ContentFormat.MARKDOWN,
+ schema: simpleSchema,
+ provider: LLMProvider.OPENAI,
+ openaiApiKey: process.env.OPENAI_API_KEY,
+ });
+
+ // Verify the processedContent is the same as the original content
+ expect(result.processedContent).toBe(markdownContent);
+ }, 30000);
+
+ it("should return converted markdown as processedContent for HTML format", async () => {
+ if (skipIfNoKeys()) {
+ console.log("Skipping test: No API keys available");
+ return;
+ }
+
+ const htmlContent =
+ "<h1>Simple Test</h1><p>This is a test of HTML extraction.</p>";
+
+ const result = await extract({
+ content: htmlContent,
+ format: ContentFormat.HTML,
+ schema: simpleSchema,
+ provider: LLMProvider.OPENAI,
+ openaiApiKey: process.env.OPENAI_API_KEY,
+ sourceUrl: "https://example.com",
+ });
+
+ // For HTML, processedContent should be the converted markdown
+ expect(result.processedContent).toContain("Simple Test");
+ expect(result.processedContent).toContain(
+ "This is a test of HTML extraction."
+ );
+ expect(result.processedContent).not.toContain("<h1>");
+ expect(result.processedContent).not.toContain("</h1>");
+ }, 30000);
+});
From 8fc32b6547c7445ddb72f9d6eb927d5ee4522438 Mon Sep 17 00:00:00 2001
From: Andrew Zhong
Date: Tue, 13 May 2025 12:57:08 -0700
Subject: [PATCH 2/3] minor
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 6785251..b0c5d9f 100644
--- a/README.md
+++ b/README.md
@@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult` object:
```typescript
interface ExtractorResult {
data: T; // Extracted structured data
+ processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
usage: { // Token usage statistics
inputTokens?: number;
outputTokens?: number;
};
- processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
}
```
From a9e8124ca4ebdc4845c328c854e47daa1aece117 Mon Sep 17 00:00:00 2001
From: Andrew Zhong
Date: Tue, 13 May 2025 12:59:21 -0700
Subject: [PATCH 3/3] fix example test
---
src/example.ts | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/example.ts b/src/example.ts
index 7f8da99..420ff6b 100644
--- a/src/example.ts
+++ b/src/example.ts
@@ -71,7 +71,7 @@ async function example() {
console.log(JSON.stringify(result.data, null, 2));
console.log("\nMarkdown Content:");
- console.log(result.markdown);
+ console.log(result.processedContent);
console.log("\nToken Usage:");
console.log(result.usage);