Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -302,11 +302,11 @@ The function returns a Promise that resolves to an `ExtractorResult<T>` object:
```typescript
interface ExtractorResult<T> {
data: T; // Extracted structured data
markdown: string; // The markdown content that was processed
processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTML (after conversion)
usage: { // Token usage statistics
inputTokens?: number;
outputTokens?: number;
}
};
}
```

Expand Down
2 changes: 1 addition & 1 deletion src/example.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ async function example() {
console.log(JSON.stringify(result.data, null, 2));

console.log("\nMarkdown Content:");
console.log(result.markdown);
console.log(result.processedContent);

console.log("\nToken Usage:");
console.log(result.usage);
Expand Down
2 changes: 1 addition & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ export async function extract<T extends z.ZodTypeAny>(
// Return the full result
return {
data,
markdown: content,
processedContent: content,
usage,
};
}
Expand Down
8 changes: 6 additions & 2 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,12 @@ export interface ExtractorResult<T> {
/** Extracted data according to the schema */
data: T;

/** Raw markdown content that was processed */
markdown: string;
/**
* Processed content that was sent to the LLM.
* This will be markdown if the input was HTML (after conversion),
* or the original content if the input was already markdown or plain text.
*/
processedContent: string;

/** Usage statistics */
usage: Usage;
Expand Down
86 changes: 86 additions & 0 deletions tests/integration/processedContent.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import { z } from "zod";
import { extract, ContentFormat, LLMProvider } from "../../src";

/**
 * Integration tests for the `processedContent` field of the result returned
 * by `extract`.
 *
 * Every test passes `process.env.OPENAI_API_KEY` to `extract` with the OpenAI
 * provider. When that variable is not set the tests are registered with
 * `it.skip`, so the test report shows them as skipped instead of silently
 * passing without running any assertion.
 */
describe("ProcessedContent Integration Tests", () => {
  const simpleSchema = z.object({
    title: z.string(),
    content: z.string().nullable(),
  });

  // Read the key once; it doubles as the "can these tests run?" switch.
  const openaiApiKey = process.env.OPENAI_API_KEY;

  // Run the tests only when an API key is available; otherwise mark them skipped.
  const itWithApiKey = openaiApiKey ? it : it.skip;

  // Per-test timeout in milliseconds (same value for every test in this suite).
  const TEST_TIMEOUT_MS = 30000;

  itWithApiKey(
    "should return original content as processedContent for TXT format",
    async () => {
      const plainTextContent =
        "Title: Simple Test\n\nThis is a test of plain text extraction.";

      const result = await extract({
        content: plainTextContent,
        format: ContentFormat.TXT,
        schema: simpleSchema,
        provider: LLMProvider.OPENAI,
        openaiApiKey,
      });

      // Verify the processedContent is the same as the original content
      expect(result.processedContent).toBe(plainTextContent);
    },
    TEST_TIMEOUT_MS
  );

  itWithApiKey(
    "should return original content as processedContent for MARKDOWN format",
    async () => {
      const markdownContent =
        "# Simple Test\n\nThis is a test of markdown extraction.";

      const result = await extract({
        content: markdownContent,
        format: ContentFormat.MARKDOWN,
        schema: simpleSchema,
        provider: LLMProvider.OPENAI,
        openaiApiKey,
      });

      // Verify the processedContent is the same as the original content
      expect(result.processedContent).toBe(markdownContent);
    },
    TEST_TIMEOUT_MS
  );

  itWithApiKey(
    "should return converted markdown as processedContent for HTML format",
    async () => {
      const htmlContent =
        "<h1>Simple Test</h1><p>This is a test of HTML extraction.</p>";

      const result = await extract({
        content: htmlContent,
        format: ContentFormat.HTML,
        schema: simpleSchema,
        provider: LLMProvider.OPENAI,
        openaiApiKey,
        sourceUrl: "https://example.com",
      });

      // For HTML, processedContent should keep the text but none of the HTML tags
      expect(result.processedContent).toContain("Simple Test");
      expect(result.processedContent).toContain(
        "This is a test of HTML extraction."
      );
      expect(result.processedContent).not.toContain("<h1>");
      expect(result.processedContent).not.toContain("</p>");
    },
    TEST_TIMEOUT_MS
  );
});