Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@ Use LLMs to **robustly** extract or enrich structured data from HTML and markdow
4. **URL Validation**: All extracted URLs are validated - handling relative URLs, removing invalid ones, and repairing markdown-escaped links. See [URL Validation](#url-validation) section for details.

## Why use an LLM extractor?
💡 Can reason from context and return structured answers in addition to extracting content as-is

🔎 Can search from additional context and enrich existing data objects
💡 Understands natural language criteria and context to extract the data you need, not just raw content as displayed

⚡️ No need to manually create custom scraper code for each site

Expand Down Expand Up @@ -203,6 +201,9 @@ const result = await extract({
});
```

> [!WARNING]
> OpenAI models do not support optional schema fields. In schemas used with OpenAI, replace `.optional()` with `.nullable()`.

### Extracting from Main HTML

For blog posts or articles with lots of navigation elements, headers, and footers, you can use the `extractMainHtml` option to focus on just the main content:
Expand Down Expand Up @@ -377,13 +378,13 @@ const productSchema = z.object({
name: z.string(), // Required field
price: z.number().optional(), // Optional number
inStock: z.boolean().optional(),
category: z.string().optional()
category: z.string().optional(),
})
),
storeInfo: z.object({
name: z.string(),
location: z.string().optional(),
rating: z.number().optional()
rating: z.number().optional(),
})
});

Expand All @@ -394,14 +395,14 @@ const rawLLMOutput = {
id: 1,
name: "Laptop",
price: 999,
inStock: true
inStock: true,
}, // Valid product
{
id: 2,
name: "Headphones",
price: "N/A", // Non-convertible string for optional number
inStock: true,
category: "Audio"
category: "Audio",
},
{
id: 3,
Expand Down Expand Up @@ -433,24 +434,24 @@ const sanitizedData = safeSanitizedParser(productSchema, rawLLMOutput);
// id: 1,
// name: "Laptop",
// price: 999,
// inStock: true
// inStock: true,
// },
// {
// id: 2,
// name: "Headphones",
// inStock: true,
// category: "Audio"
// category: "Audio",
// },
// {
// id: 4,
// name: "Keyboard",
// price: 59.99,
// inStock: true
// inStock: true,
// }
// ],
// storeInfo: {
// name: "TechStore",
// location: "123 Main St"
// location: "123 Main St",
// }
// }
```
Expand Down
10 changes: 7 additions & 3 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 32 additions & 2 deletions src/dev/runLocalTest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@ const blogSchema = z.object({
content: z.string().optional(),
});

/**
 * Blog-post schema for the OpenAI test path.
 *
 * Fields that may be missing are declared with `.nullable()` rather than
 * `.optional()`; `title` and `summary` are plain required strings.
 */
const blogSchemaOpenAI = z.object({
  // Always required.
  title: z.string(),

  // May be null.
  author: z.string().nullable(),
  date: z.string().nullable(),

  // Nullable list of tag strings; the description is attached to the nullable wrapper.
  tags: z
    .array(z.string())
    .nullable()
    .describe("Tags appear after the date. Do not include the # symbol."),

  // Always required.
  summary: z.string(),

  // May be null.
  content: z.string().nullable(),
});

const productSchema = z.object({
products: z.array(
z.object({
Expand All @@ -40,6 +53,19 @@ const productSchema = z.object({
),
});

/**
 * Product-listing schema for the OpenAI test path.
 *
 * Each product requires `name` and `price` as strings; `rating`,
 * `description` and `features` use `.nullable()` rather than `.optional()`.
 */
const productSchemaOpenAI = z.object({
  products: z.array(
    z.object({
      // Always required.
      name: z.string(),
      price: z.string(),

      // May be null.
      rating: z.string().nullable(),
      description: z.string().nullable(),
      features: z.array(z.string()).nullable(),
    })
  ),
});

// Test functions
async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
console.log(`Testing blog post extraction with ${provider}...`);
Expand All @@ -64,7 +90,8 @@ async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
const result = await extract({
content: html,
format: ContentFormat.HTML,
schema: blogSchema,
schema:
provider === LLMProvider.GOOGLE_GEMINI ? blogSchema : blogSchemaOpenAI,
provider,
googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined,
openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined,
Expand Down Expand Up @@ -109,7 +136,10 @@ async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
const result = await extract({
content: html,
format: ContentFormat.HTML,
schema: productSchema,
schema:
provider === LLMProvider.GOOGLE_GEMINI
? productSchema
: productSchemaOpenAI,
provider,
googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined,
openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined,
Expand Down
68 changes: 67 additions & 1 deletion src/utils/schemaUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
ZodOptional,
ZodTypeAny,
ZodString,
ZodNullable,
} from "zod";

/**
Expand Down Expand Up @@ -94,6 +95,21 @@ export function transformSchemaForLLM<T extends ZodTypeAny>(
});
}

// For nullable schemas, transform the inner schema
if (schema instanceof ZodNullable) {
const originalDef = { ...(schema as any)._def };
const transformedInner = transformSchemaForLLM(
schema.unwrap() as ZodTypeAny
);

// Create a new nullable with the same definition but transformed inner type
return new z.ZodNullable({
...originalDef,
innerType: transformedInner,
typeName: z.ZodFirstPartyTypeKind.ZodNullable,
});
}

// Return the original schema for all other types
return schema;
}
Expand Down Expand Up @@ -144,6 +160,11 @@ export function fixUrlEscapeSequences(data: any, schema: ZodTypeAny): any {
return fixUrlEscapeSequences(data, innerSchema);
}

if (schema instanceof ZodNullable) {
const innerSchema = schema.unwrap() as ZodTypeAny;
return fixUrlEscapeSequences(data, innerSchema);
}

return data;
}

Expand Down Expand Up @@ -172,6 +193,8 @@ export function safeSanitizedParser<T extends ZodTypeAny>(
return sanitizeArray(schema, rawObject);
} else if (schema instanceof ZodOptional) {
return sanitizeOptional(schema, rawObject);
} else if (schema instanceof ZodNullable) {
return sanitizeNullable(schema, rawObject);
} else {
// For primitive values, try to parse directly
return schema.parse(rawObject);
Expand Down Expand Up @@ -215,6 +238,18 @@ function sanitizeObject(schema: ZodObject<any>, rawObject: unknown): any {
result[key] = sanitized;
}
// If sanitization fails, just skip the optional property
} else if (propertySchema instanceof ZodNullable) {
// For nullable properties, try to sanitize or set to null
try {
const sanitized = safeSanitizedParser(
propertySchema as ZodTypeAny,
rawObjectRecord[key]
);
result[key] = sanitized;
} catch {
// If sanitization fails, set to null for nullable properties
result[key] = null;
}
} else {
// For required properties, try to sanitize and throw if it fails
const sanitized = safeSanitizedParser(
Expand Down Expand Up @@ -267,9 +302,40 @@ function sanitizeOptional(schema: ZodOptional<any>, rawValue: unknown): any {
try {
// Try to sanitize using the inner schema
const innerSchema = schema.unwrap();
return safeSanitizedParser(innerSchema, rawValue);
const parsed = safeSanitizedParser(innerSchema, rawValue);
// If the parsed value is not valid, return undefined for optional values
if (parsed === null) {
return undefined;
}
return parsed;
} catch {
// If sanitization fails, return undefined for optional values
return undefined;
}
}

/**
 * Sanitizes a raw value against a nullable Zod schema.
 *
 * A `null` input is returned as `null` without consulting the schema. Any
 * other input is sanitized against the wrapped inner schema via
 * `safeSanitizedParser`; if that throws, `null` is returned instead of
 * letting the error propagate.
 *
 * @param schema - Nullable schema whose inner type the value is sanitized against
 * @param rawValue - Value to sanitize
 * @returns The sanitized value, or `null` when the input is `null` or cannot be sanitized
 */
function sanitizeNullable(schema: ZodNullable<any>, rawValue: unknown): any {
  if (rawValue !== null) {
    try {
      // Whatever the inner sanitizer yields (including null) is the result.
      return safeSanitizedParser(schema.unwrap(), rawValue);
    } catch {
      // Fall through: an unsanitizable value degrades to null for nullable schemas.
    }
  }

  return null;
}
Loading