Skip to content

Commit 17fb6f4

Browse files
authored
Handle nullable (#9)
1 parent d2d601c commit 17fb6f4

File tree

6 files changed

+525
-23
lines changed

6 files changed

+525
-23
lines changed

README.md

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,7 @@ Use LLMs to **robustly** extract or enrich structured data from HTML and markdow
3939
4. **URL Validation**: All extracted URLs are validated - handling relative URLs, removing invalid ones, and repairing markdown-escaped links. See [URL Validation](#url-validation) section for details.
4040

4141
## Why use an LLM extractor?
42-
💡 Can reason from context and return structured answers in addition to extracting content as-is
43-
44-
🔎 Can search from additional context and enrich existing data objects
42+
💡 Understands natural language criteria and context to extract the data you need, not just raw content as displayed
4543

4644
⚡️ No need to manually create custom scraper code for each site
4745

@@ -203,6 +201,9 @@ const result = await extract({
203201
});
204202
```
205203

204+
> [!WARNING]
205+
> OpenAI models do not support optional schema fields. When using an OpenAI model, replace `.optional()` with `.nullable()` in your schema.
206+
206207
### Extracting from Main HTML
207208

208209
For blog posts or articles with lots of navigation elements, headers, and footers, you can use the `extractMainHtml` option to focus on just the main content:
@@ -377,13 +378,13 @@ const productSchema = z.object({
377378
name: z.string(), // Required field
378379
price: z.number().optional(), // Optional number
379380
inStock: z.boolean().optional(),
380-
category: z.string().optional()
381+
category: z.string().optional(),
381382
})
382383
),
383384
storeInfo: z.object({
384385
name: z.string(),
385386
location: z.string().optional(),
386-
rating: z.number().optional()
387+
rating: z.number().optional(),
387388
})
388389
});
389390

@@ -394,14 +395,14 @@ const rawLLMOutput = {
394395
id: 1,
395396
name: "Laptop",
396397
price: 999,
397-
inStock: true
398+
inStock: true,
398399
}, // Valid product
399400
{
400401
id: 2,
401402
name: "Headphones",
402403
price: "N/A", // Non-convertible string for optional number
403404
inStock: true,
404-
category: "Audio"
405+
category: "Audio",
405406
},
406407
{
407408
id: 3,
@@ -433,24 +434,24 @@ const sanitizedData = safeSanitizedParser(productSchema, rawLLMOutput);
433434
// id: 1,
434435
// name: "Laptop",
435436
// price: 999,
436-
// inStock: true
437+
// inStock: true,
437438
// },
438439
// {
439440
// id: 2,
440441
// name: "Headphones",
441442
// inStock: true,
442-
// category: "Audio"
443+
// category: "Audio",
443444
// },
444445
// {
445446
// id: 4,
446447
// name: "Keyboard",
447448
// price: 59.99,
448-
// inStock: true
449+
// inStock: true,
449450
// }
450451
// ],
451452
// storeInfo: {
452453
// name: "TechStore",
453-
// location: "123 Main St"
454+
// location: "123 Main St",
454455
// }
455456
// }
456457
```

package-lock.json

Lines changed: 7 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/dev/runLocalTest.ts

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,19 @@ const blogSchema = z.object({
2828
content: z.string().optional(),
2929
});
3030

31+
// OpenAI variant of the blog schema: uses .nullable() instead of .optional().
// testBlogExtraction passes this schema whenever the provider is not
// LLMProvider.GOOGLE_GEMINI.
32+
const blogSchemaOpenAI = z.object({
33+
title: z.string(),
34+
author: z.string().nullable(),
35+
date: z.string().nullable(),
36+
tags: z
37+
.array(z.string())
38+
.nullable()
39+
.describe("Tags appear after the date. Do not include the # symbol."),
40+
summary: z.string(),
41+
content: z.string().nullable(),
42+
});
43+
3144
const productSchema = z.object({
3245
products: z.array(
3346
z.object({
@@ -40,6 +53,19 @@ const productSchema = z.object({
4053
),
4154
});
4255

56+
// OpenAI variant of the product schema: uses .nullable() instead of .optional().
// testProductExtraction passes this schema whenever the provider is not
// LLMProvider.GOOGLE_GEMINI.
57+
const productSchemaOpenAI = z.object({
58+
products: z.array(
59+
z.object({
60+
name: z.string(),
61+
price: z.string(),
62+
rating: z.string().nullable(),
63+
description: z.string().nullable(),
64+
features: z.array(z.string()).nullable(),
65+
})
66+
),
67+
});
68+
4369
// Test functions
4470
async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
4571
console.log(`Testing blog post extraction with ${provider}...`);
@@ -64,7 +90,8 @@ async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
6490
const result = await extract({
6591
content: html,
6692
format: ContentFormat.HTML,
67-
schema: blogSchema,
93+
schema:
94+
provider === LLMProvider.GOOGLE_GEMINI ? blogSchema : blogSchemaOpenAI,
6895
provider,
6996
googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined,
7097
openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined,
@@ -109,7 +136,10 @@ async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) {
109136
const result = await extract({
110137
content: html,
111138
format: ContentFormat.HTML,
112-
schema: productSchema,
139+
schema:
140+
provider === LLMProvider.GOOGLE_GEMINI
141+
? productSchema
142+
: productSchemaOpenAI,
113143
provider,
114144
googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined,
115145
openaiApiKey: provider === LLMProvider.OPENAI ? apiKey : undefined,

src/utils/schemaUtils.ts

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import {
55
ZodOptional,
66
ZodTypeAny,
77
ZodString,
8+
ZodNullable,
89
} from "zod";
910

1011
/**
@@ -94,6 +95,21 @@ export function transformSchemaForLLM<T extends ZodTypeAny>(
9495
});
9596
}
9697

98+
// For nullable schemas, transform the inner schema
99+
if (schema instanceof ZodNullable) {
100+
const originalDef = { ...(schema as any)._def };
101+
const transformedInner = transformSchemaForLLM(
102+
schema.unwrap() as ZodTypeAny
103+
);
104+
105+
// Create a new nullable with the same definition but transformed inner type
106+
return new z.ZodNullable({
107+
...originalDef,
108+
innerType: transformedInner,
109+
typeName: z.ZodFirstPartyTypeKind.ZodNullable,
110+
});
111+
}
112+
97113
// Return the original schema for all other types
98114
return schema;
99115
}
@@ -144,6 +160,11 @@ export function fixUrlEscapeSequences(data: any, schema: ZodTypeAny): any {
144160
return fixUrlEscapeSequences(data, innerSchema);
145161
}
146162

163+
if (schema instanceof ZodNullable) {
164+
const innerSchema = schema.unwrap() as ZodTypeAny;
165+
return fixUrlEscapeSequences(data, innerSchema);
166+
}
167+
147168
return data;
148169
}
149170

@@ -172,6 +193,8 @@ export function safeSanitizedParser<T extends ZodTypeAny>(
172193
return sanitizeArray(schema, rawObject);
173194
} else if (schema instanceof ZodOptional) {
174195
return sanitizeOptional(schema, rawObject);
196+
} else if (schema instanceof ZodNullable) {
197+
return sanitizeNullable(schema, rawObject);
175198
} else {
176199
// For primitive values, try to parse directly
177200
return schema.parse(rawObject);
@@ -215,6 +238,18 @@ function sanitizeObject(schema: ZodObject<any>, rawObject: unknown): any {
215238
result[key] = sanitized;
216239
}
217240
// If sanitization fails, just skip the optional property
241+
} else if (propertySchema instanceof ZodNullable) {
242+
// For nullable properties, try to sanitize or set to null
243+
try {
244+
const sanitized = safeSanitizedParser(
245+
propertySchema as ZodTypeAny,
246+
rawObjectRecord[key]
247+
);
248+
result[key] = sanitized;
249+
} catch {
250+
// If sanitization fails, set to null for nullable properties
251+
result[key] = null;
252+
}
218253
} else {
219254
// For required properties, try to sanitize and throw if it fails
220255
const sanitized = safeSanitizedParser(
@@ -267,9 +302,40 @@ function sanitizeOptional(schema: ZodOptional<any>, rawValue: unknown): any {
267302
try {
268303
// Try to sanitize using the inner schema
269304
const innerSchema = schema.unwrap();
270-
return safeSanitizedParser(innerSchema, rawValue);
305+
const parsed = safeSanitizedParser(innerSchema, rawValue);
306+
// If the parsed value is not valid, return undefined for optional values
307+
if (parsed === null) {
308+
return undefined;
309+
}
310+
return parsed;
271311
} catch {
272312
// If sanitization fails, return undefined for optional values
273313
return undefined;
274314
}
275315
}
316+
317+
/**
318+
* Sanitizes a value against a nullable Zod schema.
*
* A raw `null` is returned unchanged. Any other value is handed to
* `safeSanitizedParser` together with the unwrapped inner schema; if that
* call throws, `null` is returned instead of propagating the error.
*
* @param schema - The nullable schema whose inner type drives sanitization.
* @param rawValue - The unvalidated value to sanitize.
* @returns The sanitized value, or `null` when the input is `null` or the
* inner sanitization throws.
319+
*/
320+
function sanitizeNullable(schema: ZodNullable<any>, rawValue: unknown): any {
321+
// If the value is null, return null directly
322+
if (rawValue === null) {
323+
return null;
324+
}
325+
326+
try {
327+
// Try to sanitize using the inner schema
328+
const innerSchema = schema.unwrap();
329+
const sanitized = safeSanitizedParser(innerSchema, rawValue);
330+
331+
// The inner sanitizer itself produced null: pass it through as null.
// (A failed sanitization throws and is handled by the catch below.)
332+
if (sanitized === null) {
333+
return null;
334+
}
335+
336+
return sanitized;
337+
} catch {
338+
// If sanitization fails, return null for nullable values
339+
return null;
340+
}
341+
}

0 commit comments

Comments
 (0)