From 5ae22b18dbd0c404abc22f11b3f79db8a8cecc0b Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 19:55:24 -0700 Subject: [PATCH 1/7] init --- package-lock.json | 7 ++++--- package.json | 3 ++- src/extractors.ts | 6 ++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/package-lock.json b/package-lock.json index c13c686..671866a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "lightfeed-extract", - "version": "0.1.1", + "version": "0.1.3", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "lightfeed-extract", - "version": "0.1.1", + "version": "0.1.3", "license": "Apache-2.0", "dependencies": { "@langchain/google-genai": "^0.2.5", @@ -17,7 +17,8 @@ "turndown": "^7.2.0", "xmldom": "^0.6.0", "xpath": "^0.0.34", - "zod": "^3.24.3" + "zod": "^3.24.3", + "zod-to-json-schema": "^3.24.5" }, "devDependencies": { "@types/jest": "^29.5.12", diff --git a/package.json b/package.json index 7d90909..a9668f1 100644 --- a/package.json +++ b/package.json @@ -58,7 +58,8 @@ "turndown": "^7.2.0", "xmldom": "^0.6.0", "xpath": "^0.0.34", - "zod": "^3.24.3" + "zod": "^3.24.3", + "zod-to-json-schema": "^3.24.5" }, "devDependencies": { "@types/jest": "^29.5.12", diff --git a/src/extractors.ts b/src/extractors.ts index 4160fde..c697969 100644 --- a/src/extractors.ts +++ b/src/extractors.ts @@ -9,6 +9,7 @@ import { fixUrlEscapeSequences, } from "./utils/schemaUtils"; import { jsonrepair } from "jsonrepair"; +import zodToJsonSchema from "zod-to-json-schema"; // Define LLMResult type here since direct import is problematic interface TokenUsage { @@ -213,8 +214,13 @@ export async function extractWithLLM( }); try { + console.log("schema", JSON.stringify(zodToJsonSchema(schema), null, 2)); // Transform schema to be compatible with LLM output (converting url() to string()) const llmSchema = transformSchemaForLLM(schema); + console.log( + "llmSchema", + JSON.stringify(zodToJsonSchema(llmSchema), null, 2) + ); // Extract structured data with a withStructuredOutput chain const structuredOutputLLM = llm.withStructuredOutput(llmSchema, { From 95d2526ccff6e83d1372a6364608b162c6b0a41c Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 20:13:50 -0700 Subject: [PATCH 2/7] add more console.log --- src/utils/schemaUtils.ts | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/utils/schemaUtils.ts b/src/utils/schemaUtils.ts index 6afcc9e..f8d8300 100644 --- a/src/utils/schemaUtils.ts +++ b/src/utils/schemaUtils.ts @@ -7,19 +7,31 @@ import { ZodString, ZodNullable, } from "zod"; +import zodToJsonSchema from "zod-to-json-schema"; /** * Checks if a schema is a ZodString with URL validation */ export function isUrlSchema(schema: ZodTypeAny): boolean { + console.log( + "Checking if schema is URL schema:", + JSON.stringify(zodToJsonSchema(schema), null, 2) + ); if (!(schema instanceof ZodString)) return false; // Check if schema has URL validation by checking for internal checks property // This is a bit of a hack but necessary since Zod doesn't expose validation info const checks = (schema as any)._def.checks; + console.log( + "Checking schema for URL validation:", + checks, + JSON.stringify(checks, null, 2) + ); if (!checks || !Array.isArray(checks)) return false; - return checks.some((check) => check.kind === "url"); + const isUrl = checks.some((check) => check.kind === "url"); + console.log("Is URL schema:", isUrl); + return isUrl; } /** @@ -29,15 +41,26 @@ export function isUrlSchema(schema: ZodTypeAny): boolean { export function transformSchemaForLLM( schema: T ): ZodTypeAny { + console.log("Transforming schema:", schema.constructor.name); + console.log("running transformSchemaForLLM", JSON.stringify(schema, null, 2)); + // For URL string schemas, remove the URL check but preserve everything else if (isUrlSchema(schema)) { + console.log("Found URL schema, transforming to string schema"); const originalDef = { ...(schema as any)._def }; + console.log("Original definition:", JSON.stringify(originalDef, null, 2)); // Filter out only URL checks, keep all other checks if (originalDef.checks && Array.isArray(originalDef.checks)) { + const originalChecks = [...originalDef.checks]; originalDef.checks = originalDef.checks.filter( (check: any) => check.kind !== "url" ); + console.log( + "Removed URL checks:", + originalChecks.length - originalDef.checks.length, + "checks removed" + ); } // Create a new string schema with the modified definition From 0ee465d68e187ebef1cf094cabc3ea341631fcc8 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 20:36:08 -0700 Subject: [PATCH 3/7] init --- src/dev/runLocalTest.ts | 10 ++++------ src/utils/schemaUtils.ts | 12 +++++++++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/dev/runLocalTest.ts b/src/dev/runLocalTest.ts index e6b373a..c0c7b18 100644 --- a/src/dev/runLocalTest.ts +++ b/src/dev/runLocalTest.ts @@ -42,13 +42,11 @@ const blogSchemaOpenAI = z.object({ }); const productSchema = z.object({ - products: z.array( + items: z.array( z.object({ - name: z.string(), - price: z.string(), - rating: z.string().optional(), - description: z.string().optional(), - features: z.array(z.string()).optional(), + article_title: z.string().describe("Title of the article"), + link_url: z.string().url().optional().describe("URL of the article"), + summary: z.string().optional().describe("Summary of the article"), }) ), }); diff --git a/src/utils/schemaUtils.ts b/src/utils/schemaUtils.ts index f8d8300..5310dfa 100644 --- a/src/utils/schemaUtils.ts +++ b/src/utils/schemaUtils.ts @@ -41,7 +41,7 @@ export function isUrlSchema(schema: ZodTypeAny): boolean { export function transformSchemaForLLM( schema: T ): ZodTypeAny { - console.log("Transforming schema:", schema.constructor.name); + console.log("Start of Transforming schema:"); console.log("running transformSchemaForLLM", JSON.stringify(schema, null, 2)); // For URL string schemas, remove the URL check but preserve everything else @@ -70,6 +70,16 @@ export function transformSchemaForLLM( }); } + console.log( + "Transforming schema:", + JSON.stringify(zodToJsonSchema(schema), null, 2) + ); + console.log("schema is not a URL schema, returning original schema"); + console.log("schema is object?", schema instanceof ZodObject); + console.log("schema is array?", schema instanceof ZodArray); + console.log("schema is optional?", schema instanceof ZodOptional); + console.log("schema is nullable?", schema instanceof ZodNullable); + // For object schemas, transform each property if (schema instanceof ZodObject) { const originalDef = { ...(schema as any)._def }; From 4e3dd335e4ffd250a561c75e7ea987ff6a64beca Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 21:31:21 -0700 Subject: [PATCH 4/7] init --- src/utils/schemaUtils.ts | 92 ++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 31 deletions(-) diff --git a/src/utils/schemaUtils.ts b/src/utils/schemaUtils.ts index 5310dfa..70126e9 100644 --- a/src/utils/schemaUtils.ts +++ b/src/utils/schemaUtils.ts @@ -6,6 +6,7 @@ import { ZodTypeAny, ZodString, ZodNullable, + ZodFirstPartyTypeKind, } from "zod"; import zodToJsonSchema from "zod-to-json-schema"; @@ -17,7 +18,7 @@ export function isUrlSchema(schema: ZodTypeAny): boolean { "Checking if schema is URL schema:", JSON.stringify(zodToJsonSchema(schema), null, 2) ); - if (!(schema instanceof ZodString)) return false; + if (!isZodType(schema, ZodFirstPartyTypeKind.ZodString)) return false; // Check if schema has URL validation by checking for internal checks property // This is a bit of a hack but necessary since Zod doesn't expose validation info @@ -34,6 +35,16 @@ export function isUrlSchema(schema: ZodTypeAny): boolean { return isUrl; } +/** + * Helper function to check schema type without using instanceof (can fail due to zod version differences) + */ +export function isZodType( + schema: ZodTypeAny, + type: ZodFirstPartyTypeKind +): boolean { + return (schema as any)._def.typeName === type; +} + /** * Transforms a schema, replacing any URL validations with string validations * for compatibility with LLM output @@ -75,18 +86,30 @@ export function transformSchemaForLLM( JSON.stringify(zodToJsonSchema(schema), null, 2) ); console.log("schema is not a URL schema, returning original schema"); - console.log("schema is object?", schema instanceof ZodObject); - console.log("schema is array?", schema instanceof ZodArray); - console.log("schema is optional?", schema instanceof ZodOptional); - console.log("schema is nullable?", schema instanceof ZodNullable); + console.log( + "schema is object?", + isZodType(schema, ZodFirstPartyTypeKind.ZodObject) + ); + console.log( + "schema is array?", + isZodType(schema, ZodFirstPartyTypeKind.ZodArray) + ); + console.log( + "schema is optional?", + isZodType(schema, ZodFirstPartyTypeKind.ZodOptional) + ); + console.log( + "schema is nullable?", + isZodType(schema, ZodFirstPartyTypeKind.ZodNullable) + ); // For object schemas, transform each property - if (schema instanceof ZodObject) { + if (isZodType(schema, ZodFirstPartyTypeKind.ZodObject)) { const originalDef = { ...(schema as any)._def }; const newShape: Record = {}; // Transform each property in the shape - for (const [key, propertySchema] of Object.entries(schema.shape)) { + for (const [key, propertySchema] of Object.entries((schema as any).shape)) { newShape[key] = transformSchemaForLLM(propertySchema as ZodTypeAny); } @@ -99,10 +122,10 @@ export function transformSchemaForLLM( } // For array schemas, transform the element schema - if (schema instanceof ZodArray) { + if (isZodType(schema, ZodFirstPartyTypeKind.ZodArray)) { const originalDef = { ...(schema as any)._def }; const transformedElement = transformSchemaForLLM( - schema.element as ZodTypeAny + (schema as any).element as ZodTypeAny ); // Create a new array with the same definition but transformed element @@ -114,10 +137,10 @@ export function transformSchemaForLLM( } // For optional schemas, transform the inner schema - if (schema instanceof ZodOptional) { + if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { const originalDef = { ...(schema as any)._def }; const transformedInner = transformSchemaForLLM( - schema.unwrap() as ZodTypeAny + (schema as any).unwrap() as ZodTypeAny ); // Create a new optional with the same definition but transformed inner type @@ -129,10 +152,10 @@ export function transformSchemaForLLM( } // For nullable schemas, transform the inner schema - if (schema instanceof ZodNullable) { + if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { const originalDef = { ...(schema as any)._def }; const transformedInner = transformSchemaForLLM( - schema.unwrap() as ZodTypeAny + (schema as any).unwrap() as ZodTypeAny ); // Create a new nullable with the same definition but transformed inner type @@ -162,11 +185,11 @@ export function fixUrlEscapeSequences(data: any, schema: ZodTypeAny): any { } if ( - schema instanceof ZodObject && + isZodType(schema, ZodFirstPartyTypeKind.ZodObject) && typeof data === "object" && !Array.isArray(data) ) { - const shape = schema.shape; + const shape = (schema as any).shape; const result: Record = {}; for (const [key, propertySchema] of Object.entries(shape)) { @@ -183,18 +206,21 @@ export function fixUrlEscapeSequences(data: any, schema: ZodTypeAny): any { return result; } - if (schema instanceof ZodArray && Array.isArray(data)) { - const elementSchema = schema.element as ZodTypeAny; + if ( + isZodType(schema, ZodFirstPartyTypeKind.ZodArray) && + Array.isArray(data) + ) { + const elementSchema = (schema as any).element as ZodTypeAny; return data.map((item) => fixUrlEscapeSequences(item, elementSchema)); } - if (schema instanceof ZodOptional) { - const innerSchema = schema.unwrap() as ZodTypeAny; + if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { + const innerSchema = (schema as any).unwrap() as ZodTypeAny; return fixUrlEscapeSequences(data, innerSchema); } - if (schema instanceof ZodNullable) { - const innerSchema = schema.unwrap() as ZodTypeAny; + if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { + const innerSchema = (schema as any).unwrap() as ZodTypeAny; return fixUrlEscapeSequences(data, innerSchema); } @@ -220,14 +246,14 @@ export function safeSanitizedParser( } // Handle different schema types - if (schema instanceof ZodObject) { - return sanitizeObject(schema, rawObject); - } else if (schema instanceof ZodArray) { - return sanitizeArray(schema, rawObject); - } else if (schema instanceof ZodOptional) { - return sanitizeOptional(schema, rawObject); - } else if (schema instanceof ZodNullable) { - return sanitizeNullable(schema, rawObject); + if (isZodType(schema, ZodFirstPartyTypeKind.ZodObject)) { + return sanitizeObject(schema as any, rawObject); + } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodArray)) { + return sanitizeArray(schema as any, rawObject); + } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { + return sanitizeOptional(schema as any, rawObject); + } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { + return sanitizeNullable(schema as any, rawObject); } else { // For primitive values, try to parse directly return schema.parse(rawObject); @@ -262,7 +288,9 @@ function sanitizeObject(schema: ZodObject, rawObject: unknown): any { } // If property is optional, try to sanitize it - if (propertySchema instanceof ZodOptional) { + if ( + isZodType(propertySchema as ZodTypeAny, ZodFirstPartyTypeKind.ZodOptional) + ) { const sanitized = safeSanitizedParser( propertySchema as ZodTypeAny, rawObjectRecord[key] @@ -271,7 +299,9 @@ function sanitizeObject(schema: ZodObject, rawObject: unknown): any { result[key] = sanitized; } // If sanitization fails, just skip the optional property - } else if (propertySchema instanceof ZodNullable) { + } else if ( + isZodType(propertySchema as ZodTypeAny, ZodFirstPartyTypeKind.ZodNullable) + ) { // For nullable properties, try to sanitize or set to null try { const sanitized = safeSanitizedParser( From 0b3477f6f36366726786b610995bfa05c0e20ee5 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 21:45:24 -0700 Subject: [PATCH 5/7] rm console.log --- package-lock.json | 3 +-- package.json | 3 +-- src/extractors.ts | 6 ------ src/utils/schemaUtils.ts | 44 ---------------------------------------- 4 files changed, 2 insertions(+), 54 deletions(-) diff --git a/package-lock.json b/package-lock.json index 671866a..2fdf103 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,8 +17,7 @@ "turndown": "^7.2.0", "xmldom": "^0.6.0", "xpath": "^0.0.34", - "zod": "^3.24.3", - "zod-to-json-schema": "^3.24.5" + "zod": "^3.24.3" }, "devDependencies": { "@types/jest": "^29.5.12", diff --git a/package.json b/package.json index a9668f1..7d90909 100644 --- a/package.json +++ b/package.json @@ -58,8 +58,7 @@ "turndown": "^7.2.0", "xmldom": "^0.6.0", "xpath": "^0.0.34", - "zod": "^3.24.3", - "zod-to-json-schema": "^3.24.5" + "zod": "^3.24.3" }, "devDependencies": { "@types/jest": "^29.5.12", diff --git a/src/extractors.ts b/src/extractors.ts index c697969..4160fde 100644 --- a/src/extractors.ts +++ b/src/extractors.ts @@ -9,7 +9,6 @@ import { fixUrlEscapeSequences, } from "./utils/schemaUtils"; import { jsonrepair } from "jsonrepair"; -import zodToJsonSchema from "zod-to-json-schema"; // Define LLMResult type here since direct import is problematic interface TokenUsage { @@ -214,13 +213,8 @@ export async function extractWithLLM( }); try { - console.log("schema", JSON.stringify(zodToJsonSchema(schema), null, 2)); // Transform schema to be compatible with LLM output (converting url() to string()) const llmSchema = transformSchemaForLLM(schema); - console.log( - "llmSchema", - JSON.stringify(zodToJsonSchema(llmSchema), null, 2) - ); // Extract structured data with a withStructuredOutput chain const structuredOutputLLM = llm.withStructuredOutput(llmSchema, { diff --git a/src/utils/schemaUtils.ts b/src/utils/schemaUtils.ts index 70126e9..729d8ee 100644 --- a/src/utils/schemaUtils.ts +++ b/src/utils/schemaUtils.ts @@ -4,34 +4,22 @@ import { ZodObject, ZodOptional, ZodTypeAny, - ZodString, ZodNullable, ZodFirstPartyTypeKind, } from "zod"; -import zodToJsonSchema from "zod-to-json-schema"; /** * Checks if a schema is a ZodString with URL validation */ export function isUrlSchema(schema: ZodTypeAny): boolean { - console.log( - "Checking if schema is URL schema:", - JSON.stringify(zodToJsonSchema(schema), null, 2) - ); if (!isZodType(schema, ZodFirstPartyTypeKind.ZodString)) return false; // Check if schema has URL validation by checking for internal checks property // This is a bit of a hack but necessary since Zod doesn't expose validation info const checks = (schema as any)._def.checks; - console.log( - "Checking schema for URL validation:", - checks, - JSON.stringify(checks, null, 2) - ); if (!checks || !Array.isArray(checks)) return false; const isUrl = checks.some((check) => check.kind === "url"); - console.log("Is URL schema:", isUrl); return isUrl; } @@ -52,14 +40,9 @@ export function isZodType( export function transformSchemaForLLM( schema: T ): ZodTypeAny { - console.log("Start of Transforming schema:"); - console.log("running transformSchemaForLLM", JSON.stringify(schema, null, 2)); - // For URL string schemas, remove the URL check but preserve everything else if (isUrlSchema(schema)) { - console.log("Found URL schema, transforming to string schema"); const originalDef = { ...(schema as any)._def }; - console.log("Original definition:", JSON.stringify(originalDef, null, 2)); // Filter out only URL checks, keep all other checks if (originalDef.checks && Array.isArray(originalDef.checks)) { @@ -67,11 +50,6 @@ export function transformSchemaForLLM( originalDef.checks = originalDef.checks.filter( (check: any) => check.kind !== "url" ); - console.log( - "Removed URL checks:", - originalChecks.length - originalDef.checks.length, - "checks removed" - ); } // Create a new string schema with the modified definition @@ -81,28 +59,6 @@ export function transformSchemaForLLM( }); } - console.log( - "Transforming schema:", - JSON.stringify(zodToJsonSchema(schema), null, 2) - ); - console.log("schema is not a URL schema, returning original schema"); - console.log( - "schema is object?", - isZodType(schema, ZodFirstPartyTypeKind.ZodObject) - ); - console.log( - "schema is array?", - isZodType(schema, ZodFirstPartyTypeKind.ZodArray) - ); - console.log( - "schema is optional?", - isZodType(schema, ZodFirstPartyTypeKind.ZodOptional) - ); - console.log( - "schema is nullable?", - isZodType(schema, ZodFirstPartyTypeKind.ZodNullable) - ); - // For object schemas, transform each property if (isZodType(schema, ZodFirstPartyTypeKind.ZodObject)) { const originalDef = { ...(schema as any)._def }; From dd9b4821716b18f0065caff6f56b665f36e293da Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 21:55:57 -0700 Subject: [PATCH 6/7] nit --- src/dev/runLocalTest.ts | 10 ++++++---- src/utils/schemaUtils.ts | 4 +--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/dev/runLocalTest.ts b/src/dev/runLocalTest.ts index c0c7b18..e6b373a 100644 --- a/src/dev/runLocalTest.ts +++ b/src/dev/runLocalTest.ts @@ -42,11 +42,13 @@ const blogSchemaOpenAI = z.object({ }); const productSchema = z.object({ - items: z.array( + products: z.array( z.object({ - article_title: z.string().describe("Title of the article"), - link_url: z.string().url().optional().describe("URL of the article"), - summary: z.string().optional().describe("Summary of the article"), + name: z.string(), + price: z.string(), + rating: z.string().optional(), + description: z.string().optional(), + features: z.array(z.string()).optional(), }) ), }); diff --git a/src/utils/schemaUtils.ts b/src/utils/schemaUtils.ts index 729d8ee..aa29823 100644 --- a/src/utils/schemaUtils.ts +++ b/src/utils/schemaUtils.ts @@ -19,8 +19,7 @@ export function isUrlSchema(schema: ZodTypeAny): boolean { const checks = (schema as any)._def.checks; if (!checks || !Array.isArray(checks)) return false; - const isUrl = checks.some((check) => check.kind === "url"); - return isUrl; + return checks.some((check) => check.kind === "url"); } /** @@ -46,7 +45,6 @@ export function transformSchemaForLLM( // Filter out only URL checks, keep all other checks if (originalDef.checks && Array.isArray(originalDef.checks)) { - const originalChecks = [...originalDef.checks]; originalDef.checks = originalDef.checks.filter( (check: any) => check.kind !== "url" ); From cf125799abb6ab5e76641a298bcb13f100a2c009 Mon Sep 17 00:00:00 2001 From: Andrew Zhong Date: Tue, 13 May 2025 21:57:45 -0700 Subject: [PATCH 7/7] minor --- src/utils/schemaUtils.ts | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/utils/schemaUtils.ts b/src/utils/schemaUtils.ts index aa29823..915e925 100644 --- a/src/utils/schemaUtils.ts +++ b/src/utils/schemaUtils.ts @@ -25,10 +25,7 @@ export function isUrlSchema(schema: ZodTypeAny): boolean { /** * Helper function to check schema type without using instanceof (can fail due to zod version differences) */ -export function isZodType( - schema: ZodTypeAny, - type: ZodFirstPartyTypeKind -): boolean { +function isZodType(schema: ZodTypeAny, type: ZodFirstPartyTypeKind): boolean { return (schema as any)._def.typeName === type; }