Skip to content

Commit 7ca8334

Browse files
committed
fix webfetch tool when returning html as text
1 parent f1a2b2e commit 7ca8334

File tree

1 file changed

+40
-5
lines changed

1 file changed

+40
-5
lines changed

packages/opencode/src/tool/webfetch.ts

Lines changed: 40 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ export const WebFetchTool = Tool.define({
7676
switch (params.format) {
7777
case "text":
7878
if (contentType.includes("text/html")) {
79-
const text = extractTextFromHTML(content)
79+
const text = await extractTextFromHTML(content)
8080
return {
8181
output: text,
8282
metadata: {
@@ -127,10 +127,45 @@ export const WebFetchTool = Tool.define({
127127
},
128128
})
129129

130-
function extractTextFromHTML(html: string): string {
131-
const doc = new DOMParser().parseFromString(html, "text/html")
132-
const text = doc.body.textContent || doc.body.innerText || ""
133-
return text.replace(/\s+/g, " ").trim()
130+
/**
 * Extracts the visible text of an HTML document using the streaming
 * `HTMLRewriter`.
 *
 * Text inside `script`, `style`, `noscript`, `iframe` and `object` elements
 * (including any elements nested inside them) is dropped; all other text
 * chunks are concatenated in document order.
 *
 * @param html - Raw HTML markup to extract text from.
 * @returns The concatenated text content with leading and trailing
 *   whitespace trimmed. Interior whitespace is left as it appears in the
 *   markup.
 */
async function extractTextFromHTML(html: string): Promise<string> {
  let text = ""
  // Number of currently open elements whose content must be ignored. A counter
  // (rather than a boolean) keeps nested cases such as
  // <noscript><p>…</p></noscript> skipped until the outermost element closes,
  // and lets text that follows a skipped element in the same parent be kept.
  let skipDepth = 0

  const transformed = new HTMLRewriter()
    // `embed` is intentionally not listed: it is a void element, so it has no
    // text content to skip and no end tag to wait for.
    .on("script, style, noscript, iframe, object", {
      element(element) {
        skipDepth++
        // Stop skipping only once this element's own end tag is reached.
        element.onEndTag(() => {
          skipDepth--
        })
      },
    })
    .on("*", {
      text(chunk) {
        if (skipDepth === 0) {
          text += chunk.text
        }
      },
    })
    .transform(new Response(html))

  // The handlers above only run while the transformed body is consumed, so
  // the stream is drained here; the rewritten markup itself is not needed.
  await transformed.text()
  return text.trim()
}
135170

136171
function convertHTMLToMarkdown(html: string): string {

0 commit comments

Comments
 (0)