From d475e908aaed8d6233f26c121bde4b0152955173 Mon Sep 17 00:00:00 2001 From: Dominik Hayon Date: Mon, 1 Jul 2024 21:48:00 +0200 Subject: [PATCH] poc for id based genios search --- src/extractor.ts | 25 ++++++++++++++++++------- src/sites.ts | 31 +++++++++++++++++++++++++++++-- src/types.ts | 1 + 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/src/extractor.ts b/src/extractor.ts index 7733055..5733e2c 100644 --- a/src/extractor.ts +++ b/src/extractor.ts @@ -143,16 +143,27 @@ class Extractor implements ExtractorInterface { extractArticleInfo (): ArticleInfo { const articleInfoSelectors = ['query', 'edition', 'date'] const articleInfo: RawArticleInfo = {} - for (const key of articleInfoSelectors) { - if (this.site.selectors[key]) { - const selector = this.site.selectors[key] - let result = this.runSelectorQuery(selector) - if (result instanceof window.HTMLElement) { - result = result.innerText + + if (this.site.extractId) { + const idFromCustomFunc = this.site.extractId() + if (idFromCustomFunc) { + articleInfo.query = idFromCustomFunc + } + } + + if (!articleInfo.query) { + for (const key of articleInfoSelectors) { + if (this.site.selectors[key]) { + const selector = this.site.selectors[key] + let result = this.runSelectorQuery(selector) + if (result instanceof window.HTMLElement) { + result = result.innerText + } + articleInfo[key] = result } - articleInfo[key] = result } } + return { query: articleInfo.query, edition: articleInfo.edition, diff --git a/src/sites.ts b/src/sites.ts index c3d7a34..20de653 100644 --- a/src/sites.ts +++ b/src/sites.ts @@ -94,7 +94,7 @@ const sites: Sites = { { url: 'https://www.spiegel.de/politik/deutschland/klara-geywitz-ueber-sanierungspflicht-von-immobilien-neuen-wohnraum-und-fluechtlinge-a-6aeb319e-fc25-4efa-a0cf-66e10ed49969', selectors: { - query: 'nicht ohne Ordnungsrecht gehen wenn wir die Klimaziele erreichen wollen«' + query: '6aeb319e-fc25-4efa-a0cf-66e10ed49969' } } ], @@ -104,6 +104,11 @@ const sites: Sites = 
{ main: 'article section.relative', paywall: "div[data-component='Paywall'], div[data-target-id='paywall']" }, + extractId: () => { + const url = window.location.href + const match = url.match(/-([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})/) + return match ? match[1] : null + }, mimic: (content) => { return `
@@ -121,6 +126,14 @@ const sites: Sites = { } }, 'www.manager-magazin.de': { + examples: [ + { + url: 'https://www.manager-magazin.de/unternehmen/fussball-em-2024-martin-kallen-der-unbekannte-milliardenmacher-hinter-der-em-a-f89271c7-d048-49b4-bcb6-a974cc7eff26', + selectors: { + query: 'f89271c7-d048-49b4-bcb6-a974cc7eff26' + } + } + ], selectors: { query: makeQueryFunc('header h2~div:nth-of-type(1)'), date: 'time', @@ -128,6 +141,11 @@ const sites: Sites = { paywall: '[data-area="paywall"]', main: '[data-area="body"]' }, + extractId: () => { + const url = window.location.href + const match = url.match(/-([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})/) + return match ? match[1] : null + }, source: 'genios.de', sourceParams: { dbShortcut: 'MM,MMAG' @@ -453,10 +471,19 @@ const sites: Sites = { { url: 'https://www.wiwo.de/my/unternehmen/industrie/mischkonzern-zeppelin-ein-ausschluss-russlands-aus-swift-wuerde-eine-weltwirtschaftskrise-ausloesen/28091946.html', selectors: { - query: 'Mischkonzern Zeppelin vertreibt unter anderem US-amerikanische Baumaschinen in Russland und der Ukraine Ein' + query: 'WW_28091946' } } ], + extractId: () => { + const url = window.location.href + const id = url.match(/\/([0-9]+)\.html/) + if (id && id[1]) { + return `WW_${id[1]}` + } + + return null + }, selectors: { query: makeQueryFunc('.c-leadtext', false), main: '.o-article__content', diff --git a/src/types.ts b/src/types.ts index fdb14cd..7500736 100644 --- a/src/types.ts +++ b/src/types.ts @@ -80,6 +80,7 @@ export interface PartialSite { dateRange?: DateRange testSetup?: (page: PlaywrightPage) => Promise examples?: TestExample[] + extractId?: () => string | null } export interface Site extends PartialSite {