diff --git a/src/clis/amazon/bestsellers.test.ts b/src/clis/amazon/bestsellers.test.ts index b47791e9..3aebe024 100644 --- a/src/clis/amazon/bestsellers.test.ts +++ b/src/clis/amazon/bestsellers.test.ts @@ -1,9 +1,9 @@ import { describe, expect, it } from 'vitest'; -import { __test__ } from './bestsellers.js'; +import { __test__ } from './rankings.js'; describe('amazon bestsellers normalization', () => { it('normalizes bestseller cards and infers review counts from card text', () => { - const result = __test__.normalizeBestsellerCandidate({ + const result = __test__.normalizeRankingCandidate({ asin: 'B0DR31GC3D', title: '', href: 'https://www.amazon.com/NUTIKAS-Shelves-Desktop-Orgnizer-Shlef/dp/B0DR31GC3D/ref=zg_bs', @@ -11,7 +11,16 @@ describe('amazon bestsellers normalization', () => { rating_text: '4.3 out of 5 stars', review_count_text: '', card_text: 'Desk Shelves Desktop Organizer Shlef\n4.3 out of 5 stars\n435\n$25.92', - }, 2, 'Amazon Best Sellers: Best Desktop & Off-Surface Shelves', 'https://www.amazon.com/example'); + }, { + listType: 'bestsellers', + rankFallback: 2, + listTitle: 'Amazon Best Sellers: Best Desktop & Off-Surface Shelves', + sourceUrl: 'https://www.amazon.com/example', + categoryTitle: null, + categoryUrl: 'https://www.amazon.com/example', + categoryPath: [], + visibleCategoryLinks: [], + }); expect(result.rank).toBe(2); expect(result.asin).toBe('B0DR31GC3D'); diff --git a/src/clis/amazon/bestsellers.ts b/src/clis/amazon/bestsellers.ts index 83e0bac3..f5238a7c 100644 --- a/src/clis/amazon/bestsellers.ts +++ b/src/clis/amazon/bestsellers.ts @@ -1,180 +1,8 @@ -import { CommandExecutionError } from '../../errors.js'; -import { cli, Strategy } from '../../registry.js'; -import type { IPage } from '../../types.js'; -import { - buildProvenance, - cleanText, - extractAsin, - extractReviewCountFromCardText, - firstMeaningfulLine, - normalizeProductUrl, - parsePriceText, - parseRatingValue, - parseReviewCount, - resolveBestsellersUrl, - uniqueNonEmpty, - assertUsableState, - gotoAndReadState, -} from './shared.js'; +import { cli } from '../../registry.js'; +import { createRankingCliOptions } from './rankings.js'; -interface BestsellersPagePayload { - href?: string; - title?: string; - list_title?: string; - cards?: Array<{ - rank_text?: string | null; - asin?: string | null; - title?: string | null; - href?: string | null; - price_text?: string | null; - rating_text?: string | null; - review_count_text?: string | null; - card_text?: string | null; - }>; - page_links?: string[]; -} - -function normalizeBestsellerCandidate( - candidate: NonNullable[number], - rank: number, - listTitle: string | null, - sourceUrl: string, -): Record { - const productUrl = normalizeProductUrl(candidate.href); - const asin = extractAsin(candidate.asin ?? '') ?? extractAsin(productUrl ?? '') ?? null; - const title = cleanText(candidate.title) || firstMeaningfulLine(candidate.card_text); - const price = parsePriceText(cleanText(candidate.price_text) || candidate.card_text); - const ratingText = cleanText(candidate.rating_text) || null; - const reviewCountText = cleanText(candidate.review_count_text) - || extractReviewCountFromCardText(candidate.card_text) - || null; - const provenance = buildProvenance(sourceUrl); - - return { - rank, - asin, - title: title || null, - product_url: productUrl, - list_title: listTitle, - ...provenance, - price_text: price.price_text, - price_value: price.price_value, - currency: price.currency, - rating_text: ratingText, - rating_value: parseRatingValue(ratingText), - review_count_text: reviewCountText, - review_count: parseReviewCount(reviewCountText), - }; -} - -async function readBestsellersPage(page: IPage, url: string): Promise { - const state = await gotoAndReadState(page, url, 2500, 'bestsellers'); - assertUsableState(state, 'bestsellers'); - - return await page.evaluate(` - (() => ({ - href: window.location.href, - title: document.title || '', - list_title: - document.querySelector('#zg_banner_text')?.textContent - || document.querySelector('h1')?.textContent - || '', - cards: Array.from(document.querySelectorAll('.p13n-sc-uncoverable-faceout')) - .map((card) => ({ - rank_text: - card.querySelector('.zg-bdg-text')?.textContent - || card.querySelector('[class*="rank"]')?.textContent - || '', - asin: card.id || '', - title: - card.querySelector('[class*="line-clamp"]')?.textContent - || card.querySelector('img')?.getAttribute('alt') - || '', - href: card.querySelector('a[href*="/dp/"]')?.href || '', - price_text: card.querySelector('.a-price .a-offscreen')?.textContent || '', - rating_text: card.querySelector('[aria-label*="out of 5 stars"]')?.getAttribute('aria-label') || '', - review_count_text: - card.querySelector('a[href*="#customerReviews"]')?.textContent - || card.querySelector('.a-size-small')?.textContent - || '', - card_text: card.innerText || '', - })), - page_links: Array.from(document.querySelectorAll('li.a-normal a, li.a-selected a')) - .map((anchor) => anchor.href || '') - .filter((href) => /\\/zgbs\\//.test(href) && /(?:[?&]pg=|ref=zg_bs_pg_)/.test(href)), - }))() - `) as BestsellersPagePayload; -} - -cli({ - site: 'amazon', - name: 'bestsellers', +cli(createRankingCliOptions({ + commandName: 'bestsellers', + listType: 'bestsellers', description: 'Amazon Best Sellers pages for category candidate discovery', - domain: 'amazon.com', - strategy: Strategy.COOKIE, - navigateBefore: false, - args: [ - { - name: 'input', - positional: true, - help: 'Best sellers URL or /zgbs path. Omit to use the root Best Sellers page.', - }, - { - name: 'limit', - type: 'int', - default: 100, - help: 'Maximum number of ranked items to return (default 100)', - }, - ], - columns: ['rank', 'asin', 'title', 'price_text', 'rating_value', 'review_count'], - func: async (page, kwargs) => { - const limit = Math.max(1, Number(kwargs.limit) || 100); - const initialUrl = resolveBestsellersUrl(typeof kwargs.input === 'string' ? kwargs.input : undefined); - - const queue = [initialUrl]; - const visited = new Set(); - const seenAsins = new Set(); - const results: Record[] = []; - let listTitle: string | null = null; - - while (queue.length > 0 && results.length < limit) { - const nextUrl = queue.shift()!; - if (visited.has(nextUrl)) continue; - visited.add(nextUrl); - - const payload = await readBestsellersPage(page, nextUrl); - const sourceUrl = cleanText(payload.href) || nextUrl; - listTitle = cleanText(payload.list_title) || cleanText(payload.title) || listTitle; - const cards = payload.cards ?? []; - - for (const card of cards) { - const normalized = normalizeBestsellerCandidate(card, results.length + 1, listTitle, sourceUrl); - const asin = cleanText(String(normalized.asin ?? '')); - if (!asin || seenAsins.has(asin)) continue; - seenAsins.add(asin); - results.push(normalized); - if (results.length >= limit) break; - } - - const pageLinks = uniqueNonEmpty(payload.page_links ?? []); - for (const href of pageLinks) { - if (!visited.has(href) && !queue.includes(href)) { - queue.push(href); - } - } - } - - if (results.length === 0) { - throw new CommandExecutionError( - 'amazon bestsellers did not expose any ranked items', - 'Open the same best sellers page in Chrome, verify it is a real Amazon ranking page, and retry.', - ); - } - - return results.slice(0, limit); - }, -}); - -export const __test__ = { - normalizeBestsellerCandidate, -}; +})); diff --git a/src/clis/amazon/movers-shakers.ts b/src/clis/amazon/movers-shakers.ts new file mode 100644 index 00000000..450ef0ff --- /dev/null +++ b/src/clis/amazon/movers-shakers.ts @@ -0,0 +1,8 @@ +import { cli } from '../../registry.js'; +import { createRankingCliOptions } from './rankings.js'; + +cli(createRankingCliOptions({ + commandName: 'movers-shakers', + listType: 'movers_shakers', + description: 'Amazon Movers & Shakers pages for short-term growth signals', +})); diff --git a/src/clis/amazon/new-releases.ts b/src/clis/amazon/new-releases.ts new file mode 100644 index 00000000..9b444041 --- /dev/null +++ b/src/clis/amazon/new-releases.ts @@ -0,0 +1,8 @@ +import { cli } from '../../registry.js'; +import { createRankingCliOptions } from './rankings.js'; + +cli(createRankingCliOptions({ + commandName: 'new-releases', + listType: 'new_releases', + description: 'Amazon New Releases pages for early momentum discovery', +})); diff --git a/src/clis/amazon/rankings.test.ts b/src/clis/amazon/rankings.test.ts new file mode 100644 index 00000000..fc7186a7 --- /dev/null +++ b/src/clis/amazon/rankings.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './rankings.js'; + +describe('amazon rankings helpers', () => { + it('normalizes ranking candidates with unified schema', () => { + const result = __test__.normalizeRankingCandidate( + { + rank_text: '#3', + asin: 'B0DR31GC3D', + title: 'Desk Shelves Desktop Organizer', + href: 'https://www.amazon.com/dp/B0DR31GC3D/ref=zg_bs', + price_text: '$25.92', + rating_text: '4.3 out of 5 stars', + review_count_text: '435', + }, + { + listType: 'new_releases', + rankFallback: 3, + listTitle: 'Amazon New Releases', + sourceUrl: 'https://www.amazon.com/gp/new-releases', + categoryTitle: 'Home & Kitchen', + categoryUrl: 'https://www.amazon.com/gp/new-releases/home-garden', + categoryPath: ['Home & Kitchen'], + visibleCategoryLinks: [{ title: 'Storage', url: 'https://www.amazon.com/gp/new-releases/storage', node_id: null }], + }, + ); + + expect(result.list_type).toBe('new_releases'); + expect(result.rank).toBe(3); + expect(result.asin).toBe('B0DR31GC3D'); + expect(result.product_url).toBe('https://www.amazon.com/dp/B0DR31GC3D'); + expect(result.category_title).toBe('Home & Kitchen'); + expect(result.visible_category_links).toEqual([ + { title: 'Storage', url: 'https://www.amazon.com/gp/new-releases/storage', node_id: null }, + ]); + }); + + it('deduplicates category links and parses rank fallback', () => { + const links = __test__.normalizeVisibleCategoryLinks([ + { title: 'Kitchen', url: '/gp/new-releases/home-garden' }, + { title: 'Kitchen', url: 'https://www.amazon.com/gp/new-releases/home-garden' }, + { title: 'Storage', url: '/gp/new-releases/storage', node_id: '1064954' }, + ]); + expect(links.length).toBe(2); + expect(__test__.parseRank('N/A', 8)).toBe(8); + }); +}); diff --git a/src/clis/amazon/rankings.ts b/src/clis/amazon/rankings.ts new file mode 100644 index 00000000..0ba83155 --- /dev/null +++ b/src/clis/amazon/rankings.ts @@ -0,0 +1,312 @@ +import { CommandExecutionError } from '../../errors.js'; +import { Strategy, type CliOptions } from '../../registry.js'; +import type { IPage } from '../../types.js'; +import { + assertUsableState, + buildProvenance, + cleanText, + extractAsin, + extractCategoryNodeId, + extractReviewCountFromCardText, + firstMeaningfulLine, + gotoAndReadState, + isRankingPaginationUrl, + normalizeProductUrl, + parsePriceText, + parseRatingValue, + parseReviewCount, + resolveRankingUrl, + toAbsoluteAmazonUrl, + uniqueNonEmpty, + type AmazonRankingListType, +} from './shared.js'; + +export interface RankingCardPayload { + rank_text?: string | null; + asin?: string | null; + title?: string | null; + href?: string | null; + price_text?: string | null; + rating_text?: string | null; + review_count_text?: string | null; + card_text?: string | null; +} + +interface RankingPagePayload { + href?: string; + title?: string; + list_title?: string; + category_title?: string; + category_path?: string[]; + cards?: RankingCardPayload[]; + page_links?: string[]; + visible_category_links?: Array<{ + title?: string | null; + url?: string | null; + node_id?: string | null; + }>; +} + +interface RankingCommandDefinition { + commandName: string; + listType: AmazonRankingListType; + description: string; +} + +interface RankingNormalizeContext { + listType: AmazonRankingListType; + rankFallback: number; + listTitle: string | null; + sourceUrl: string; + categoryTitle: string | null; + categoryUrl: string | null; + categoryPath: string[]; + visibleCategoryLinks: Array<{ title: string; url: string; node_id: string | null }>; +} + +function parseRank(rawRank: string | null | undefined, fallback: number): number { + const normalized = cleanText(rawRank); + const match = normalized.match(/(\d{1,4})/); + if (!match) return fallback; + const parsed = Number.parseInt(match[1], 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; +} + +function normalizeVisibleCategoryLinks( + links: RankingPagePayload['visible_category_links'], +): Array<{ title: string; url: string; node_id: string | null }> { + const normalized = (links ?? []) + .map((entry) => ({ + title: cleanText(entry?.title), + url: toAbsoluteAmazonUrl(entry?.url) ?? '', + node_id: cleanText(entry?.node_id) || extractCategoryNodeId(entry?.url) || null, + })) + .filter((entry) => Boolean(entry.title) && Boolean(entry.url)); + + const seen = new Set(); + const deduped: Array<{ title: string; url: string; node_id: string | null }> = []; + for (const entry of normalized) { + if (seen.has(entry.url)) continue; + seen.add(entry.url); + deduped.push(entry); + } + return deduped; +} + +export function normalizeRankingCandidate( + candidate: RankingCardPayload, + context: RankingNormalizeContext, +): Record { + const productUrl = normalizeProductUrl(candidate.href); + const asin = extractAsin(candidate.asin ?? '') ?? extractAsin(productUrl ?? '') ?? null; + const title = cleanText(candidate.title) || firstMeaningfulLine(candidate.card_text); + const price = parsePriceText(cleanText(candidate.price_text) || candidate.card_text); + const ratingText = cleanText(candidate.rating_text) || null; + const reviewCountText = cleanText(candidate.review_count_text) + || extractReviewCountFromCardText(candidate.card_text) + || null; + const provenance = buildProvenance(context.sourceUrl); + const categoryUrl = context.categoryUrl || context.sourceUrl; + + return { + list_type: context.listType, + rank: parseRank(candidate.rank_text, context.rankFallback), + asin, + title: title || null, + product_url: productUrl, + price_text: price.price_text, + price_value: price.price_value, + currency: price.currency, + rating_text: ratingText, + rating_value: parseRatingValue(ratingText), + review_count_text: reviewCountText, + review_count: parseReviewCount(reviewCountText), + list_title: context.listTitle, + category_title: context.categoryTitle, + category_url: categoryUrl, + category_node_id: extractCategoryNodeId(categoryUrl), + category_path: context.categoryPath, + visible_category_links: context.visibleCategoryLinks, + ...provenance, + }; +} + +async function readRankingPage( + page: IPage, + listType: AmazonRankingListType, + url: string, +): Promise { + const state = await gotoAndReadState(page, url, 2500, listType); + assertUsableState(state, listType); + + return await page.evaluate(` + (() => ({ + href: window.location.href, + title: document.title || '', + list_title: + document.querySelector('#zg_banner_text')?.textContent + || document.querySelector('h1')?.textContent + || '', + category_title: + document.querySelector('#zg_browseRoot .zg_selected')?.textContent + || document.querySelector('#wayfinding-breadcrumbs_feature_div ul li:last-child')?.textContent + || document.querySelector('#wayfinding-breadcrumbs_container ul li:last-child')?.textContent + || '', + category_path: Array.from(document.querySelectorAll( + '#zg_browseRoot ul li a, #zg_browseRoot ul li span, ' + + '#wayfinding-breadcrumbs_feature_div ul li a, #wayfinding-breadcrumbs_feature_div ul li span.a-list-item, ' + + '#wayfinding-breadcrumbs_container ul li a, #wayfinding-breadcrumbs_container ul li span.a-list-item' + )) + .map((entry) => (entry.textContent || '').trim()) + .filter(Boolean), + cards: Array.from(document.querySelectorAll( + '.p13n-sc-uncoverable-faceout, .zg-grid-general-faceout, [data-asin][class*="p13n"]' + )).map((card) => ({ + rank_text: + card.querySelector('.zg-bdg-text')?.textContent + || card.querySelector('[class*="rank"]')?.textContent + || '', + asin: + card.getAttribute('data-asin') + || card.getAttribute('id') + || '', + title: + card.querySelector('[class*="line-clamp"]')?.textContent + || card.querySelector('img')?.getAttribute('alt') + || card.querySelector('a[href*="/dp/"]')?.textContent + || '', + href: + card.querySelector('a[href*="/dp/"], a[href*="/gp/product/"]')?.href + || '', + price_text: + card.querySelector('.a-price .a-offscreen')?.textContent + || card.querySelector('.a-color-price')?.textContent + || '', + rating_text: + card.querySelector('[aria-label*="out of 5 stars"]')?.getAttribute('aria-label') + || '', + review_count_text: + card.querySelector('a[href*="#customerReviews"]')?.textContent + || card.querySelector('.a-size-small')?.textContent + || '', + card_text: card.innerText || '', + })), + page_links: Array.from(document.querySelectorAll('.a-pagination a[href], li.a-normal a[href], li.a-selected a[href]')) + .map((anchor) => anchor.href || '') + .filter(Boolean), + visible_category_links: Array.from(document.querySelectorAll( + '#zg_browseRoot a[href], #zg-left-col a[href], [class*="zg-browse"] a[href]' + )).map((anchor) => ({ + title: (anchor.textContent || '').trim(), + url: anchor.href || '', + node_id: + anchor.getAttribute('data-node-id') + || anchor.dataset?.nodeid + || '', + })) + .filter((entry) => entry.title && entry.url), + }))() + `) as RankingPagePayload; +} + +function createEmptyResultHint(commandName: string): string { + return [ + `Open the same Amazon ${commandName} page in shared Chrome and verify ranked items are visible.`, + 'If the page shows a robot check, clear it manually and retry.', + ].join(' '); +} + +export function createRankingCliOptions(definition: RankingCommandDefinition): CliOptions { + return { + site: 'amazon', + name: definition.commandName, + description: definition.description, + domain: 'amazon.com', + strategy: Strategy.COOKIE, + navigateBefore: false, + args: [ + { + name: 'input', + positional: true, + help: 'Ranking URL or supported Amazon path. Omit to use the list root.', + }, + { + name: 'limit', + type: 'int', + default: 100, + help: 'Maximum number of ranked items to return (default 100)', + }, + ], + columns: ['list_type', 'rank', 'asin', 'title', 'price_text', 'rating_value', 'review_count'], + func: async (page, kwargs) => { + const limit = Math.max(1, Number(kwargs.limit) || 100); + const initialUrl = resolveRankingUrl(definition.listType, typeof kwargs.input === 'string' ? kwargs.input : undefined); + + const queue = [initialUrl]; + const visited = new Set(); + const seenEntityKeys = new Set(); + const results: Record[] = []; + let listTitle: string | null = null; + + while (queue.length > 0 && results.length < limit) { + const nextUrl = queue.shift()!; + if (visited.has(nextUrl)) continue; + visited.add(nextUrl); + + const payload = await readRankingPage(page, definition.listType, nextUrl); + const sourceUrl = cleanText(payload.href) || nextUrl; + listTitle = cleanText(payload.list_title) || cleanText(payload.title) || listTitle; + const categoryPath = uniqueNonEmpty(payload.category_path ?? []); + const categoryTitle = cleanText(payload.category_title) + || (categoryPath.length > 0 ? categoryPath[categoryPath.length - 1] : ''); + const visibleCategoryLinks = normalizeVisibleCategoryLinks(payload.visible_category_links); + const cards = payload.cards ?? []; + + for (const card of cards) { + const normalized = normalizeRankingCandidate(card, { + listType: definition.listType, + rankFallback: results.length + 1, + listTitle, + sourceUrl, + categoryTitle: categoryTitle || null, + categoryUrl: sourceUrl, + categoryPath, + visibleCategoryLinks, + }); + + const dedupeKey = cleanText(String(normalized.asin ?? '')) + || cleanText(String(normalized.product_url ?? '')); + if (dedupeKey && seenEntityKeys.has(dedupeKey)) continue; + if (dedupeKey) seenEntityKeys.add(dedupeKey); + + results.push(normalized); + if (results.length >= limit) break; + } + + const pageLinks = uniqueNonEmpty(payload.page_links ?? []); + for (const href of pageLinks) { + const absolute = toAbsoluteAmazonUrl(href); + if (!absolute || !isRankingPaginationUrl(definition.listType, absolute)) continue; + if (!visited.has(absolute) && !queue.includes(absolute)) { + queue.push(absolute); + } + } + } + + if (results.length === 0) { + throw new CommandExecutionError( + `amazon ${definition.commandName} did not expose any ranked items`, + createEmptyResultHint(definition.commandName), + ); + } + + return results.slice(0, limit); + }, + }; +} + +export const __test__ = { + parseRank, + normalizeVisibleCategoryLinks, + normalizeRankingCandidate, +}; diff --git a/src/clis/amazon/shared.test.ts b/src/clis/amazon/shared.test.ts index 1c5f5f86..3c6a7e3c 100644 --- a/src/clis/amazon/shared.test.ts +++ b/src/clis/amazon/shared.test.ts @@ -34,4 +34,20 @@ describe('amazon shared helpers', () => { expect(__test__.resolveBestsellersUrl('/Best-Sellers/zgbs')).toBe('https://www.amazon.com/Best-Sellers/zgbs'); expect(() => __test__.resolveBestsellersUrl('desk shelf organizer')).toThrow('amazon bestsellers expects a best sellers URL or /zgbs path'); }); + + it('resolves and validates all ranking list URLs', () => { + expect(__test__.resolveRankingUrl('new_releases')).toBe('https://www.amazon.com/gp/new-releases'); + expect(__test__.resolveRankingUrl('movers_shakers')).toBe('https://www.amazon.com/gp/movers-and-shakers'); + expect(__test__.resolveRankingUrl('new_releases', '/gp/new-releases/kitchen')).toBe('https://www.amazon.com/gp/new-releases/kitchen'); + expect(__test__.resolveRankingUrl( + 'bestsellers', + 'https://www.amazon.com/Best-Sellers/zgbs/ref=zg_bsnr_tab_bs', + )).toBe('https://www.amazon.com/Best-Sellers/zgbs'); + expect(() => __test__.resolveRankingUrl('movers_shakers', 'https://example.com/gp/movers-and-shakers')).toThrow('Invalid Amazon URL'); + }); + + it('extracts category node id from URL best effort', () => { + expect(__test__.extractCategoryNodeId('https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/3744371')).toBe('3744371'); + expect(__test__.extractCategoryNodeId('https://www.amazon.com/s?k=desk+organizer&rh=n%3A1064954')).toBe('1064954'); + }); }); diff --git a/src/clis/amazon/shared.ts b/src/clis/amazon/shared.ts index ca30e36e..e498cd60 100644 --- a/src/clis/amazon/shared.ts +++ b/src/clis/amazon/shared.ts @@ -5,6 +5,8 @@ export const SITE = 'amazon'; export const DOMAIN = 'amazon.com'; export const HOME_URL = 'https://www.amazon.com/'; export const BESTSELLERS_URL = 'https://www.amazon.com/Best-Sellers/zgbs'; +export const NEW_RELEASES_URL = 'https://www.amazon.com/gp/new-releases'; +export const MOVERS_SHAKERS_URL = 'https://www.amazon.com/gp/movers-and-shakers'; export const SEARCH_URL_PREFIX = 'https://www.amazon.com/s?k='; export const PRODUCT_URL_PREFIX = 'https://www.amazon.com/dp/'; export const DISCUSSION_URL_PREFIX = 'https://www.amazon.com/product-reviews/'; @@ -28,6 +30,40 @@ const ROBOT_TEXT_PATTERNS = [ 'To discuss automated access to Amazon data please contact', ]; +export type AmazonRankingListType = 'bestsellers' | 'new_releases' | 'movers_shakers'; + +interface AmazonRankingSpec { + commandName: string; + rootUrl: string; + pathPattern: RegExp; + invalidInputMessage: string; + invalidInputHint: string; +} + +const AMAZON_RANKING_SPECS: Record = { + bestsellers: { + commandName: 'bestsellers', + rootUrl: BESTSELLERS_URL, + pathPattern: /(?:^|\/)zgbs(?:\/|$)/i, + invalidInputMessage: 'amazon bestsellers expects a best sellers URL or /zgbs path', + invalidInputHint: 'Example: opencli amazon bestsellers https://www.amazon.com/Best-Sellers/zgbs', + }, + new_releases: { + commandName: 'new-releases', + rootUrl: NEW_RELEASES_URL, + pathPattern: /\/gp\/new-releases(?:\/|$)/i, + invalidInputMessage: 'amazon new-releases expects a new releases URL or /gp/new-releases path', + invalidInputHint: 'Example: opencli amazon new-releases https://www.amazon.com/gp/new-releases', + }, + movers_shakers: { + commandName: 'movers-shakers', + rootUrl: MOVERS_SHAKERS_URL, + pathPattern: /\/gp\/movers-and-shakers(?:\/|$)/i, + invalidInputMessage: 'amazon movers-shakers expects a movers-and-shakers URL or /gp/movers-and-shakers path', + invalidInputHint: 'Example: opencli amazon movers-shakers https://www.amazon.com/gp/movers-and-shakers', + }, +}; + export interface ProvenanceFields { source_url: string; fetched_at: string; @@ -115,23 +151,105 @@ export function buildDiscussionUrl(input: string): string { return `${DISCUSSION_URL_PREFIX}${asin}`; } -export function resolveBestsellersUrl(input?: string): string { +function getRankingSpec(listType: AmazonRankingListType): AmazonRankingSpec { + return AMAZON_RANKING_SPECS[listType]; +} + +export function isSupportedRankingPath(listType: AmazonRankingListType, inputUrl: string): boolean { + try { + const url = new URL(inputUrl); + return getRankingSpec(listType).pathPattern.test(url.pathname); + } catch { + return false; + } +} + +export function resolveRankingUrl(listType: AmazonRankingListType, input?: string): string { + const spec = getRankingSpec(listType); const normalized = cleanText(input); - if (!normalized) return BESTSELLERS_URL; - if (normalized === 'root') return BESTSELLERS_URL; + if (!normalized || normalized === 'root') return spec.rootUrl; + + let candidateUrl: string; if (normalized.startsWith('/')) { - return new URL(normalized, HOME_URL).toString(); + candidateUrl = new URL(normalized, HOME_URL).toString(); + } else if (/^https?:\/\//i.test(normalized)) { + candidateUrl = canonicalizeAmazonUrl(normalized); + } else if (normalized.includes('amazon.') && normalized.includes('/')) { + candidateUrl = canonicalizeAmazonUrl(`https://${normalized.replace(/^\/+/, '')}`); + } else { + throw new ArgumentError(spec.invalidInputMessage, spec.invalidInputHint); } - if (/^https?:\/\//i.test(normalized)) { - return canonicalizeAmazonUrl(normalized); + + if (!isSupportedRankingPath(listType, candidateUrl)) { + throw new ArgumentError(spec.invalidInputMessage, spec.invalidInputHint); } - if (normalized.includes('/zgbs/')) { - return canonicalizeAmazonUrl(`https://${normalized.replace(/^\/+/, '')}`); + return normalizeRankingInputUrl(candidateUrl); +} + +function normalizeRankingInputUrl(inputUrl: string): string { + try { + const url = new URL(inputUrl); + const normalizedPathSegments = url.pathname + .split('/') + .filter(Boolean) + .filter((segment) => !/^ref=/i.test(segment)); + url.pathname = `/${normalizedPathSegments.join('/')}`; + url.hash = ''; + // Ranking pages are frequently shared with tracking refs that can land on unstable variants. + // Dropping ref keeps the canonical ranking path while preserving useful params (for example pg=2). + url.searchParams.delete('ref'); + return url.toString(); + } catch { + return inputUrl; } - throw new ArgumentError( - 'amazon bestsellers expects a best sellers URL or /zgbs path', - 'Example: opencli amazon bestsellers https://www.amazon.com/Best-Sellers/zgbs', - ); +} + +export function isRankingPaginationUrl(listType: AmazonRankingListType, inputUrl: string): boolean { + const absolute = toAbsoluteAmazonUrl(inputUrl); + if (!absolute || !isSupportedRankingPath(listType, absolute)) return false; + + try { + const url = new URL(absolute); + const ref = cleanText(url.searchParams.get('ref')).toLowerCase(); + // pg= query param is the most reliable pagination indicator across all ranking lists + return url.searchParams.has('pg') + || /(?:^|_)pg(?:_|$)/.test(ref) + // Amazon ranking pagination refs: zg_bs_pg_ (bestsellers), zg_bsnr_pg_ (new releases), zg_bsms_pg_ (movers & shakers) + || /zg_bs(?:nr|ms)?_pg_/.test(ref); + } catch { + return false; + } +} + +export function extractCategoryNodeId(inputUrl: string | null | undefined): string | null { + const absolute = toAbsoluteAmazonUrl(inputUrl); + if (!absolute) return null; + + try { + const url = new URL(absolute); + + for (const key of ['node', 'nodeid', 'nodeId', 'browseNode']) { + const value = cleanText(url.searchParams.get(key)); + if (/^\d{4,}$/.test(value)) return value; + } + + const rhValue = cleanText(url.searchParams.get('rh')); + const rhMatch = decodeURIComponent(rhValue).match(/(?:^|,)\s*n:(\d{4,})(?:,|$)/i); + if (rhMatch) return rhMatch[1]; + + const pathMatches = [...url.pathname.matchAll(/\/(\d{4,})(?=\/|$)/g)]; + if (pathMatches.length > 0) { + return pathMatches[pathMatches.length - 1][1]; + } + } catch { + return null; + } + + return null; +} + +export function resolveBestsellersUrl(input?: string): string { + return resolveRankingUrl('bestsellers', input); } export function canonicalizeAmazonUrl(input: string): string { @@ -305,6 +423,10 @@ export const __test__ = { buildProductUrl, buildDiscussionUrl, resolveBestsellersUrl, + resolveRankingUrl, + isSupportedRankingPath, + isRankingPaginationUrl, + extractCategoryNodeId, parsePriceText, parseRatingValue, parseReviewCount,