const axios = require('axios'); const FIRECRAWL_BASE_URL = 'https://api.firecrawl.dev/v2'; const DEFAULT_PRODUCT_PAGE_LIMIT = 5; const HOMEPAGE_FORMATS = ['markdown', 'links', 'images', 'branding']; const DISCOVERY_PAGE_FORMATS = ['markdown', 'links', 'images']; const CONTENT_PAGE_FORMATS = ['markdown', 'images']; function getApiKey() { const apiKey = process.env.FIRECRAWL_API_KEY; if (!apiKey) throw new Error('FIRECRAWL_API_KEY not set'); return apiKey; } function getClient() { return axios.create({ baseURL: FIRECRAWL_BASE_URL, timeout: 60000, headers: { Authorization: `Bearer ${getApiKey()}`, 'Content-Type': 'application/json', }, validateStatus: () => true, }); } function normalizeDataEnvelope(data) { if (data && typeof data === 'object' && data.data && typeof data.data === 'object') { return data.data; } return data && typeof data === 'object' ? data : {}; } function normalizeText(value) { return typeof value === 'string' ? value.trim() : ''; } function normalizeUrl(rawUrl, baseUrl = '') { const value = normalizeText(rawUrl); if (!value) return ''; try { const url = baseUrl ? new URL(value, baseUrl) : new URL(value); url.hash = ''; url.search = ''; return url.toString().replace(/\/$/, ''); } catch { return ''; } } function normalizePageResult(page = {}, pageType = '') { const metadata = page?.metadata && typeof page.metadata === 'object' ? page.metadata : {}; return { url: normalizeUrl(page.url), pageType: normalizeText(pageType || page.pageType), markdown: typeof page.markdown === 'string' ? page.markdown : '', summary: typeof page.summary === 'string' ? page.summary : '', metadata, links: Array.isArray(page.links) ? page.links : [], images: Array.isArray(page.images) ? page.images : [], branding: page?.branding && typeof page.branding === 'object' ? page.branding : {}, }; } function normalizeLinkItem(link, baseUrl = '') { if (typeof link === 'string') { const href = normalizeUrl(link, baseUrl); return href ? { href, label: href } : null; } if (!link || typeof link !== 'object') return null; const href = normalizeUrl(link.href || link.url || link.link, baseUrl); if (!href) return null; return { href, label: normalizeText(link.text || link.title || link.label) || href, }; } function dedupeLinkItems(links, baseUrl = '') { const seen = new Set(); return (Array.isArray(links) ? links : []) .map((link) => normalizeLinkItem(link, baseUrl)) .filter((link) => { if (!link || seen.has(link.href)) return false; seen.add(link.href); return true; }); } function getHostname(url) { try { return new URL(url).hostname; } catch { return ''; } } function getPathname(url) { try { return new URL(url).pathname.toLowerCase(); } catch { return ''; } } function isSameDomain(url, startUrl) { return getHostname(url) === getHostname(startUrl); } function isUtilityLink(link) { const value = `${normalizeText(link?.href)} ${normalizeText(link?.label)}`.toLowerCase(); return /(login|sign[- ]?in|sign[- ]?up|account|cart|checkout|wishlist|track|tracking|privacy|refund|return|shipping|terms|policy|policies|contact|support|help|faq|blog|blogs|journal|careers?|gift card|stores?)/i.test(value); } function isAboutLink(link) { const value = `${normalizeText(link?.href)} ${normalizeText(link?.label)}`.toLowerCase(); return /(about|about-us|our story|our-story|story|brand story|who we are|who-we-are)/i.test(value); } function isDiscoveryLink(link) { const value = `${normalizeText(link?.href)} ${normalizeText(link?.label)}`.toLowerCase(); return /(shop|products?|collections?|catalog|storefront|category|categories|new arrivals|best sellers|featured)/i.test(value); } function scoreProductCandidate(link) { if (!link || isUtilityLink(link) || isAboutLink(link) || isDiscoveryLink(link)) return -100; const href = normalizeText(link.href).toLowerCase(); const label = normalizeText(link.label); const pathname = getPathname(href); const segments = pathname.split('/').filter(Boolean); let score = 0; if (/(^|\/)(product|products|p|item|items|buy)(\/|$)/i.test(pathname)) score += 12; if (/-\d{4,}$/.test(pathname) || /\d{4,}/.test(pathname)) score += 5; if (pathname.includes('/collections/') || pathname.includes('/category/')) score -= 8; if (segments.length >= 2) score += 3; if (label.length >= 8 && label.length <= 120) score += 3; if (/buy|shop now|view product|details/i.test(label)) score += 4; if (href.split('-').length >= 4) score += 2; return score; } function getSameDomainLinks(links, startUrl) { return dedupeLinkItems(links, startUrl).filter((link) => isSameDomain(link.href, startUrl)); } function selectAboutUrl(links, startUrl) { return getSameDomainLinks(links, startUrl).find(isAboutLink)?.href || ''; } function selectDiscoveryUrl(links, startUrl) { return getSameDomainLinks(links, startUrl) .filter((link) => isDiscoveryLink(link) && !isUtilityLink(link)) .map((link) => link.href)[0] || ''; } function selectProductUrls(links, startUrl, limit = DEFAULT_PRODUCT_PAGE_LIMIT) { return getSameDomainLinks(links, startUrl) .map((link) => ({ ...link, score: scoreProductCandidate(link) })) .filter((link) => link.score > 0) .sort((left, right) => right.score - left.score) .slice(0, limit) .map((link) => link.href); } async function scrapePage(url, { formats, onlyMainContent = true, pageType = '' } = {}) { const client = getClient(); const response = await client.post('/scrape', { url, formats, onlyMainContent, }); if (response.status < 200 || response.status >= 300) { throw new Error(`Firecrawl scrape failed for ${url} with status ${response.status}: ${JSON.stringify(response.data)}`); } return normalizePageResult(normalizeDataEnvelope(response.data), pageType); } async function buildBrandContextPlan(startUrl) { const normalizedStartUrl = normalizeUrl(startUrl); const homepage = await scrapePage(normalizedStartUrl, { formats: HOMEPAGE_FORMATS, onlyMainContent: false, pageType: 'home', }); const homepageLinks = getSameDomainLinks(homepage.links, normalizedStartUrl); const aboutUrl = selectAboutUrl(homepageLinks, normalizedStartUrl); const directProductUrls = selectProductUrls(homepageLinks, normalizedStartUrl); const discoveryUrl = directProductUrls.length === 0 ? selectDiscoveryUrl(homepageLinks, normalizedStartUrl) : ''; return { startUrl: normalizedStartUrl, homepage, aboutUrl, discoveryUrl, productUrls: directProductUrls, }; } async function collectBrandContextPages(plan) { const startUrl = normalizeUrl(plan?.startUrl); if (!startUrl || !plan?.homepage) { throw new Error('Brand context plan is missing a homepage scrape'); } let discoveryPage = null; let productUrls = Array.isArray(plan.productUrls) ? plan.productUrls.slice(0, DEFAULT_PRODUCT_PAGE_LIMIT) : []; if (productUrls.length === 0 && normalizeText(plan.discoveryUrl)) { discoveryPage = await scrapePage(plan.discoveryUrl, { formats: DISCOVERY_PAGE_FORMATS, onlyMainContent: true, pageType: 'discovery', }); productUrls = selectProductUrls(discoveryPage.links, startUrl); } const aboutPromise = normalizeText(plan.aboutUrl) ? scrapePage(plan.aboutUrl, { formats: CONTENT_PAGE_FORMATS, onlyMainContent: true, pageType: 'about', }) : Promise.resolve(null); const productPromises = productUrls .slice(0, DEFAULT_PRODUCT_PAGE_LIMIT) .map((productUrl) => scrapePage(productUrl, { formats: CONTENT_PAGE_FORMATS, onlyMainContent: true, pageType: 'product', }).catch(() => null)); const [aboutPage, productResults] = await Promise.all([ aboutPromise, Promise.all(productPromises), ]); const productPages = productResults.filter(Boolean); const items = [ plan.homepage, aboutPage, discoveryPage, ...productPages, ].filter(Boolean); return { startUrl, homepage: plan.homepage, aboutPage, discoveryPage, productPages, items, }; } module.exports = { scrapePage, buildBrandContextPlan, collectBrandContextPages, };