/**
 * Matches hrefs that point at a known social network.
 *
 * `x.com` is anchored to a host boundary (start of string, `//`, `.` or `@`
 * before it; `/`, `:`, `?`, `#` or end of string after it) so that unrelated
 * hosts which merely end in "x.com" (dropbox.com, netflix.com, wix.com, ...)
 * are not reported as social links. The other names stay substring matches.
 */
const SOCIAL_LINK_PATTERN =
  /instagram|facebook|(?:^|\/\/|[.@])x\.com(?=[/:?#]|$)|twitter|linkedin|youtube|pinterest/i;

/**
 * Trim a string; any non-string value becomes the empty string.
 *
 * @param {*} value
 * @returns {string}
 */
function normalizeText(value) {
  return typeof value === 'string' ? value.trim() : '';
}

/**
 * Return `value` when it is an array, otherwise a new empty array.
 *
 * @param {*} value
 * @returns {Array}
 */
function normalizeList(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * True for any non-null value whose `typeof` is 'object' (arrays included).
 *
 * @param {*} value
 * @returns {boolean}
 */
function isObject(value) {
  return value !== null && typeof value === 'object';
}

/**
 * Return the first candidate that is a non-empty string after trimming.
 *
 * Unlike an `a || b` chain, a whitespace-only or non-string candidate does
 * not block a later, usable candidate.
 *
 * @param {...*} candidates
 * @returns {string} The trimmed text, or '' when no candidate qualifies.
 */
function firstText(...candidates) {
  for (const candidate of candidates) {
    const text = normalizeText(candidate);
    if (text) return text;
  }
  return '';
}

/**
 * Trim every entry, drop empties and non-strings, and remove duplicates while
 * keeping first-seen order.
 *
 * @param {*} values
 * @returns {string[]}
 */
function uniqueStrings(values) {
  const seen = new Set();
  const result = [];
  for (const raw of normalizeList(values)) {
    const text = normalizeText(raw);
    if (text && !seen.has(text)) {
      seen.add(text);
      result.push(text);
    }
  }
  return result;
}

/**
 * Check for a `#`-prefixed hex color with 3, 6 or 8 hex digits.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isHexColor(value) {
  return /^#(?:[0-9a-f]{3}|[0-9a-f]{6}|[0-9a-f]{8})$/i.test(normalizeText(value));
}

/**
 * Build a `{ name, hex }` color entry, or null when `value` is not a hex color.
 *
 * @param {*} name - Label for the color; falls back to 'color' when empty.
 * @param {*} value - Candidate hex string.
 * @returns {{name: string, hex: string}|null} `hex` is upper-cased.
 */
function toColorEntry(name, value) {
  const hex = normalizeText(value);
  if (!isHexColor(hex)) return null;
  return {
    name: normalizeText(name) || 'color',
    hex: hex.toUpperCase(),
  };
}

/**
 * Extract the lower-cased hostname of `url` without a leading "www.".
 *
 * @param {*} url
 * @returns {string} '' when `url` cannot be parsed by `URL`.
 */
function extractHostname(url) {
  try {
    return new URL(url).hostname.replace(/^www\./i, '').toLowerCase();
  } catch {
    // Unparseable input is an expected case here, not an error.
    return '';
  }
}

/**
 * Pick a short text excerpt for a page: the summary (max 800 chars) when
 * present, otherwise the markdown with runs of 3+ newlines collapsed to two
 * (max 1600 chars).
 *
 * @param {*} page
 * @returns {string}
 */
function excerptText(page) {
  const summary = normalizeText(page?.summary);
  if (summary) return summary.slice(0, 800);
  return normalizeText(page?.markdown)
    .replace(/\n{3,}/g, '\n\n')
    .slice(0, 1600);
}

/**
 * Normalize a link given either as a string or as an object.
 *
 * For objects the href is read from `href`, `url` or `link` and the label from
 * `text`, `title` or `label`; the first usable string wins in each group.
 *
 * @param {*} link
 * @returns {{href: string, label: string}|null} null when no href is found.
 */
function normalizeLinkItem(link) {
  if (typeof link === 'string') {
    const href = normalizeText(link);
    return href ? { href, label: href } : null;
  }
  if (!isObject(link)) return null;

  const href = firstText(link.href, link.url, link.link);
  if (!href) return null;
  return {
    href,
    label: firstText(link.text, link.title, link.label) || href,
  };
}

/**
 * Normalize a list of links and keep only the first entry per href.
 *
 * @param {*} links
 * @returns {Array<{href: string, label: string}>}
 */
function dedupeLinks(links) {
  const seen = new Set();
  const result = [];
  for (const raw of normalizeList(links)) {
    const link = normalizeLinkItem(raw);
    if (link && !seen.has(link.href)) {
      seen.add(link.href);
      result.push(link);
    }
  }
  return result;
}

/**
 * Reduce a raw page to the fields used in the crawl summary.
 *
 * @param {*} page - Raw page; `url`, `metadata`, `summary`, `markdown`,
 *   `links` and `images` are read when present.
 * @param {string} pageType - Stored as-is in the `type` field.
 * @returns {{url: string, type: string, title: string, description: string,
 *   excerpt: string, linkCount: number, imageCount: number}}
 */
function summarizePage(page, pageType) {
  const metadata = isObject(page?.metadata) ? page.metadata : {};
  return {
    url: normalizeText(page?.url),
    type: pageType,
    title: firstText(metadata.title, metadata.ogTitle),
    description: firstText(metadata.description, metadata.ogDescription),
    excerpt: excerptText(page),
    linkCount: normalizeList(page?.links).length,
    imageCount: uniqueStrings(page?.images).length,
  };
}

/**
 * Map the summarized pages (home, about, products; missing ones skipped) to
 * `{ url, title, pageType, text }` blocks.
 *
 * @param {object|null} homepage
 * @param {object|null} aboutPage
 * @param {object[]} productPages
 * @returns {Array<{url: string, title: string, pageType: string, text: string}>}
 */
function buildRepresentativeTextBlocks(homepage, aboutPage, productPages) {
  return [homepage, aboutPage, ...productPages]
    .filter(Boolean)
    .map((page) => ({
      url: page.url,
      title: page.title,
      pageType: page.type,
      text: page.excerpt,
    }));
}

/**
 * Flatten the homepage's `branding` object into colors, logos and brand names.
 *
 * Colors are read from `colors`, `colorPalette` or `palette`, given either as
 * an array (of strings or objects) or as a name -> hex object. Entries that
 * are not valid hex colors are dropped.
 *
 * @param {*} homepage - Raw homepage; only `homepage.branding` is read.
 * @returns {{colors: string[], labeledColors: Array<{name: string, hex: string}>,
 *   logos: string[], brandNames: string[]}}
 */
function flattenBranding(homepage) {
  const branding = isObject(homepage?.branding) ? homepage.branding : {};
  const colorEntries = [];
  const addColor = (name, value) => {
    const entry = toColorEntry(name, value);
    if (entry) colorEntries.push(entry);
  };

  const colorSource = branding.colors || branding.colorPalette || branding.palette;
  if (Array.isArray(colorSource)) {
    colorSource.forEach((color, index) => {
      const fallbackName = `color_${index + 1}`;
      if (typeof color === 'string') {
        addColor(fallbackName, color);
      } else if (isObject(color)) {
        addColor(
          firstText(color.name, color.label, color.role) || fallbackName,
          firstText(color.hex, color.value, color.color)
        );
      }
    });
  } else if (isObject(colorSource)) {
    for (const [name, value] of Object.entries(colorSource)) {
      addColor(name, value);
    }
  }

  const logos = [];
  for (const logo of normalizeList(branding.logos || branding.logoUrls || branding.logo_urls)) {
    if (typeof logo === 'string') {
      logos.push(logo);
    } else if (isObject(logo)) {
      // Empty results are removed by uniqueStrings below.
      logos.push(firstText(logo.url, logo.src));
    }
  }

  const brandName = firstText(branding.brandName, branding.brand_name, branding.name);

  // Drop repeated (name, hex) pairs, keeping the first occurrence.
  const seenLabeled = new Set();
  const labeledColors = colorEntries.filter((entry) => {
    const key = JSON.stringify([entry.name, entry.hex]);
    if (seenLabeled.has(key)) return false;
    seenLabeled.add(key);
    return true;
  });

  return {
    colors: uniqueStrings(colorEntries.map((entry) => entry.hex)),
    labeledColors,
    logos: uniqueStrings(logos),
    brandNames: brandName ? [brandName] : [],
  };
}

/**
 * Render a plain-text overview: a "Site:" line followed by one paragraph per
 * summarized page, capped at 24000 characters.
 *
 * @param {string} startUrl
 * @param {object|null} homepage
 * @param {object|null} aboutPage
 * @param {object[]} productPages
 * @returns {string}
 */
function buildSummaryText(startUrl, homepage, aboutPage, productPages) {
  const blocks = [`Site: ${startUrl}`];
  [homepage, aboutPage, ...productPages].filter(Boolean).forEach((page, index) => {
    blocks.push([
      `Page ${index + 1}: ${page.title || page.url}`,
      `Type: ${page.type}`,
      page.description ? `Description: ${page.description}` : '',
      page.excerpt ? `Excerpt: ${page.excerpt}` : '',
    ].filter(Boolean).join('\n'));
  });
  return blocks.join('\n\n').slice(0, 24000);
}

/**
 * Build a normalized summary object from raw crawl data.
 *
 * @param {object} [data] - Raw crawl result. `startUrl`, `homepage`,
 *   `aboutPage`, `productPages` and `links.homepage` / `links.discovery` are
 *   read when present. A null or non-object value is treated as empty.
 * @param {string} [startUrlOverride] - Used instead of `data.startUrl` when it
 *   is a non-empty string.
 * @returns {object} Summary with page summaries, deduplicated links and images
 *   (images capped at 60, navigation at 30), social links, branding and a text
 *   overview. `contactPage`, `policyPages` and `screenshots` are always
 *   returned empty by this function.
 */
function buildCrawlSummary(data = {}, startUrlOverride = '') {
  // The default parameter only covers `undefined`; guard null/non-objects too.
  const source = isObject(data) ? data : {};
  const startUrl = firstText(startUrlOverride, source.startUrl);

  // Only object pages are summarized, so stray null/primitive entries do not
  // turn into empty pages that inflate the page counts.
  const homepageRaw = isObject(source.homepage) ? source.homepage : null;
  const aboutRaw = isObject(source.aboutPage) ? source.aboutPage : null;
  const productRawPages = normalizeList(source.productPages).filter((page) => isObject(page));

  // Fall back to the homepage URL when the start URL is missing or unparseable.
  const domain = extractHostname(startUrl) || extractHostname(normalizeText(homepageRaw?.url));

  const homepage = homepageRaw ? summarizePage(homepageRaw, 'home') : null;
  const aboutPage = aboutRaw ? summarizePage(aboutRaw, 'about') : null;
  const productPages = productRawPages.map((page) => summarizePage(page, 'product'));
  const representativePages = [homepage, aboutPage, ...productPages].filter(Boolean);
  const representativeTextBlocks = buildRepresentativeTextBlocks(homepage, aboutPage, productPages);

  const homepageLinks = dedupeLinks(source.links?.homepage || homepageRaw?.links || []);
  const discoveryLinks = dedupeLinks(source.links?.discovery || []);
  const links = dedupeLinks([...homepageLinks, ...discoveryLinks]);

  const topImages = uniqueStrings([
    ...normalizeList(homepageRaw?.images),
    ...normalizeList(aboutRaw?.images),
    ...productRawPages.flatMap((page) => normalizeList(page.images)),
  ]).slice(0, 60);

  const branding = flattenBranding(homepageRaw);

  return {
    startUrl,
    domain,
    pageCount: representativePages.length,
    siteStats: {
      totalPages: representativePages.length,
      totalLinks: links.length,
      totalImages: topImages.length,
      aboutPages: aboutPage ? 1 : 0,
      productPages: productPages.length,
    },
    homepage,
    aboutPage,
    contactPage: null,
    policyPages: [],
    productPages,
    representativePages,
    representativeTextBlocks,
    keyPages: {
      about: aboutPage ? [aboutPage] : [],
      products: productPages,
    },
    navigation: homepageLinks.slice(0, 30),
    links,
    socialLinks: links.filter((link) => SOCIAL_LINK_PATTERN.test(link.href)),
    topImages,
    screenshots: [],
    branding,
    summaryText: buildSummaryText(startUrl, homepage, aboutPage, productPages),
  };
}

// Identical under CommonJS; the guard only prevents a ReferenceError when the
// file is evaluated where `module` does not exist (e.g. as an ES module).
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { buildCrawlSummary };
}