/**
 * Crawl summary builder.
 *
 * Turns raw crawl output (homepage, about page, product pages, link lists)
 * into a compact plain-object summary: per-page excerpts, de-duplicated
 * links and images, flattened branding data and a text digest.
 */

/**
 * Keyword rules used to score how logo-like an image URL is.
 * Each entry is `[pattern, scoreDelta]`; every matching rule contributes once.
 */
const LOGO_SCORE_RULES = [
  [/logo|brandmark|wordmark|logomark/, 40],
  [/favicon|apple-touch-icon|android-chrome|mstile|site-icon|siteicon|icon/, 25],
  [/header|navbar|nav/, 8],
  [/hero|banner|carousel|slider|product|collection|catalog|lookbook/, -35],
  [/social|facebook|instagram|twitter|linkedin|youtube|pinterest|avatar|profile/, -25],
  [/sprite|tracking|pixel|placeholder/, -40],
];

/** Metadata keys (lower-cased) that may hold image/logo URLs. */
const IMAGE_METADATA_KEY_PATTERN = /(image|logo|icon|favicon|thumbnail|apple)/;

/**
 * Matches hrefs that point at a social network.
 *
 * `x.com` is anchored to a host-label boundary on both sides: a bare
 * `x\.com` alternative also matches unrelated hosts that merely end in "x"
 * (dropbox.com, fedex.com, netflix.com, ...).
 */
const SOCIAL_LINK_PATTERN =
  /instagram|facebook|(?<![a-z0-9-])x\.com(?![a-z0-9.-])|twitter|linkedin|youtube|pinterest/i;

/**
 * @param {*} value
 * @returns {string} The trimmed string, or '' when `value` is not a string.
 */
function normalizeText(value) {
  return typeof value === 'string' ? value.trim() : '';
}

/**
 * @param {*} value
 * @returns {Array} `value` itself when it is an array, otherwise a new empty array.
 */
function normalizeList(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * @param {*} value
 * @returns {boolean} True when `value` parses as a URL with an http: or https: protocol.
 */
function isAbsoluteHttpUrl(value) {
  const normalized = normalizeText(value);
  if (!normalized) return false;
  try {
    const { protocol } = new URL(normalized);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    // Unparseable input is simply "not a URL"; nothing to report.
    return false;
  }
}

/**
 * Collects absolute http(s) image URLs from a string, an object carrying a
 * `url`/`href`/`src`/`secure_url` property, or an arbitrarily nested array of those.
 *
 * @param {*} value
 * @param {string[]} [bucket] Accumulator that is appended to and returned.
 * @returns {string[]} `bucket`
 */
function collectImageUrls(value, bucket = []) {
  if (Array.isArray(value)) {
    value.forEach((entry) => collectImageUrls(entry, bucket));
    return bucket;
  }

  let candidate = '';
  if (typeof value === 'string') {
    candidate = normalizeText(value);
  } else if (value && typeof value === 'object') {
    candidate = normalizeText(value.url || value.href || value.src || value.secure_url);
  }

  // Both branches push the trimmed form so the bucket is uniform.
  if (isAbsoluteHttpUrl(candidate)) bucket.push(candidate);
  return bucket;
}

/**
 * @param {*} values
 * @returns {string[]} Trimmed, non-empty strings in first-seen order without duplicates.
 */
function uniqueStrings(values) {
  const seen = new Set();
  const result = [];
  for (const value of normalizeList(values)) {
    const text = normalizeText(value);
    if (text && !seen.has(text)) {
      seen.add(text);
      result.push(text);
    }
  }
  return result;
}

/**
 * @param {*} value
 * @returns {boolean} True for `#` followed by 3, 6 or 8 hex digits.
 */
function isHexColor(value) {
  return /^#(?:[0-9a-f]{3}|[0-9a-f]{6}|[0-9a-f]{8})$/i.test(normalizeText(value));
}

/**
 * @param {*} name Label for the colour; falls back to 'color' when blank or not a string.
 * @param {*} value Candidate hex colour.
 * @returns {{name: string, hex: string}|null} Entry with an upper-cased hex, or null when invalid.
 */
function toColorEntry(name, value) {
  const hex = normalizeText(value);
  if (!isHexColor(hex)) return null;
  return {
    name: normalizeText(name) || 'color',
    hex: hex.toUpperCase(),
  };
}

/**
 * @param {string} url
 * @returns {string} Lower-cased hostname without a leading `www.`, or '' when unparseable.
 */
function extractHostname(url) {
  try {
    return new URL(url).hostname.replace(/^www\./i, '').toLowerCase();
  } catch {
    return '';
  }
}

/**
 * Picks a short excerpt for a raw page: the summary (max 800 chars) when
 * present, otherwise the markdown with runs of 3+ newlines collapsed to a
 * blank line (max 1600 chars).
 *
 * @param {object|null|undefined} page
 * @returns {string}
 */
function excerptText(page) {
  const summary = normalizeText(page?.summary);
  if (summary) return summary.slice(0, 800);
  return normalizeText(page?.markdown)
    .replace(/\n{3,}/g, '\n\n')
    .slice(0, 1600);
}

/**
 * Normalizes a link given either as a string or as an object carrying
 * `href`/`url`/`link` and optionally `text`/`title`/`label`.
 *
 * @param {*} link
 * @returns {{href: string, label: string}|null} Null when no href can be derived.
 */
function normalizeLinkItem(link) {
  if (typeof link === 'string') {
    const href = normalizeText(link);
    return href ? { href, label: href } : null;
  }
  if (!link || typeof link !== 'object') return null;

  const href = normalizeText(link.href || link.url || link.link);
  if (!href) return null;
  return {
    href,
    label: normalizeText(link.text || link.title || link.label) || href,
  };
}

/**
 * @param {*} links
 * @returns {Array<{href: string, label: string}>} Normalized links, first occurrence per href.
 */
function dedupeLinks(links) {
  const seen = new Set();
  const result = [];
  for (const raw of normalizeList(links)) {
    const link = normalizeLinkItem(raw);
    if (link && !seen.has(link.href)) {
      seen.add(link.href);
      result.push(link);
    }
  }
  return result;
}

/**
 * Heuristic score for how likely an image URL is a site logo.
 *
 * @param {*} url
 * @returns {number} Sum of the matching rule deltas; -100 for blank/non-string input.
 */
function scoreLogoCandidate(url) {
  const normalized = normalizeText(url).toLowerCase();
  if (!normalized) return -100;
  return LOGO_SCORE_RULES.reduce(
    (score, [pattern, delta]) => (pattern.test(normalized) ? score + delta : score),
    0
  );
}

/**
 * De-duplicates URLs and orders them by descending logo score, keeping the
 * original order among equal scores.
 *
 * @param {*} values
 * @param {number} [scoreFloor] Exclusive lower bound; only URLs scoring strictly
 *   above it are kept. Defaults to keeping everything.
 * @returns {string[]}
 */
function rankLogoCandidates(values, scoreFloor = Number.NEGATIVE_INFINITY) {
  return uniqueStrings(values)
    .map((url, index) => ({ index, score: scoreLogoCandidate(url), url }))
    .filter((entry) => entry.score > scoreFloor)
    .sort((left, right) => right.score - left.score || left.index - right.index)
    .map((entry) => entry.url);
}

/**
 * Collects image URLs from metadata entries whose key looks image-related and
 * returns the ones with a positive logo score, best first.
 *
 * @param {object} [metadata]
 * @returns {string[]}
 */
function extractMetadataImageCandidates(metadata = {}) {
  const candidates = [];
  for (const [key, value] of Object.entries(metadata || {})) {
    if (IMAGE_METADATA_KEY_PATTERN.test(normalizeText(key).toLowerCase())) {
      collectImageUrls(value, candidates);
    }
  }
  return rankLogoCandidates(candidates, 0);
}

/**
 * @param {object|null|undefined} page Raw crawled page.
 * @param {string} pageType Label stored as `type` on the summary.
 * @returns {{url: string, type: string, title: string, description: string,
 *   excerpt: string, linkCount: number, imageCount: number}}
 */
function summarizePage(page, pageType) {
  const metadata = page?.metadata && typeof page.metadata === 'object' ? page.metadata : {};
  return {
    url: normalizeText(page?.url),
    type: pageType,
    title: normalizeText(metadata.title || metadata.ogTitle),
    description: normalizeText(metadata.description || metadata.ogDescription),
    excerpt: excerptText(page),
    linkCount: normalizeList(page?.links).length,
    imageCount: uniqueStrings(page?.images).length,
  };
}

/**
 * @param {object|null} homepage Summarized homepage.
 * @param {object|null} aboutPage Summarized about page.
 * @param {object[]} productPages Summarized product pages.
 * @returns {Array<{url: string, title: string, pageType: string, text: string}>}
 */
function buildRepresentativeTextBlocks(homepage, aboutPage, productPages) {
  return [homepage, aboutPage, ...productPages]
    .filter(Boolean)
    .map(({ url, title, type, excerpt }) => ({
      url,
      title,
      pageType: type,
      text: excerpt,
    }));
}

/**
 * Converts a branding colour source into colour entries. Accepts an array of
 * hex strings or colour objects, or a `{ name: hex }` map; anything that is
 * not a valid hex colour is dropped.
 *
 * @param {*} colorSource
 * @returns {Array<{name: string, hex: string}>}
 */
function collectBrandColors(colorSource) {
  let entries = [];
  if (Array.isArray(colorSource)) {
    entries = colorSource.map((color, index) => {
      const fallbackName = `color_${index + 1}`;
      if (typeof color === 'string') return toColorEntry(fallbackName, color);
      if (color && typeof color === 'object') {
        return toColorEntry(
          color.name || color.label || color.role || fallbackName,
          color.hex || color.value || color.color
        );
      }
      return null;
    });
  } else if (colorSource && typeof colorSource === 'object') {
    entries = Object.entries(colorSource).map(([name, value]) => toColorEntry(name, value));
  }
  return entries.filter(Boolean);
}

/**
 * Flattens the homepage's `branding` object plus image hints into a uniform shape.
 *
 * @param {object|null|undefined} homepage Raw homepage (reads `branding` and `metadata`).
 * @param {string[]} [topImages] Image URLs from the crawl, considered as logo candidates.
 * @returns {{colors: string[], labeledColors: Array<{name: string, hex: string}>,
 *   logos: string[], logoCandidates: string[], primaryLogoUrl: string, brandNames: string[]}}
 */
function flattenBranding(homepage, topImages = []) {
  const branding = homepage?.branding && typeof homepage.branding === 'object'
    ? homepage.branding
    : {};

  const colorEntries = collectBrandColors(
    branding.colors || branding.colorPalette || branding.palette
  );

  const logos = [];
  for (const logo of normalizeList(branding.logos || branding.logoUrls || branding.logo_urls)) {
    if (typeof logo === 'string') {
      logos.push(logo);
    } else if (logo && typeof logo === 'object') {
      logos.push(logo.url || logo.src || '');
    }
  }

  const brandNames = [];
  const brandName = normalizeText(branding.brandName || branding.brand_name || branding.name);
  if (brandName) brandNames.push(brandName);

  // Explicit branding logos come first, then metadata images, then crawl images.
  const logoCandidates = uniqueStrings([
    ...logos,
    ...extractMetadataImageCandidates(homepage?.metadata || {}),
    ...rankLogoCandidates(topImages, 0),
  ]);

  // `hex` never contains '|', so `name|hex` identifies a (name, hex) pair uniquely.
  const seenColorKeys = new Set();
  const labeledColors = colorEntries.filter(({ name, hex }) => {
    const key = `${name}|${hex}`;
    if (seenColorKeys.has(key)) return false;
    seenColorKeys.add(key);
    return true;
  });

  return {
    colors: uniqueStrings(colorEntries.map((entry) => entry.hex)),
    labeledColors,
    logos: uniqueStrings(logos),
    logoCandidates,
    primaryLogoUrl: logoCandidates[0] || '',
    brandNames: uniqueStrings(brandNames),
  };
}

/**
 * Builds a plain-text digest of the site, capped at 24000 characters.
 *
 * @param {string} startUrl
 * @param {object|null} homepage Summarized homepage.
 * @param {object|null} aboutPage Summarized about page.
 * @param {object[]} productPages Summarized product pages.
 * @returns {string}
 */
function buildSummaryText(startUrl, homepage, aboutPage, productPages) {
  const blocks = [`Site: ${startUrl}`];
  [homepage, aboutPage, ...productPages].filter(Boolean).forEach((page, index) => {
    const lines = [
      `Page ${index + 1}: ${page.title || page.url}`,
      `Type: ${page.type}`,
      page.description ? `Description: ${page.description}` : '',
      page.excerpt ? `Excerpt: ${page.excerpt}` : '',
    ];
    blocks.push(lines.filter(Boolean).join('\n'));
  });
  return blocks.join('\n\n').slice(0, 24000);
}

/**
 * Builds the crawl summary object from raw crawl data.
 *
 * @param {object} [data] Raw crawl result. Reads `startUrl`, `homepage`,
 *   `aboutPage`, `productPages`, `links.homepage` and `links.discovery`.
 *   `null` or any non-object value is treated as an empty crawl.
 * @param {string} [startUrlOverride] Takes precedence over `data.startUrl` when non-empty.
 * @returns {object} Summary with page summaries, link/image lists, site
 *   stats, flattened branding and a text digest. `contactPage`,
 *   `policyPages` and `screenshots` are always empty placeholders here.
 */
function buildCrawlSummary(data = {}, startUrlOverride = '') {
  // A default parameter only covers `undefined`; guard `null`/primitives too.
  const source = data && typeof data === 'object' ? data : {};

  const startUrl = normalizeText(startUrlOverride || source.startUrl);
  const homepageRaw = source.homepage || null;
  const aboutRaw = source.aboutPage || null;
  const productRawPages = normalizeList(source.productPages);
  const domain = extractHostname(startUrl || homepageRaw?.url || '');

  const homepage = homepageRaw ? summarizePage(homepageRaw, 'home') : null;
  const aboutPage = aboutRaw ? summarizePage(aboutRaw, 'about') : null;
  const productPages = productRawPages.map((page) => summarizePage(page, 'product'));
  const representativePages = [homepage, aboutPage, ...productPages].filter(Boolean);
  const representativeTextBlocks = buildRepresentativeTextBlocks(homepage, aboutPage, productPages);

  const homepageLinks = dedupeLinks(source.links?.homepage || homepageRaw?.links || []);
  const discoveryLinks = dedupeLinks(source.links?.discovery || []);
  const links = dedupeLinks([...homepageLinks, ...discoveryLinks]);

  const topImages = uniqueStrings([
    ...normalizeList(homepageRaw?.images),
    ...normalizeList(aboutRaw?.images),
    ...productRawPages.flatMap((page) => normalizeList(page?.images)),
  ]).slice(0, 60);
  const branding = flattenBranding(homepageRaw, topImages);

  return {
    startUrl,
    domain,
    pageCount: representativePages.length,
    siteStats: {
      totalPages: representativePages.length,
      totalLinks: links.length,
      totalImages: topImages.length,
      aboutPages: aboutPage ? 1 : 0,
      productPages: productPages.length,
    },
    homepage,
    aboutPage,
    contactPage: null,
    policyPages: [],
    productPages,
    representativePages,
    representativeTextBlocks,
    keyPages: {
      about: aboutPage ? [aboutPage] : [],
      products: productPages,
    },
    navigation: homepageLinks.slice(0, 30),
    links,
    socialLinks: links.filter((link) => SOCIAL_LINK_PATTERN.test(link.href)),
    topImages,
    screenshots: [],
    branding,
    summaryText: buildSummaryText(startUrl, homepage, aboutPage, productPages),
  };
}

module.exports = { buildCrawlSummary };