// Listing metadata: 226 lines, 6.8 KiB, JavaScript.
/**
 * Coerce a value to a trimmed string.
 *
 * @param {*} value - Candidate value of any type.
 * @returns {string} The trimmed string, or '' when `value` is not a string.
 */
function normalizeText(value) {
  return typeof value === 'string' ? value.trim() : '';
}

/**
 * Coerce a value to an array.
 *
 * @param {*} value - Candidate value of any type.
 * @returns {Array} `value` itself when it is an array (not a copy),
 *   otherwise a new empty array.
 */
function normalizeList(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * Build a de-duplicated list of trimmed, non-empty strings.
 *
 * Non-array input yields an empty list; non-string and blank entries are
 * dropped. Comparison happens after trimming and is case-sensitive.
 *
 * @param {*} values - Expected to be an array of strings.
 * @returns {string[]} Unique trimmed strings in first-seen order.
 */
function uniqueStrings(values) {
  const cleaned = normalizeList(values)
    .map((value) => normalizeText(value))
    .filter(Boolean);

  // A Set iterates in insertion order, so the first occurrence of each value wins.
  return [...new Set(cleaned)];
}

/**
 * Test whether a value is a CSS hex colour literal.
 *
 * Accepts `#RGB`, `#RGBA`, `#RRGGBB` and `#RRGGBBAA`, case-insensitively,
 * ignoring surrounding whitespace. The 4-digit form is accepted alongside the
 * 8-digit one so that both alpha notations are treated consistently.
 *
 * @param {*} value - Candidate value; non-strings are never colours.
 * @returns {boolean} True when the trimmed value is a hex colour.
 */
function isHexColor(value) {
  return /^#(?:[0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})$/i.test(normalizeText(value));
}

/**
 * Build a labelled colour record from a name and a hex value.
 *
 * @param {*} name - Label for the colour; falls back to 'color' when it is
 *   not a non-blank string.
 * @param {*} value - Candidate hex colour string.
 * @returns {{name: string, hex: string}|null} The entry with an upper-cased
 *   hex value, or null when `value` is not a hex colour.
 */
function toColorEntry(name, value) {
  const hex = normalizeText(value);
  if (!isHexColor(hex)) return null;

  return {
    name: normalizeText(name) || 'color',
    hex: hex.toUpperCase(),
  };
}

/**
 * Extract a lower-cased hostname from a URL, without a leading `www.`.
 *
 * @param {*} url - URL to parse; it is passed directly to `new URL`.
 * @returns {string} The hostname, or '' when the input cannot be parsed.
 */
function extractHostname(url) {
  try {
    const { hostname } = new URL(url);
    return hostname.replace(/^www\./i, '').toLowerCase();
  } catch {
    // Deliberate best effort: an unparseable URL simply yields no hostname.
    return '';
  }
}

/**
 * Produce a short text excerpt for a page.
 *
 * Prefers `page.summary`, capped at 800 characters. Otherwise falls back to
 * `page.markdown`, with runs of three or more newlines collapsed to a single
 * blank line, capped at 1600 characters.
 *
 * @param {*} page - Page record; may be null or undefined.
 * @returns {string} The excerpt, or '' when neither field is a non-blank string.
 */
function excerptText(page) {
  const summary = normalizeText(page?.summary);
  if (summary) return summary.slice(0, 800);

  return normalizeText(page?.markdown)
    .replace(/\n{3,}/g, '\n\n')
    .slice(0, 1600);
}

/**
 * Normalize one link, given as a URL string or a link-like object, into
 * `{ href, label }`.
 *
 * For objects the href is taken from `href`, `url` or `link`, and the label
 * from `text`, `title` or `label`. Each candidate is normalized before it is
 * considered, so a blank or non-string earlier field no longer hides a usable
 * later one.
 *
 * @param {*} link - String or object describing a link.
 * @returns {{href: string, label: string}|null} The normalized link, or null
 *   when no href can be determined. The label falls back to the href.
 */
function normalizeLinkItem(link) {
  if (typeof link === 'string') {
    const href = normalizeText(link);
    return href ? { href, label: href } : null;
  }

  if (!link || typeof link !== 'object') return null;

  // First candidate that is a non-blank string, trimmed; '' when there is none.
  const firstText = (...candidates) => {
    for (const candidate of candidates) {
      const text = normalizeText(candidate);
      if (text) return text;
    }
    return '';
  };

  const href = firstText(link.href, link.url, link.link);
  if (!href) return null;

  return {
    href,
    label: firstText(link.text, link.title, link.label) || href,
  };
}

/**
 * Normalize a list of links and drop invalid entries and repeated hrefs.
 *
 * @param {*} links - Expected to be an array of URL strings or link objects;
 *   anything else is treated as an empty list.
 * @returns {Array<{href: string, label: string}>} Normalized links in
 *   first-seen order, one per distinct href (exact string match).
 */
function dedupeLinks(links) {
  const seenHrefs = new Set();
  const unique = [];

  for (const raw of normalizeList(links)) {
    const link = normalizeLinkItem(raw);
    if (!link || seenHrefs.has(link.href)) continue;
    seenHrefs.add(link.href);
    unique.push(link);
  }

  return unique;
}

/**
 * Reduce a raw crawled page to a compact summary record.
 *
 * Title and description come from `page.metadata`, falling back to the
 * `ogTitle` / `ogDescription` fields. Each candidate is normalized before the
 * fallback is applied, so a blank or non-string primary field does not hide a
 * usable fallback.
 *
 * @param {*} page - Raw page record; may be null or undefined.
 * @param {string} pageType - Label stored as `type` on the result, unmodified.
 * @returns {{url: string, type: string, title: string, description: string,
 *   excerpt: string, linkCount: number, imageCount: number}} Page summary.
 *   `linkCount` counts raw entries of `page.links`; `imageCount` counts the
 *   unique non-blank strings in `page.images`.
 */
function summarizePage(page, pageType) {
  const metadata = page?.metadata && typeof page.metadata === 'object' ? page.metadata : {};

  return {
    url: normalizeText(page?.url),
    type: pageType,
    title: normalizeText(metadata.title) || normalizeText(metadata.ogTitle),
    description: normalizeText(metadata.description) || normalizeText(metadata.ogDescription),
    excerpt: excerptText(page),
    linkCount: normalizeList(page?.links).length,
    imageCount: uniqueStrings(page?.images).length,
  };
}

/**
 * Project summarized pages into `{ url, title, pageType, text }` blocks.
 *
 * Pages are emitted in the order homepage, about page, product pages; falsy
 * entries are skipped.
 *
 * @param {object|null} homepage - Summarized homepage, or a falsy value.
 * @param {object|null} aboutPage - Summarized about page, or a falsy value.
 * @param {Array<object|null>} productPages - Summarized product pages; a
 *   non-array value is treated as an empty list.
 * @returns {Array<{url: *, title: *, pageType: *, text: *}>} One block per
 *   page, copying the page's `url`, `title`, `type` and `excerpt`.
 */
function buildRepresentativeTextBlocks(homepage, aboutPage, productPages) {
  // Guard the spread: a missing product list would otherwise throw.
  const products = Array.isArray(productPages) ? productPages : [];

  return [homepage, aboutPage, ...products]
    .filter(Boolean)
    .map(({ url, title, type, excerpt }) => ({
      url,
      title,
      pageType: type,
      text: excerpt,
    }));
}

/**
 * Collect labelled colour entries from a branding colour source.
 *
 * Private helper of flattenBranding. Supports an array of hex strings, an
 * array of objects (name from `name`/`label`/`role`, value from
 * `hex`/`value`/`color`), or a plain `{ name: hex }` object. Values that are
 * not hex colours are skipped.
 *
 * @param {*} colorSource - Array, object, or anything else (ignored).
 * @returns {Array<{name: string, hex: string}>} Entries in source order,
 *   duplicates included.
 */
function collectBrandColorEntries(colorSource) {
  const entries = [];
  const addEntry = (name, value) => {
    const entry = toColorEntry(name, value);
    if (entry) entries.push(entry);
  };

  if (Array.isArray(colorSource)) {
    colorSource.forEach((color, index) => {
      // Positional name used when the item carries no label of its own.
      const fallbackName = `color_${index + 1}`;

      if (typeof color === 'string') {
        addEntry(fallbackName, color);
      } else if (color && typeof color === 'object') {
        addEntry(
          color.name || color.label || color.role || fallbackName,
          color.hex || color.value || color.color
        );
      }
    });
  } else if (colorSource && typeof colorSource === 'object') {
    Object.entries(colorSource).forEach(([name, value]) => addEntry(name, value));
  }

  return entries;
}

/**
 * Flatten the `branding` section of a raw homepage record.
 *
 * Reads colours from `colors`, `colorPalette` or `palette`; logos from
 * `logos`, `logoUrls` or `logo_urls` (strings, or objects with `url`/`src`);
 * and the brand name from `brandName`, `brand_name` or `name`.
 *
 * @param {*} homepage - Raw homepage record; may be null or undefined.
 * @returns {{colors: string[], labeledColors: Array<{name: string, hex: string}>,
 *   logos: string[], brandNames: string[]}} Unique upper-cased hex values,
 *   colour entries unique by name and hex together, unique logo URLs, and the
 *   brand name as a zero- or one-element list.
 */
function flattenBranding(homepage) {
  const branding = homepage?.branding && typeof homepage.branding === 'object' ? homepage.branding : {};

  const colorEntries = collectBrandColorEntries(
    branding.colors || branding.colorPalette || branding.palette
  );

  // Anything that is not a string or an object maps to '', which uniqueStrings drops.
  const logos = normalizeList(branding.logos || branding.logoUrls || branding.logo_urls).map((logo) => {
    if (typeof logo === 'string') return logo;
    if (logo && typeof logo === 'object') return logo.url || logo.src || '';
    return '';
  });

  const brandName = normalizeText(branding.brandName || branding.brand_name || branding.name);

  // Keep the first entry per (hex, name) pair. The hex part is a validated
  // colour and never contains a newline, so the key is unambiguous.
  const seenPairs = new Set();
  const labeledColors = colorEntries.filter((entry) => {
    const key = `${entry.hex}\n${entry.name}`;
    if (seenPairs.has(key)) return false;
    seenPairs.add(key);
    return true;
  });

  return {
    colors: uniqueStrings(colorEntries.map((entry) => entry.hex)),
    labeledColors,
    logos: uniqueStrings(logos),
    brandNames: brandName ? [brandName] : [],
  };
}

/**
 * Render a plain-text digest of the crawled pages.
 *
 * The first block is `Site: <startUrl>`; each page then gets a block with its
 * position, title (or URL when the title is empty), type, and optional
 * description and excerpt lines. Blocks are separated by a blank line and the
 * result is capped at 24000 characters.
 *
 * @param {string} startUrl - URL shown on the first line.
 * @param {object|null} homepage - Summarized homepage, or a falsy value.
 * @param {object|null} aboutPage - Summarized about page, or a falsy value.
 * @param {Array<object|null>} productPages - Summarized product pages; a
 *   non-array value is treated as an empty list.
 * @returns {string} The digest text.
 */
function buildSummaryText(startUrl, homepage, aboutPage, productPages) {
  // Guard the spread: a missing product list would otherwise throw.
  const products = Array.isArray(productPages) ? productPages : [];
  const pages = [homepage, aboutPage, ...products].filter(Boolean);

  const pageBlocks = pages.map((page, index) => {
    const lines = [
      `Page ${index + 1}: ${page.title || page.url}`,
      `Type: ${page.type}`,
      page.description ? `Description: ${page.description}` : '',
      page.excerpt ? `Excerpt: ${page.excerpt}` : '',
    ];
    return lines.filter(Boolean).join('\n');
  });

  return [`Site: ${startUrl}`, ...pageBlocks].join('\n\n').slice(0, 24000);
}

/**
 * Assemble the crawl summary for a site from raw crawl output.
 *
 * Reads `startUrl`, `homepage`, `aboutPage`, `productPages` and `links`
 * (`links.homepage`, `links.discovery`) from `data`. `contactPage`,
 * `policyPages` and `screenshots` are always returned empty.
 *
 * @param {object} [data={}] - Raw crawl output; null or a non-object value is
 *   treated as an empty object.
 * @param {string} [startUrlOverride=''] - When truthy, used instead of
 *   `data.startUrl`.
 * @returns {object} Summary with site stats, summarized pages, de-duplicated
 *   links, social links, up to 60 unique image URLs, flattened branding and a
 *   plain-text digest.
 */
function buildCrawlSummary(data = {}, startUrlOverride = '') {
  // The default parameter only covers `undefined`; null must not crash either.
  const source = data && typeof data === 'object' ? data : {};

  const startUrl = normalizeText(startUrlOverride || source.startUrl);
  const homepageRaw = source.homepage || null;
  const aboutRaw = source.aboutPage || null;
  const productRawPages = normalizeList(source.productPages);
  const domain = extractHostname(startUrl || homepageRaw?.url || '');

  const homepage = homepageRaw ? summarizePage(homepageRaw, 'home') : null;
  const aboutPage = aboutRaw ? summarizePage(aboutRaw, 'about') : null;
  const productPages = productRawPages.map((page) => summarizePage(page, 'product'));
  const representativePages = [homepage, aboutPage, ...productPages].filter(Boolean);
  const representativeTextBlocks = buildRepresentativeTextBlocks(homepage, aboutPage, productPages);

  const homepageLinks = dedupeLinks(source.links?.homepage || homepageRaw?.links || []);
  const discoveryLinks = dedupeLinks(source.links?.discovery || []);
  const links = dedupeLinks([...homepageLinks, ...discoveryLinks]);

  const topImages = uniqueStrings([
    ...normalizeList(homepageRaw?.images),
    ...normalizeList(aboutRaw?.images),
    ...productRawPages.flatMap((page) => normalizeList(page?.images)),
  ]).slice(0, 60);

  const branding = flattenBranding(homepageRaw);

  // `x.com` must be a whole host label: a bare /x\.com/ also matches
  // netflix.com, dropbox.com and x.community. The other names stay substring
  // matches, as before.
  const socialPattern = /instagram|facebook|(?:^|[\/.])x\.com(?=[\/:?#]|$)|twitter|linkedin|youtube|pinterest/i;

  return {
    startUrl,
    domain,
    pageCount: representativePages.length,
    siteStats: {
      totalPages: representativePages.length,
      totalLinks: links.length,
      totalImages: topImages.length,
      aboutPages: aboutPage ? 1 : 0,
      productPages: productPages.length,
    },
    homepage,
    aboutPage,
    contactPage: null,
    policyPages: [],
    productPages,
    representativePages,
    representativeTextBlocks,
    keyPages: {
      about: aboutPage ? [aboutPage] : [],
      products: productPages,
    },
    navigation: homepageLinks.slice(0, 30),
    links,
    socialLinks: links.filter((link) => socialPattern.test(link.href)),
    topImages,
    screenshots: [],
    branding,
    summaryText: buildSummaryText(startUrl, homepage, aboutPage, productPages),
  };
}

// CommonJS export: buildCrawlSummary is the only name placed on module.exports here.
module.exports = { buildCrawlSummary };