bolt-templates-sms-extensio.../server/services/crawlSummary.js
2026-04-09 15:30:14 +05:30

308 lines
9.4 KiB
JavaScript

/**
 * Coerce an arbitrary value to a trimmed string.
 *
 * @param {*} value - Any value; only primitive strings are kept.
 * @returns {string} The trimmed string, or '' when `value` is not a string.
 */
function normalizeText(value) {
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
/**
 * Guarantee an array result.
 *
 * @param {*} value - Any value.
 * @returns {Array} `value` itself (same reference) when it is an array,
 *   otherwise a new empty array.
 */
function normalizeList(value) {
  if (Array.isArray(value)) {
    return value;
  }
  return [];
}
/**
 * Tell whether a value is a parseable absolute URL using http or https.
 *
 * @param {*} value - Candidate URL; non-strings and blank strings are rejected.
 * @returns {boolean} True only when the trimmed value parses with `new URL`
 *   and its protocol is `http:` or `https:`.
 */
function isAbsoluteHttpUrl(value) {
  const candidate = normalizeText(value);
  if (candidate === '') {
    return false;
  }

  let protocol;
  try {
    ({ protocol } = new URL(candidate));
  } catch {
    // `new URL` throws on relative or malformed input; treat that as "not a URL".
    return false;
  }
  return ['http:', 'https:'].includes(protocol);
}
/**
 * Recursively gather absolute http(s) image URLs from a loosely shaped value.
 *
 * Accepted shapes:
 *  - a string: kept (trimmed) when it is an absolute http(s) URL;
 *  - an array: every entry is visited recursively;
 *  - an object: the first of `url`, `href`, `src`, `secure_url` that is a
 *    valid absolute http(s) URL is kept.
 * Anything else is ignored.
 *
 * @param {*} value - Value to inspect.
 * @param {string[]} [bucket=[]] - Accumulator; mutated in place and returned.
 * @returns {string[]} The same `bucket` array with any discovered URLs appended.
 */
function collectImageUrls(value, bucket = []) {
  if (Array.isArray(value)) {
    value.forEach((entry) => collectImageUrls(entry, bucket));
    return bucket;
  }

  if (typeof value === 'string') {
    // Push the trimmed form: it is the string that was actually validated.
    const trimmed = normalizeText(value);
    if (isAbsoluteHttpUrl(trimmed)) bucket.push(trimmed);
    return bucket;
  }

  if (!value || typeof value !== 'object') {
    return bucket;
  }

  // Validate each candidate field individually so that an unusable earlier
  // field (relative path, blank string, non-string) does not hide a usable
  // later one.
  const match = [value.url, value.href, value.src, value.secure_url]
    .map((candidate) => normalizeText(candidate))
    .find((candidate) => isAbsoluteHttpUrl(candidate));
  if (match) bucket.push(match);
  return bucket;
}
/**
 * Produce the distinct, non-empty, trimmed strings of a list in first-seen order.
 *
 * @param {*} values - Expected to be an array; anything else yields [].
 * @returns {string[]} Trimmed strings with blanks, non-strings and repeats removed.
 */
function uniqueStrings(values) {
  const alreadyEmitted = new Set();
  const result = [];

  for (const raw of normalizeList(values)) {
    const text = normalizeText(raw);
    if (text && !alreadyEmitted.has(text)) {
      alreadyEmitted.add(text);
      result.push(text);
    }
  }

  return result;
}
/**
 * Tell whether a value is a CSS hex colour literal.
 *
 * Accepts `#RGB`, `#RGBA`, `#RRGGBB` and `#RRGGBBAA` (case-insensitive),
 * ignoring surrounding whitespace. The 4-digit `#RGBA` form is accepted
 * alongside the 8-digit one so both alpha notations are treated alike.
 *
 * @param {*} value - Candidate colour; non-strings are rejected.
 * @returns {boolean} True when the trimmed value is a 3-, 4-, 6- or 8-digit hex colour.
 */
function isHexColor(value) {
  return typeof value === 'string'
    && /^#(?:[0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})$/i.test(value.trim());
}
/**
 * Build a `{ name, hex }` colour record from a label and a colour value.
 *
 * @param {*} name - Label for the colour; blank or non-string labels become 'color'.
 * @param {*} value - Colour value; must pass `isHexColor` once trimmed.
 * @returns {{name: string, hex: string}|null} The record with `hex` upper-cased,
 *   or null when `value` is not a hex colour.
 */
function toColorEntry(name, value) {
  const candidate = normalizeText(value);
  if (isHexColor(candidate)) {
    const label = normalizeText(name);
    return {
      name: label === '' ? 'color' : label,
      hex: candidate.toUpperCase(),
    };
  }
  return null;
}
/**
 * Extract a lower-cased hostname, without a leading `www.`, from a URL.
 *
 * @param {*} url - Value handed to `new URL`.
 * @returns {string} The hostname, or '' when the value cannot be parsed as a URL.
 */
function extractHostname(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input simply has no hostname.
    return '';
  }
  return hostname.replace(/^www\./i, '').toLowerCase();
}
/**
 * Pick a short text excerpt for a crawled page.
 *
 * Prefers `page.summary` (capped at 800 characters). Otherwise falls back to
 * `page.markdown` with runs of three or more newlines collapsed to two,
 * capped at 1600 characters.
 *
 * @param {object|null|undefined} page - Raw page record.
 * @returns {string} The excerpt, possibly ''.
 */
function excerptText(page) {
  const summary = normalizeText(page?.summary);
  if (summary !== '') {
    return summary.slice(0, 800);
  }

  const markdown = normalizeText(page?.markdown);
  const collapsed = markdown.replace(/\n{3,}/g, '\n\n');
  return collapsed.slice(0, 1600);
}
/**
 * Normalize a link given either as a string or as an object into `{ href, label }`.
 *
 * For objects the target is the first non-blank string among `href`, `url`,
 * `link`, and the label is the first non-blank string among `text`, `title`,
 * `label`, falling back to the href itself.
 *
 * @param {*} link - A URL string or a link-like object.
 * @returns {{href: string, label: string}|null} The normalized link, or null
 *   when no usable href can be found.
 */
function normalizeLinkItem(link) {
  if (typeof link === 'string') {
    const href = normalizeText(link);
    return href ? { href, label: href } : null;
  }
  if (!link || typeof link !== 'object') return null;

  // Normalize each candidate before falling back, so a whitespace-only or
  // non-string field does not mask a valid later field.
  const href = normalizeText(link.href) || normalizeText(link.url) || normalizeText(link.link);
  if (!href) return null;

  const label = normalizeText(link.text)
    || normalizeText(link.title)
    || normalizeText(link.label)
    || href;
  return { href, label };
}
/**
 * Normalize a list of links and keep only the first occurrence of each href.
 *
 * @param {*} links - Expected to be an array of strings or link-like objects.
 * @returns {Array<{href: string, label: string}>} Normalized links in
 *   first-seen order; entries that cannot be normalized are dropped.
 */
function dedupeLinks(links) {
  const knownHrefs = new Set();
  const result = [];

  for (const raw of normalizeList(links)) {
    const link = normalizeLinkItem(raw);
    if (!link || knownHrefs.has(link.href)) {
      continue;
    }
    knownHrefs.add(link.href);
    result.push(link);
  }

  return result;
}
/**
 * Heuristically score how likely an image URL is to be a site logo.
 *
 * The lower-cased URL is tested against several keyword groups. Each group
 * that matches contributes its weight once (positive for logo/icon/header
 * hints, negative for hero/product/social/tracking hints).
 *
 * @param {*} url - Candidate image URL.
 * @returns {number} The summed weight, or -100 when the URL is blank or not a string.
 */
function scoreLogoCandidate(url) {
  const haystack = normalizeText(url).toLowerCase();
  if (haystack === '') {
    return -100;
  }

  const weightedPatterns = [
    [/logo|brandmark|wordmark|logomark/, 40],
    [/favicon|apple-touch-icon|android-chrome|mstile|site-icon|siteicon|icon/, 25],
    [/header|navbar|nav/, 8],
    [/hero|banner|carousel|slider|product|collection|catalog|lookbook/, -35],
    [/social|facebook|instagram|twitter|linkedin|youtube|pinterest|avatar|profile/, -25],
    [/sprite|tracking|pixel|placeholder/, -40],
  ];

  return weightedPatterns.reduce(
    (total, [pattern, weight]) => (pattern.test(haystack) ? total + weight : total),
    0,
  );
}
/**
 * De-duplicate image URLs and order them from most to least logo-like.
 *
 * @param {*} values - Expected to be an array of URL strings.
 * @returns {string[]} Unique URLs sorted by descending `scoreLogoCandidate`
 *   score; ties keep their original relative order.
 */
function rankLogoCandidates(values) {
  const scored = uniqueStrings(values).map((url, index) => ({
    index,
    score: scoreLogoCandidate(url),
    url,
  }));

  scored.sort((a, b) => {
    if (a.score !== b.score) {
      return b.score - a.score;
    }
    // Equal scores: fall back to input position.
    return a.index - b.index;
  });

  return scored.map(({ url }) => url);
}
/**
 * Find logo-like image URLs inside a page's metadata object.
 *
 * Only metadata keys whose lower-cased name contains one of
 * image/logo/icon/favicon/thumbnail/apple are inspected. Their values are
 * expanded with `collectImageUrls`, ranked, and only URLs with a strictly
 * positive logo score are kept.
 *
 * @param {object} [metadata={}] - Page metadata; null is treated as empty.
 * @returns {string[]} Ranked candidate URLs.
 */
function extractMetadataImageCandidates(metadata = {}) {
  const imageKeyPattern = /(image|logo|icon|favicon|thumbnail|apple)/;
  const candidates = [];

  for (const [key, value] of Object.entries(metadata || {})) {
    if (imageKeyPattern.test(normalizeText(key).toLowerCase())) {
      collectImageUrls(value, candidates);
    }
  }

  const ranked = rankLogoCandidates(candidates);
  return ranked.filter((url) => scoreLogoCandidate(url) > 0);
}
/**
 * Reduce a raw crawled page to the compact summary used in the crawl report.
 *
 * @param {object|null|undefined} page - Raw page record; reads `url`,
 *   `metadata`, `summary`, `markdown`, `links` and `images`.
 * @param {string} pageType - Label stored verbatim as `type`.
 * @returns {{url: string, type: string, title: string, description: string,
 *   excerpt: string, linkCount: number, imageCount: number}} Page summary.
 */
function summarizePage(page, pageType) {
  const metadata = page?.metadata && typeof page.metadata === 'object' ? page.metadata : {};

  // Normalize before falling back: a blank or non-string `title` /
  // `description` must not prevent the Open Graph value from being used.
  const title = normalizeText(metadata.title) || normalizeText(metadata.ogTitle);
  const description = normalizeText(metadata.description) || normalizeText(metadata.ogDescription);

  return {
    url: normalizeText(page?.url),
    type: pageType,
    title,
    description,
    excerpt: excerptText(page),
    linkCount: normalizeList(page?.links).length,
    imageCount: uniqueStrings(page?.images).length,
  };
}
/**
 * Project page summaries onto the `{ url, title, pageType, text }` shape.
 *
 * @param {object|null} homepage - Homepage summary, skipped when falsy.
 * @param {object|null} aboutPage - About-page summary, skipped when falsy.
 * @param {Array<object|null>} productPages - Product summaries; falsy entries are skipped.
 * @returns {Array<{url: *, title: *, pageType: *, text: *}>} One block per
 *   present page, in homepage, about, products order.
 */
function buildRepresentativeTextBlocks(homepage, aboutPage, productPages) {
  const blocks = [];

  for (const page of [homepage, aboutPage, ...productPages]) {
    if (!page) {
      continue;
    }
    const { url, title, type, excerpt } = page;
    blocks.push({
      url,
      title,
      pageType: type,
      text: excerpt,
    });
  }

  return blocks;
}
/**
 * Collect `{ name, hex }` colour entries from a branding palette.
 *
 * Supports an array of hex strings, an array of colour objects
 * (`name`/`label`/`role` plus `hex`/`value`/`color`), or a plain
 * `{ name: hex }` object. Entries without a valid hex colour are skipped.
 *
 * @param {*} colorSource - Palette in any of the supported shapes.
 * @returns {Array<{name: string, hex: string}>} Entries in source order (may repeat).
 */
function collectBrandingColors(colorSource) {
  const entries = [];
  const addEntry = (name, value) => {
    const entry = toColorEntry(name, value);
    if (entry) entries.push(entry);
  };

  if (Array.isArray(colorSource)) {
    colorSource.forEach((color, index) => {
      const fallbackName = `color_${index + 1}`;
      if (typeof color === 'string') {
        addEntry(fallbackName, color);
        return;
      }
      if (color && typeof color === 'object') {
        // Pick the first usable label / first valid hex rather than the first
        // truthy field, so a blank or malformed field does not mask a good one.
        const name = normalizeText(color.name)
          || normalizeText(color.label)
          || normalizeText(color.role)
          || fallbackName;
        const hex = [color.hex, color.value, color.color].find((candidate) => isHexColor(candidate));
        addEntry(name, hex);
      }
    });
  } else if (colorSource && typeof colorSource === 'object') {
    Object.entries(colorSource).forEach(([name, value]) => addEntry(name, value));
  }

  return entries;
}

/**
 * Flatten the homepage's branding data into colours, logos and brand names.
 *
 * Logo candidates are ordered: explicit branding logos first, then positively
 * scored images found in the homepage metadata, then positively scored
 * entries of `topImages`. The first candidate becomes `primaryLogoUrl`.
 *
 * @param {object|null|undefined} homepage - Raw homepage record; reads
 *   `branding` and `metadata`.
 * @param {string[]} [topImages=[]] - Image URLs gathered across crawled pages.
 * @returns {{colors: string[], labeledColors: Array<{name: string, hex: string}>,
 *   logos: string[], logoCandidates: string[], primaryLogoUrl: string,
 *   brandNames: string[]}} Flattened branding summary.
 */
function flattenBranding(homepage, topImages = []) {
  const branding = homepage?.branding && typeof homepage.branding === 'object' ? homepage.branding : {};

  const colorEntries = collectBrandingColors(
    branding.colors || branding.colorPalette || branding.palette,
  );

  // Use the first alias that actually holds an array, so a non-array value
  // under `logos` does not hide a usable `logoUrls` / `logo_urls` list.
  const logoSource = [branding.logos, branding.logoUrls, branding.logo_urls].find(Array.isArray) || [];
  const logos = [];
  logoSource.forEach((logo) => {
    if (typeof logo === 'string') {
      logos.push(logo);
    } else if (logo && typeof logo === 'object') {
      // `href` / `secure_url` are honoured too, matching collectImageUrls.
      logos.push(
        normalizeText(logo.url)
          || normalizeText(logo.src)
          || normalizeText(logo.href)
          || normalizeText(logo.secure_url),
      );
    }
  });

  const brandName = normalizeText(branding.brandName)
    || normalizeText(branding.brand_name)
    || normalizeText(branding.name);

  const metadataImageCandidates = extractMetadataImageCandidates(homepage?.metadata || {});
  const topLogoCandidates = rankLogoCandidates(topImages).filter((url) => scoreLogoCandidate(url) > 0);
  const logoCandidates = uniqueStrings([
    ...logos,
    ...metadataImageCandidates,
    ...topLogoCandidates,
  ]);

  // Deduplicate (name, hex) pairs in one pass. `hex` is a validated hex
  // literal and cannot contain '|', so the composite key is unambiguous.
  const seenColorKeys = new Set();
  const labeledColors = colorEntries.filter((entry) => {
    const key = `${entry.hex}|${entry.name}`;
    if (seenColorKeys.has(key)) return false;
    seenColorKeys.add(key);
    return true;
  });

  return {
    colors: uniqueStrings(colorEntries.map((entry) => entry.hex)),
    labeledColors,
    logos: uniqueStrings(logos),
    logoCandidates,
    primaryLogoUrl: logoCandidates[0] || '',
    brandNames: brandName ? [brandName] : [],
  };
}
/**
 * Render the crawl as a plain-text digest.
 *
 * The digest starts with a `Site:` line followed by one paragraph per present
 * page (title or URL, type, and optional description and excerpt). Paragraphs
 * are separated by blank lines and the result is truncated to 24000 characters.
 *
 * @param {string} startUrl - URL shown on the `Site:` line.
 * @param {object|null} homepage - Homepage summary, skipped when falsy.
 * @param {object|null} aboutPage - About-page summary, skipped when falsy.
 * @param {Array<object|null>} productPages - Product summaries; falsy entries are skipped.
 * @returns {string} The digest text.
 */
function buildSummaryText(startUrl, homepage, aboutPage, productPages) {
  const sections = [`Site: ${startUrl}`];
  let pageNumber = 0;

  for (const page of [homepage, aboutPage, ...productPages]) {
    if (!page) {
      continue;
    }
    pageNumber += 1;

    const lines = [
      `Page ${pageNumber}: ${page.title || page.url}`,
      `Type: ${page.type}`,
    ];
    if (page.description) {
      lines.push(`Description: ${page.description}`);
    }
    if (page.excerpt) {
      lines.push(`Excerpt: ${page.excerpt}`);
    }
    sections.push(lines.join('\n'));
  }

  const digest = sections.join('\n\n');
  return digest.slice(0, 24000);
}
// Matches hrefs pointing at well-known social networks. `x.com` is anchored to
// a host boundary so that hosts merely ending in "x.com" (dropbox.com,
// netflix.com, ...) are not misclassified as social links.
const SOCIAL_LINK_PATTERN = /instagram|facebook|(?:^|[\/.])x\.com(?=[\/:?#]|$)|twitter|linkedin|youtube|pinterest/i;

/**
 * Build the structured crawl summary for a site.
 *
 * @param {object} [data={}] - Raw crawl result. Reads `startUrl`, `homepage`,
 *   `aboutPage`, `productPages`, `links.homepage` and `links.discovery`.
 *   A null or non-object value is treated as an empty crawl.
 * @param {string} [startUrlOverride=''] - Takes precedence over `data.startUrl`
 *   when it is a non-blank string.
 * @returns {object} Summary with page summaries, link and image lists, site
 *   stats, flattened branding and a plain-text digest. `contactPage`,
 *   `policyPages` and `screenshots` are always empty placeholders.
 */
function buildCrawlSummary(data = {}, startUrlOverride = '') {
  // The default parameter only covers `undefined`; guard against null and
  // other non-object inputs as well.
  const source = data && typeof data === 'object' ? data : {};

  // Normalize before falling back so a blank override does not hide data.startUrl.
  const startUrl = normalizeText(startUrlOverride) || normalizeText(source.startUrl);
  const homepageRaw = source.homepage || null;
  const aboutRaw = source.aboutPage || null;
  // Drop null / non-object entries so they are not reported as empty pages.
  const productRawPages = normalizeList(source.productPages)
    .filter((page) => page && typeof page === 'object');
  const domain = extractHostname(startUrl || homepageRaw?.url || '');

  const homepage = homepageRaw ? summarizePage(homepageRaw, 'home') : null;
  const aboutPage = aboutRaw ? summarizePage(aboutRaw, 'about') : null;
  const productPages = productRawPages.map((page) => summarizePage(page, 'product'));
  const representativePages = [homepage, aboutPage, ...productPages].filter(Boolean);
  const representativeTextBlocks = buildRepresentativeTextBlocks(homepage, aboutPage, productPages);

  const homepageLinks = dedupeLinks(source.links?.homepage || homepageRaw?.links || []);
  const discoveryLinks = dedupeLinks(source.links?.discovery || []);
  const links = dedupeLinks([...homepageLinks, ...discoveryLinks]);

  // Cap the combined image list at 60 unique URLs.
  const topImages = uniqueStrings([
    ...normalizeList(homepageRaw?.images),
    ...normalizeList(aboutRaw?.images),
    ...productRawPages.flatMap((page) => normalizeList(page?.images)),
  ]).slice(0, 60);
  const branding = flattenBranding(homepageRaw, topImages);

  return {
    startUrl,
    domain,
    pageCount: representativePages.length,
    siteStats: {
      totalPages: representativePages.length,
      totalLinks: links.length,
      totalImages: topImages.length,
      aboutPages: aboutPage ? 1 : 0,
      productPages: productPages.length,
    },
    homepage,
    aboutPage,
    contactPage: null,
    policyPages: [],
    productPages,
    representativePages,
    representativeTextBlocks,
    keyPages: {
      about: aboutPage ? [aboutPage] : [],
      products: productPages,
    },
    navigation: homepageLinks.slice(0, 30),
    links,
    socialLinks: links.filter((link) => SOCIAL_LINK_PATTERN.test(link.href)),
    topImages,
    screenshots: [],
    branding,
    summaryText: buildSummaryText(startUrl, homepage, aboutPage, productPages),
  };
}
// Only buildCrawlSummary is exported; the helper functions defined above are
// not part of this module's exported surface.
module.exports = { buildCrawlSummary };