274 lines
8.1 KiB
JavaScript
274 lines
8.1 KiB
JavaScript
const axios = require('axios');
|
|
|
|
/** Base URL that every Firecrawl request path (e.g. `/scrape`) is resolved against. */
const FIRECRAWL_BASE_URL = 'https://api.firecrawl.dev/v2';

/** Default cap on how many product pages are selected and scraped. */
const DEFAULT_PRODUCT_PAGE_LIMIT = 5;

// The format lists below are frozen: the same array instances are handed to
// every scrape request, so an accidental push/splice anywhere would silently
// change all later requests.

/** Formats requested when scraping the homepage. */
const HOMEPAGE_FORMATS = Object.freeze(['markdown', 'links', 'images', 'branding']);

/** Formats requested when scraping a listing page that is mined for product links. */
const DISCOVERY_PAGE_FORMATS = Object.freeze(['markdown', 'links', 'images']);

/** Formats requested when scraping content pages (about and product pages). */
const CONTENT_PAGE_FORMATS = Object.freeze(['markdown', 'images']);
|
|
|
|
/**
 * Reads the Firecrawl API key from the `FIRECRAWL_API_KEY` environment variable.
 *
 * The value is trimmed so that a stray space or trailing newline (common when
 * the key comes from an env file or a pasted secret) does not end up inside
 * the Authorization header, and so that a whitespace-only value is reported
 * as missing instead of being sent as a key.
 *
 * @returns {string} The trimmed, non-empty API key.
 * @throws {Error} When the variable is unset, empty, or whitespace-only.
 */
function getApiKey() {
  const apiKey = (process.env.FIRECRAWL_API_KEY ?? '').trim();
  if (!apiKey) throw new Error('FIRECRAWL_API_KEY not set');
  return apiKey;
}
|
|
|
|
/**
 * Creates an axios instance configured for the Firecrawl API.
 *
 * The instance sends the bearer token from `getApiKey()`, uses JSON content
 * type, times out after 60 seconds, and accepts every HTTP status so that
 * callers inspect `response.status` themselves instead of catching axios
 * status errors.
 *
 * @returns {object} The value returned by `axios.create`.
 * @throws {Error} When `getApiKey()` throws.
 */
function getClient() {
  const headers = {
    Authorization: `Bearer ${getApiKey()}`,
    'Content-Type': 'application/json',
  };

  // Never let axios reject on status; status handling lives in the callers.
  const acceptAnyStatus = () => true;

  return axios.create({
    baseURL: FIRECRAWL_BASE_URL,
    timeout: 60000,
    headers,
    validateStatus: acceptAnyStatus,
  });
}
|
|
|
|
/**
 * Unwraps a `{ data: { ... } }` response envelope.
 *
 * @param {*} data - Parsed response body.
 * @returns {object} `data.data` when both `data` and `data.data` are truthy
 *   objects; otherwise `data` itself when it is a truthy object; otherwise a
 *   new empty object.
 */
function normalizeDataEnvelope(data) {
  const isObjectLike = (candidate) => Boolean(candidate) && typeof candidate === 'object';

  if (!isObjectLike(data)) return {};
  return isObjectLike(data.data) ? data.data : data;
}
|
|
|
|
/**
 * Coerces a value to trimmed text.
 *
 * @param {*} value - Any value.
 * @returns {string} The trimmed string, or `''` for every non-string input.
 */
function normalizeText(value) {
  if (typeof value !== 'string') return '';
  return value.trim();
}
|
|
|
|
/**
 * Canonicalises a URL so equivalent links compare equal.
 *
 * The input is trimmed, parsed (relative to `baseUrl` when one is given),
 * stripped of its fragment and query string, and has a single trailing slash
 * removed.
 *
 * @param {*} rawUrl - Candidate URL; non-strings are treated as empty.
 * @param {string} [baseUrl=''] - Base used to resolve relative URLs; ignored when falsy.
 * @returns {string} The canonical URL, or `''` when the input is empty or cannot be parsed.
 */
function normalizeUrl(rawUrl, baseUrl = '') {
  const candidate = normalizeText(rawUrl);
  if (candidate === '') return '';

  let parsed;
  try {
    // An undefined base makes the URL constructor parse `candidate` on its own.
    parsed = new URL(candidate, baseUrl || undefined);
  } catch {
    return '';
  }

  parsed.hash = '';
  parsed.search = '';
  return parsed.toString().replace(/\/$/, '');
}
|
|
|
|
/**
 * Normalises a scraped page payload into a fixed shape with safe defaults.
 *
 * A `null` or non-object `page` is treated as an empty payload. The parameter
 * default only covers `undefined`, so without this guard a `null` page would
 * throw on the first property read.
 *
 * @param {object} [page={}] - Raw page payload.
 * @param {string} [pageType=''] - Page type label; takes precedence over `page.pageType`.
 * @returns {{url: string, pageType: string, markdown: string, summary: string,
 *   metadata: object, links: Array, images: Array, branding: object}}
 *   Missing or wrongly typed fields are replaced by `''`, `{}` or `[]`.
 *   `metadata`, `links`, `images` and `branding` are returned by reference
 *   when present, not copied.
 */
function normalizePageResult(page = {}, pageType = '') {
  const source = page && typeof page === 'object' ? page : {};
  const objectOrEmpty = (value) => (value && typeof value === 'object' ? value : {});
  const stringOrEmpty = (value) => (typeof value === 'string' ? value : '');
  const arrayOrEmpty = (value) => (Array.isArray(value) ? value : []);

  return {
    url: normalizeUrl(source.url),
    pageType: normalizeText(pageType || source.pageType),
    // markdown/summary are kept verbatim (not trimmed) when they are strings.
    markdown: stringOrEmpty(source.markdown),
    summary: stringOrEmpty(source.summary),
    metadata: objectOrEmpty(source.metadata),
    links: arrayOrEmpty(source.links),
    images: arrayOrEmpty(source.images),
    branding: objectOrEmpty(source.branding),
  };
}
|
|
|
|
/**
 * Converts a raw link (a URL string or a link-like object) into `{ href, label }`.
 *
 * For objects the target is read from `href`, `url` or `link` (first truthy
 * one) and the label from `text`, `title` or `label`. When no label text is
 * available the normalised href doubles as the label.
 *
 * @param {string|object} link - Raw link entry.
 * @param {string} [baseUrl=''] - Base used to resolve relative targets.
 * @returns {{href: string, label: string}|null} The normalised item, or `null`
 *   when the entry is not a string/object or its target does not normalise to a URL.
 */
function normalizeLinkItem(link, baseUrl = '') {
  const isString = typeof link === 'string';
  if (!isString && (!link || typeof link !== 'object')) return null;

  const rawTarget = isString ? link : link.href || link.url || link.link;
  const href = normalizeUrl(rawTarget, baseUrl);
  if (!href) return null;

  const label = isString
    ? href
    : normalizeText(link.text || link.title || link.label) || href;

  return { href, label };
}
|
|
|
|
/**
 * Normalises a list of raw links and removes duplicates by href.
 *
 * Entries that `normalizeLinkItem` rejects are dropped. The first occurrence
 * of each href wins, and the original order is preserved.
 *
 * @param {Array} links - Raw link entries; any non-array is treated as empty.
 * @param {string} [baseUrl=''] - Base used to resolve relative targets.
 * @returns {Array<{href: string, label: string}>} Unique normalised links.
 */
function dedupeLinkItems(links, baseUrl = '') {
  const uniqueByHref = new Map();

  for (const rawLink of Array.isArray(links) ? links : []) {
    const item = normalizeLinkItem(rawLink, baseUrl);
    if (item && !uniqueByHref.has(item.href)) {
      uniqueByHref.set(item.href, item);
    }
  }

  // Map iteration follows insertion order, i.e. first-seen order.
  return [...uniqueByHref.values()];
}
|
|
|
|
/**
 * Extracts the hostname from an absolute URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The hostname as reported by the URL parser, or `''` when
 *   the value cannot be parsed as an absolute URL.
 */
function getHostname(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return '';
  }
  return parsed.hostname;
}
|
|
|
|
/**
 * Extracts the lower-cased path from an absolute URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The lower-cased pathname (without query or fragment), or
 *   `''` when the value cannot be parsed as an absolute URL.
 */
function getPathname(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return '';
  }
  return parsed.pathname.toLowerCase();
}
|
|
|
|
/**
 * Tells whether a URL belongs to the same site as the start URL.
 *
 * Hostnames are compared after dropping a leading `www.`, so that
 * `example.com` and `www.example.com` count as the same site. Other
 * subdomains (e.g. `shop.example.com`) are still treated as different hosts.
 *
 * A URL without a hostname (unparseable, `mailto:`, ...) never matches; a
 * plain hostname comparison would treat two hostless values as equal because
 * both yield `''`.
 *
 * @param {string} url - Candidate absolute URL.
 * @param {string} startUrl - Absolute URL of the site being crawled.
 * @returns {boolean} `true` when both URLs have the same non-empty host.
 */
function isSameDomain(url, startUrl) {
  const toComparableHost = (value) => getHostname(value).replace(/^www\./, '');

  const host = toComparableHost(url);
  return host !== '' && host === toComparableHost(startUrl);
}
|
|
|
|
/**
 * Tells whether a link points at a utility page (login, cart, policies,
 * support, blog, store locator, ...) rather than brand or product content.
 *
 * Keywords are matched against the URL *path* and the label, not the whole
 * URL. Including the hostname would flag every link on a site such as
 * `mystore.com` or `helpfulgoods.com`, because the keyword appears in the
 * domain itself.
 *
 * @param {{href?: string, label?: string}} link - Normalised link item.
 * @returns {boolean} `true` when the path or label contains a utility keyword.
 */
function isUtilityLink(link) {
  const href = normalizeText(link?.href);
  // getPathname yields '' for hrefs that are not absolute URLs; fall back to the raw href then.
  const target = getPathname(href) || href;
  const value = `${target} ${normalizeText(link?.label)}`.toLowerCase();

  return /(login|sign[- ]?in|sign[- ]?up|account|cart|checkout|wishlist|track|tracking|privacy|refund|return|shipping|terms|policy|policies|contact|support|help|faq|blog|blogs|journal|careers?|gift card|stores?)/i.test(value);
}
|
|
|
|
/**
 * Tells whether a link points at an "about" / brand-story page.
 *
 * Keywords are matched against the URL *path* and the label, not the whole
 * URL. Including the hostname would flag every link on a site such as
 * `storybrand.com`, because the keyword appears in the domain itself.
 *
 * @param {{href?: string, label?: string}} link - Normalised link item.
 * @returns {boolean} `true` when the path or label contains an about keyword.
 */
function isAboutLink(link) {
  const href = normalizeText(link?.href);
  // getPathname yields '' for hrefs that are not absolute URLs; fall back to the raw href then.
  const target = getPathname(href) || href;
  const value = `${target} ${normalizeText(link?.label)}`.toLowerCase();

  return /(about|about-us|our story|our-story|story|brand story|who we are|who-we-are)/i.test(value);
}
|
|
|
|
/**
 * Tells whether a link points at a listing/catalogue page (shop, products,
 * collections, categories, ...) from which product links can be discovered.
 *
 * Keywords are matched against the URL *path* and the label, not the whole
 * URL. Including the hostname would flag every link on a site such as
 * `shop.example.com`, because the keyword appears in the domain itself.
 *
 * @param {{href?: string, label?: string}} link - Normalised link item.
 * @returns {boolean} `true` when the path or label contains a discovery keyword.
 */
function isDiscoveryLink(link) {
  const href = normalizeText(link?.href);
  // getPathname yields '' for hrefs that are not absolute URLs; fall back to the raw href then.
  const target = getPathname(href) || href;
  const value = `${target} ${normalizeText(link?.label)}`.toLowerCase();

  return /(shop|products?|collections?|catalog|storefront|category|categories|new arrivals|best sellers|featured)/i.test(value);
}
|
|
|
|
/**
 * Scores how likely a link is to point at a single product page.
 *
 * Utility and about links are rejected outright (-100). Discovery (listing)
 * links are rejected too, unless the path has a product marker segment
 * followed by a slug (e.g. `/products/blue-mug`). The discovery keywords
 * include "product", so without that exemption every `/products/<slug>` URL
 * would be rejected before the product-path bonus below could apply.
 *
 * Heuristics (all applied to the lower-cased URL path or the label):
 *   +12 product-like path segment (`product(s)`, `p`, `item(s)`, `buy`)
 *   +5  a run of four or more digits in the path
 *   -8  path goes through `/collections/` or `/category/`
 *   +3  two or more path segments
 *   +3  label length between 8 and 120 characters
 *   +4  label contains a call-to-action ("buy", "shop now", "view product", "details")
 *   +2  at least three hyphens in the path (slug-like)
 *
 * @param {{href: string, label: string}} link - Normalised link item.
 * @returns {number} -100 for rejected links, otherwise the summed score.
 */
function scoreProductCandidate(link) {
  if (!link || isUtilityLink(link) || isAboutLink(link)) return -100;

  const label = normalizeText(link.label);
  const pathname = getPathname(normalizeText(link.href));

  // A marker segment followed by a slug is a product detail page, not a listing.
  const isProductDetailPath = /(^|\/)(product|products|p|item|items|buy)\/[^/]+/i.test(pathname);
  if (!isProductDetailPath && isDiscoveryLink(link)) return -100;

  const segments = pathname.split('/').filter(Boolean);
  let score = 0;

  if (/(^|\/)(product|products|p|item|items|buy)(\/|$)/i.test(pathname)) score += 12;
  if (/\d{4,}/.test(pathname)) score += 5;
  if (pathname.includes('/collections/') || pathname.includes('/category/')) score -= 8;
  if (segments.length >= 2) score += 3;
  if (label.length >= 8 && label.length <= 120) score += 3;
  if (/buy|shop now|view product|details/i.test(label)) score += 4;
  // Count hyphens in the path only; hyphens in the hostname say nothing about the page.
  if (pathname.split('-').length >= 4) score += 2;

  return score;
}
|
|
|
|
/**
 * Normalises and de-duplicates links, keeping only those on the start URL's domain.
 *
 * @param {Array} links - Raw link entries; relative targets are resolved against `startUrl`.
 * @param {string} startUrl - Absolute URL of the site being crawled.
 * @returns {Array<{href: string, label: string}>} Links for which
 *   `isSameDomain(link.href, startUrl)` holds, in their original order.
 */
function getSameDomainLinks(links, startUrl) {
  const sameDomainLinks = [];

  for (const candidate of dedupeLinkItems(links, startUrl)) {
    if (isSameDomain(candidate.href, startUrl)) {
      sameDomainLinks.push(candidate);
    }
  }

  return sameDomainLinks;
}
|
|
|
|
/**
 * Picks the first same-domain link that looks like an "about" page.
 *
 * @param {Array} links - Raw or normalised link entries.
 * @param {string} startUrl - Absolute URL of the site being crawled.
 * @returns {string} The href of the first about link, or `''` when there is none.
 */
function selectAboutUrl(links, startUrl) {
  for (const candidate of getSameDomainLinks(links, startUrl)) {
    if (isAboutLink(candidate)) {
      return candidate.href || '';
    }
  }
  return '';
}
|
|
|
|
/**
 * Picks the first same-domain listing page that is not also a utility page.
 *
 * @param {Array} links - Raw or normalised link entries.
 * @param {string} startUrl - Absolute URL of the site being crawled.
 * @returns {string} The href of the first matching link, or `''` when there is none.
 */
function selectDiscoveryUrl(links, startUrl) {
  const isUsableListing = (candidate) => isDiscoveryLink(candidate) && !isUtilityLink(candidate);
  const firstListing = getSameDomainLinks(links, startUrl).find((candidate) => isUsableListing(candidate));

  return firstListing?.href || '';
}
|
|
|
|
/**
 * Ranks same-domain links by product likelihood and returns the best ones.
 *
 * Only links with a strictly positive `scoreProductCandidate` score are kept.
 * They are ordered by descending score; links with equal scores keep their
 * original relative order.
 *
 * @param {Array} links - Raw or normalised link entries.
 * @param {string} startUrl - Absolute URL of the site being crawled.
 * @param {number} [limit=DEFAULT_PRODUCT_PAGE_LIMIT] - Maximum number of URLs to return.
 * @returns {string[]} Up to `limit` product hrefs, best first.
 */
function selectProductUrls(links, startUrl, limit = DEFAULT_PRODUCT_PAGE_LIMIT) {
  const ranked = [];

  for (const candidate of getSameDomainLinks(links, startUrl)) {
    const score = scoreProductCandidate(candidate);
    if (score > 0) {
      ranked.push({ href: candidate.href, score });
    }
  }

  ranked.sort((a, b) => b.score - a.score);

  return ranked.slice(0, limit).map((entry) => entry.href);
}
|
|
|
|
/**
 * Scrapes a single page through the Firecrawl `/scrape` endpoint.
 *
 * @param {string} url - Page to scrape.
 * @param {object} [options]
 * @param {string[]} [options.formats] - Output formats to request.
 * @param {boolean} [options.onlyMainContent=true] - Forwarded unchanged in the request body.
 * @param {string} [options.pageType=''] - Label stored on the result.
 * @returns {Promise<object>} The page as shaped by `normalizePageResult`. Its
 *   `url` is always filled in when the requested URL is valid (see below).
 * @throws {Error} When the response status is outside the 2xx range; the
 *   message includes the status and the serialised response body. Transport
 *   errors raised by the HTTP client propagate unchanged.
 */
async function scrapePage(url, { formats, onlyMainContent = true, pageType = '' } = {}) {
  const response = await getClient().post('/scrape', { url, formats, onlyMainContent });

  // The client accepts every status, so failures have to be detected here.
  if (response.status < 200 || response.status >= 300) {
    throw new Error(`Firecrawl scrape failed for ${url} with status ${response.status}: ${JSON.stringify(response.data)}`);
  }

  const page = normalizePageResult(normalizeDataEnvelope(response.data), pageType);

  // A payload without a top-level `url` would leave the page unidentifiable.
  // Fall back to the source URL reported in the metadata, then to the URL
  // that was requested.
  // NOTE(review): `metadata.sourceURL` is the field name assumed from the
  // Firecrawl docs - confirm against the v2 response schema.
  if (!page.url) {
    page.url = normalizeUrl(page.metadata?.sourceURL) || normalizeUrl(url);
  }

  return page;
}
|
|
|
|
/**
 * Scrapes the homepage and decides which further pages are worth scraping.
 *
 * Product links found directly on the homepage are preferred. A discovery
 * (listing) URL is only selected when the homepage yields no product links.
 *
 * @param {string} startUrl - Absolute URL of the brand's site.
 * @returns {Promise<{startUrl: string, homepage: object, aboutUrl: string,
 *   discoveryUrl: string, productUrls: string[]}>} The plan. `aboutUrl` and
 *   `discoveryUrl` are `''` when nothing suitable was found.
 * @throws {Error} When `startUrl` is not a valid absolute URL, or when the
 *   homepage scrape fails.
 */
async function buildBrandContextPlan(startUrl) {
  const normalizedStartUrl = normalizeUrl(startUrl);

  // Fail fast: an unparseable start URL normalises to '' and would otherwise
  // be sent to the scrape API, costing a request and producing an opaque error.
  if (!normalizedStartUrl) {
    throw new Error('Brand context plan requires a valid absolute start URL');
  }

  const homepage = await scrapePage(normalizedStartUrl, {
    formats: HOMEPAGE_FORMATS,
    // Navigation and footer links live outside the main content.
    onlyMainContent: false,
    pageType: 'home',
  });

  const homepageLinks = getSameDomainLinks(homepage.links, normalizedStartUrl);
  const aboutUrl = selectAboutUrl(homepageLinks, normalizedStartUrl);
  const directProductUrls = selectProductUrls(homepageLinks, normalizedStartUrl);
  const discoveryUrl = directProductUrls.length === 0
    ? selectDiscoveryUrl(homepageLinks, normalizedStartUrl)
    : '';

  return {
    startUrl: normalizedStartUrl,
    homepage,
    aboutUrl,
    discoveryUrl,
    productUrls: directProductUrls,
  };
}
|
|
|
|
/**
 * Executes a brand-context plan: scrapes the about page and product pages
 * (discovering products through the listing page when the plan has none).
 *
 * Product scrapes are best-effort: a failed product page is dropped. A failed
 * discovery or about scrape rejects the whole call.
 *
 * @param {object} plan - Plan produced by `buildBrandContextPlan`.
 * @returns {Promise<{startUrl: string, homepage: object, aboutPage: (object|null),
 *   discoveryPage: (object|null), productPages: object[], items: object[]}>}
 *   `items` lists every page that was obtained, in the order homepage, about,
 *   discovery, products.
 * @throws {Error} When the plan has no valid `startUrl` or no `homepage`.
 */
async function collectBrandContextPages(plan) {
  const startUrl = normalizeUrl(plan?.startUrl);
  const homepage = plan?.homepage;
  if (!startUrl || !homepage) {
    throw new Error('Brand context plan is missing a homepage scrape');
  }

  const contentOptions = (pageType) => ({
    formats: CONTENT_PAGE_FORMATS,
    onlyMainContent: true,
    pageType,
  });

  let discoveryPage = null;
  let productUrls = Array.isArray(plan.productUrls) ? plan.productUrls : [];
  productUrls = productUrls.slice(0, DEFAULT_PRODUCT_PAGE_LIMIT);

  // Only consult the listing page when the plan carries no product URLs.
  const needsDiscovery = productUrls.length === 0 && Boolean(normalizeText(plan.discoveryUrl));
  if (needsDiscovery) {
    discoveryPage = await scrapePage(plan.discoveryUrl, {
      formats: DISCOVERY_PAGE_FORMATS,
      onlyMainContent: true,
      pageType: 'discovery',
    });
    productUrls = selectProductUrls(discoveryPage.links, startUrl).slice(0, DEFAULT_PRODUCT_PAGE_LIMIT);
  }

  // The about scrape and all product scrapes are started before any is awaited.
  const aboutTask = normalizeText(plan.aboutUrl)
    ? scrapePage(plan.aboutUrl, contentOptions('about'))
    : Promise.resolve(null);
  const productTasks = productUrls.map((productUrl) =>
    scrapePage(productUrl, contentOptions('product')).catch(() => null));

  const [aboutPage, ...productResults] = await Promise.all([aboutTask, ...productTasks]);
  const productPages = productResults.filter(Boolean);

  const items = [homepage, aboutPage, discoveryPage, ...productPages].filter(Boolean);

  return {
    startUrl,
    homepage,
    aboutPage,
    discoveryPage,
    productPages,
    items,
  };
}
|
|
|
|
// Public API of this module; every other function in the file is internal.
module.exports = { scrapePage, buildBrandContextPlan, collectBrandContextPages };
|