sms-extension/server/services/firecrawl.js

274 lines
8.1 KiB
JavaScript

const axios = require('axios');
// Base URL that every Firecrawl request in this module is issued against.
const FIRECRAWL_BASE_URL = 'https://api.firecrawl.dev/v2';
// Default cap on how many product URLs are selected and scraped.
const DEFAULT_PRODUCT_PAGE_LIMIT = 5;
// Scrape formats requested for the homepage (the only page that also asks for `branding`).
const HOMEPAGE_FORMATS = ['markdown', 'links', 'images', 'branding'];
// Scrape formats requested for the discovery page; its `links` are mined for product URLs.
const DISCOVERY_PAGE_FORMATS = ['markdown', 'links', 'images'];
// Scrape formats requested for the about page and for each product page.
const CONTENT_PAGE_FORMATS = ['markdown', 'images'];
/**
 * Reads the Firecrawl API key from the process environment.
 *
 * @returns {string} The value of `FIRECRAWL_API_KEY`.
 * @throws {Error} When `FIRECRAWL_API_KEY` is unset or empty.
 */
function getApiKey() {
  const { FIRECRAWL_API_KEY: key } = process.env;
  if (key) {
    return key;
  }
  throw new Error('FIRECRAWL_API_KEY not set');
}
/**
 * Creates an axios instance configured for the Firecrawl API.
 *
 * The instance resolves for every HTTP status (`validateStatus` always
 * returns true), so callers must inspect `response.status` themselves.
 *
 * @returns {object} A new axios instance.
 * @throws {Error} When the API key is not configured (see `getApiKey`).
 */
function getClient() {
  const authorization = `Bearer ${getApiKey()}`;
  const acceptAnyStatus = () => true;
  const config = {
    baseURL: FIRECRAWL_BASE_URL,
    timeout: 60000,
    headers: {
      Authorization: authorization,
      'Content-Type': 'application/json',
    },
    validateStatus: acceptAnyStatus,
  };
  return axios.create(config);
}
/**
 * Unwraps a `{ data: {...} }` response envelope.
 *
 * @param {*} data - Parsed response body.
 * @returns {object} `data.data` when it is an object, otherwise `data` itself
 *   when it is an object, otherwise a fresh empty object.
 */
function normalizeDataEnvelope(data) {
  const isObjectLike = (value) => Boolean(value) && typeof value === 'object';
  if (!isObjectLike(data)) {
    return {};
  }
  return isObjectLike(data.data) ? data.data : data;
}
/**
 * Coerces a value to trimmed text.
 *
 * @param {*} value - Any value.
 * @returns {string} The trimmed string, or `''` for every non-string input.
 */
function normalizeText(value) {
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
/**
 * Canonicalizes a URL for comparison and de-duplication.
 *
 * The fragment and the query string are both removed, and one trailing slash
 * is stripped from the serialized result.
 *
 * @param {*} rawUrl - Candidate URL; non-strings and blank strings yield `''`.
 * @param {string} [baseUrl=''] - Base used to resolve relative URLs when non-empty.
 * @returns {string} The canonical URL, or `''` when it cannot be parsed.
 */
function normalizeUrl(rawUrl, baseUrl = '') {
  const candidate = normalizeText(rawUrl);
  if (candidate === '') {
    return '';
  }
  try {
    const parsed = baseUrl ? new URL(candidate, baseUrl) : new URL(candidate);
    parsed.hash = '';
    parsed.search = '';
    const serialized = parsed.toString();
    return serialized.replace(/\/$/, '');
  } catch {
    return '';
  }
}
/**
 * Converts a raw scrape payload into the fixed page shape used by this module.
 *
 * @param {object} [page={}] - Raw page payload; `null` and non-objects are
 *   treated as an empty payload instead of throwing.
 * @param {string} [pageType=''] - Page role; falls back to `page.pageType`.
 * @returns {{url: string, pageType: string, markdown: string, summary: string,
 *   metadata: object, links: Array, images: Array, branding: object}}
 */
function normalizePageResult(page = {}, pageType = '') {
  // A default parameter only covers `undefined`; guard `null` and primitives too.
  const source = page && typeof page === 'object' ? page : {};
  const metadata = source.metadata && typeof source.metadata === 'object' ? source.metadata : {};
  // NOTE(review): Firecrawl scrape payloads appear to report the page address
  // under `metadata` (`url` / `sourceURL`) rather than as a top-level `url`;
  // confirm against the API reference. A top-level `url` still takes priority.
  const url = normalizeUrl(source.url)
    || normalizeUrl(metadata.url)
    || normalizeUrl(metadata.sourceURL);
  return {
    url,
    pageType: normalizeText(pageType || source.pageType),
    markdown: typeof source.markdown === 'string' ? source.markdown : '',
    summary: typeof source.summary === 'string' ? source.summary : '',
    metadata,
    links: Array.isArray(source.links) ? source.links : [],
    images: Array.isArray(source.images) ? source.images : [],
    branding: source.branding && typeof source.branding === 'object' ? source.branding : {},
  };
}
/**
 * Converts one raw link (a URL string or a link-like object) into `{ href, label }`.
 *
 * Objects may carry the URL under `href`, `url` or `link`, and the text under
 * `text`, `title` or `label`. When no text is available the label is the href.
 *
 * @param {*} link - Raw link entry.
 * @param {string} [baseUrl=''] - Base used to resolve relative URLs.
 * @returns {{href: string, label: string}|null} `null` when no usable URL is found.
 */
function normalizeLinkItem(link, baseUrl = '') {
  const isPlainString = typeof link === 'string';
  if (!isPlainString && (!link || typeof link !== 'object')) {
    return null;
  }

  const rawHref = isPlainString ? link : link.href || link.url || link.link;
  const href = normalizeUrl(rawHref, baseUrl);
  if (!href) {
    return null;
  }

  const label = isPlainString
    ? href
    : normalizeText(link.text || link.title || link.label) || href;
  return { href, label };
}
/**
 * Normalizes a list of raw links and removes entries whose href was already seen.
 *
 * @param {*} links - Raw link list; anything that is not an array yields `[]`.
 * @param {string} [baseUrl=''] - Base used to resolve relative URLs.
 * @returns {Array<{href: string, label: string}>} Unique links in first-seen order.
 */
function dedupeLinkItems(links, baseUrl = '') {
  const unique = [];
  if (!Array.isArray(links)) {
    return unique;
  }

  const seenHrefs = new Set();
  for (const rawLink of links) {
    const item = normalizeLinkItem(rawLink, baseUrl);
    if (item && !seenHrefs.has(item.href)) {
      seenHrefs.add(item.href);
      unique.push(item);
    }
  }
  return unique;
}
/**
 * Extracts the hostname of an absolute URL.
 *
 * @param {*} url - Candidate absolute URL.
 * @returns {string} The hostname, or `''` when the URL cannot be parsed.
 */
function getHostname(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return '';
  }
  return parsed.hostname;
}
/**
 * Extracts the lowercased path of an absolute URL.
 *
 * @param {*} url - Candidate absolute URL.
 * @returns {string} The lowercase pathname, or `''` when the URL cannot be parsed.
 */
function getPathname(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return '';
  }
  const { pathname } = parsed;
  return pathname.toLowerCase();
}
/**
 * Reports whether two absolute URLs belong to the same site.
 *
 * A leading `www.` is ignored on both sides, so the apex and `www` forms of a
 * domain match each other. If either URL cannot be parsed, or has no hostname
 * (e.g. `mailto:` links), the result is `false`; comparing two empty hostnames
 * must not count as a match.
 *
 * @param {string} url - Candidate URL.
 * @param {string} startUrl - URL of the site being crawled.
 * @returns {boolean} `true` when both hostnames are present and equivalent.
 */
function isSameDomain(url, startUrl) {
  const candidateHost = getHostname(url);
  const startHost = getHostname(startUrl);
  if (!candidateHost || !startHost) {
    return false;
  }
  const stripWww = (hostname) => hostname.replace(/^www\./, '');
  return stripWww(candidateHost) === stripWww(startHost);
}
/**
 * Builds the lowercase text the link classifiers match against: the URL path
 * plus the visible label.
 *
 * The hostname is left out on purpose. Callers filter links to the start URL's
 * hostname first, so it is identical for every candidate; matching keywords
 * against it would classify every link the same way on domains that contain
 * words such as "shop", "store" or "help".
 *
 * @param {{href?: string, label?: string}} link - Normalized link item.
 * @returns {string} Lowercase `"<path> <label>"` text.
 */
function getLinkMatchText(link) {
  const href = normalizeText(link?.href);
  const label = normalizeText(link?.label);
  // Unparseable hrefs (e.g. bare relative paths) have no pathname; match the raw text.
  const path = getPathname(href) || href;
  // Links without text get the href as their label; that would only re-add the hostname.
  const visibleLabel = label !== href ? label : '';
  return `${path} ${visibleLabel}`.toLowerCase();
}

/**
 * Reports whether a link points at a utility page (account, cart, policies,
 * support, blog, ...) rather than brand or product content.
 *
 * @param {{href?: string, label?: string}} link - Normalized link item.
 * @returns {boolean}
 */
function isUtilityLink(link) {
  const value = getLinkMatchText(link);
  return /(login|sign[- ]?in|sign[- ]?up|account|cart|checkout|wishlist|track|tracking|privacy|refund|return|shipping|terms|policy|policies|contact|support|help|faq|blog|blogs|journal|careers?|gift card|stores?)/i.test(value);
}

/**
 * Reports whether a link looks like an "about us" / brand-story page.
 *
 * @param {{href?: string, label?: string}} link - Normalized link item.
 * @returns {boolean}
 */
function isAboutLink(link) {
  const value = getLinkMatchText(link);
  return /(about|about-us|our story|our-story|story|brand story|who we are|who-we-are)/i.test(value);
}

/**
 * Reports whether a link looks like a product listing / catalog page.
 *
 * @param {{href?: string, label?: string}} link - Normalized link item.
 * @returns {boolean}
 */
function isDiscoveryLink(link) {
  const value = getLinkMatchText(link);
  return /(shop|products?|collections?|catalog|storefront|category|categories|new arrivals|best sellers|featured)/i.test(value);
}
/**
 * Scores how likely a link is to be an individual product page.
 *
 * Utility and about links are always rejected with -100. Listing-style
 * (discovery) links are rejected too, unless the path has a product marker
 * segment followed by a slug (e.g. `/products/blue-shirt`). The discovery
 * pattern itself contains words such as "product" and "shop", so without this
 * exception product-detail URLs would always be rejected.
 *
 * @param {{href?: string, label?: string}} link - Normalized link item.
 * @returns {number} -100 for rejected links, otherwise a heuristic score
 *   (callers keep scores above zero).
 */
function scoreProductCandidate(link) {
  if (!link || isUtilityLink(link) || isAboutLink(link)) return -100;

  const href = normalizeText(link.href).toLowerCase();
  const label = normalizeText(link.label);
  const pathname = getPathname(href);
  const segments = pathname.split('/').filter(Boolean);

  // A marker segment with something after it denotes a detail page, not a listing.
  const isProductDetailPath = /(^|\/)(product|products|p|item|items|buy)\/[^/]/i.test(pathname);
  if (!isProductDetailPath && isDiscoveryLink(link)) return -100;

  let score = 0;
  if (/(^|\/)(product|products|p|item|items|buy)(\/|$)/i.test(pathname)) score += 12;
  // A run of four or more digits usually is a SKU / product id.
  if (/\d{4,}/.test(pathname)) score += 5;
  // Nested listing paths (e.g. /collections/x/products/y) are weaker signals.
  if (pathname.includes('/collections/') || pathname.includes('/category/')) score -= 8;
  if (segments.length >= 2) score += 3;
  if (label.length >= 8 && label.length <= 120) score += 3;
  if (/buy|shop now|view product|details/i.test(label)) score += 4;
  // Count hyphens in the path only; hostname hyphens are the same for every candidate.
  if (pathname.split('-').length >= 4) score += 2;
  return score;
}
/**
 * Normalizes and de-duplicates raw links (resolved against `startUrl`), then
 * keeps only those on the same domain as `startUrl`.
 *
 * @param {*} links - Raw link list.
 * @param {string} startUrl - URL of the site being crawled.
 * @returns {Array<{href: string, label: string}>}
 */
function getSameDomainLinks(links, startUrl) {
  const candidates = dedupeLinkItems(links, startUrl);
  return candidates.filter(({ href }) => isSameDomain(href, startUrl));
}
/**
 * Picks the first same-domain link that looks like an about page.
 *
 * @param {*} links - Raw link list.
 * @param {string} startUrl - URL of the site being crawled.
 * @returns {string} The link's href, or `''` when none qualifies.
 */
function selectAboutUrl(links, startUrl) {
  for (const link of getSameDomainLinks(links, startUrl)) {
    if (isAboutLink(link)) {
      return link.href || '';
    }
  }
  return '';
}
/**
 * Picks the first same-domain link that looks like a product listing page and
 * is not a utility page.
 *
 * @param {*} links - Raw link list.
 * @param {string} startUrl - URL of the site being crawled.
 * @returns {string} The link's href, or `''` when none qualifies.
 */
function selectDiscoveryUrl(links, startUrl) {
  const sameDomainLinks = getSameDomainLinks(links, startUrl);
  const match = sameDomainLinks.find(
    (link) => isDiscoveryLink(link) && !isUtilityLink(link),
  );
  if (!match) {
    return '';
  }
  return match.href || '';
}
/**
 * Ranks same-domain links by product-likelihood and returns the best hrefs.
 *
 * Only links with a positive score are kept; ties keep their original order.
 *
 * @param {*} links - Raw link list.
 * @param {string} startUrl - URL of the site being crawled.
 * @param {number} [limit=DEFAULT_PRODUCT_PAGE_LIMIT] - Maximum number of URLs returned.
 * @returns {string[]} Product page URLs, highest score first.
 */
function selectProductUrls(links, startUrl, limit = DEFAULT_PRODUCT_PAGE_LIMIT) {
  const ranked = [];
  for (const link of getSameDomainLinks(links, startUrl)) {
    const score = scoreProductCandidate(link);
    if (score > 0) {
      ranked.push({ href: link.href, score });
    }
  }
  ranked.sort((a, b) => b.score - a.score);
  return ranked.slice(0, limit).map((entry) => entry.href);
}
/**
 * Scrapes a single page through the Firecrawl `/scrape` endpoint.
 *
 * @param {string} url - Page to scrape.
 * @param {object} [options]
 * @param {string[]} [options.formats] - Formats forwarded to the API.
 * @param {boolean} [options.onlyMainContent=true] - Forwarded to the API.
 * @param {string} [options.pageType=''] - Role recorded on the normalized result.
 * @returns {Promise<object>} The normalized page (see `normalizePageResult`).
 * @throws {Error} When the API answers with a non-2xx status.
 */
async function scrapePage(url, { formats, onlyMainContent = true, pageType = '' } = {}) {
  const payload = { url, formats, onlyMainContent };
  const { status, data } = await getClient().post('/scrape', payload);

  // The client resolves for every status, so failures must be detected here.
  const failed = status < 200 || status >= 300;
  if (failed) {
    const detail = JSON.stringify(data);
    throw new Error(`Firecrawl scrape failed for ${url} with status ${status}: ${detail}`);
  }

  const page = normalizeDataEnvelope(data);
  return normalizePageResult(page, pageType);
}
/**
 * Scrapes the homepage and derives which follow-up pages to scrape.
 *
 * A discovery (listing) URL is only chosen when the homepage itself exposes
 * no product links.
 *
 * @param {string} startUrl - Absolute URL of the brand's site.
 * @returns {Promise<{startUrl: string, homepage: object, aboutUrl: string,
 *   discoveryUrl: string, productUrls: string[]}>}
 * @throws {Error} When `startUrl` is not a parseable absolute URL, or when the
 *   homepage scrape fails.
 */
async function buildBrandContextPlan(startUrl) {
  const normalizedStartUrl = normalizeUrl(startUrl);
  // Fail before spending an API call on a URL that cannot be parsed.
  if (!normalizedStartUrl) {
    throw new Error('Brand context plan requires a valid absolute start URL');
  }
  const homepage = await scrapePage(normalizedStartUrl, {
    formats: HOMEPAGE_FORMATS,
    // Keep the full page so navigation and footer links are available.
    onlyMainContent: false,
    pageType: 'home',
  });
  const homepageLinks = getSameDomainLinks(homepage.links, normalizedStartUrl);
  const aboutUrl = selectAboutUrl(homepageLinks, normalizedStartUrl);
  const directProductUrls = selectProductUrls(homepageLinks, normalizedStartUrl);
  const discoveryUrl = directProductUrls.length === 0
    ? selectDiscoveryUrl(homepageLinks, normalizedStartUrl)
    : '';
  return {
    startUrl: normalizedStartUrl,
    homepage,
    aboutUrl,
    discoveryUrl,
    productUrls: directProductUrls,
  };
}
/**
 * Executes a brand-context plan: scrapes the discovery page when needed, then
 * the about page and the product pages concurrently.
 *
 * Product page failures are dropped from the result. A failing discovery or
 * about scrape rejects the whole call.
 *
 * @param {object} plan - Plan shaped like the result of `buildBrandContextPlan`.
 * @returns {Promise<{startUrl: string, homepage: object, aboutPage: object|null,
 *   discoveryPage: object|null, productPages: object[], items: object[]}>}
 * @throws {Error} When the plan lacks a valid start URL or a homepage scrape.
 */
async function collectBrandContextPages(plan) {
  const startUrl = normalizeUrl(plan?.startUrl);
  const homepage = plan?.homepage;
  if (!startUrl || !homepage) {
    throw new Error('Brand context plan is missing a homepage scrape');
  }

  const contentOptions = (pageType) => ({
    formats: CONTENT_PAGE_FORMATS,
    onlyMainContent: true,
    pageType,
  });

  let productUrls = [];
  if (Array.isArray(plan.productUrls)) {
    productUrls = plan.productUrls.slice(0, DEFAULT_PRODUCT_PAGE_LIMIT);
  }

  // Only fall back to the discovery page when the plan carries no product URLs.
  let discoveryPage = null;
  const needsDiscovery = productUrls.length === 0 && normalizeText(plan.discoveryUrl) !== '';
  if (needsDiscovery) {
    discoveryPage = await scrapePage(plan.discoveryUrl, {
      formats: DISCOVERY_PAGE_FORMATS,
      onlyMainContent: true,
      pageType: 'discovery',
    });
    productUrls = selectProductUrls(discoveryPage.links, startUrl);
  }

  let aboutTask = Promise.resolve(null);
  if (normalizeText(plan.aboutUrl)) {
    aboutTask = scrapePage(plan.aboutUrl, contentOptions('about'));
  }

  const productTasks = productUrls
    .slice(0, DEFAULT_PRODUCT_PAGE_LIMIT)
    .map((productUrl) => scrapePage(productUrl, contentOptions('product')).catch(() => null));

  const [aboutPage, productResults] = await Promise.all([
    aboutTask,
    Promise.all(productTasks),
  ]);

  const productPages = productResults.filter((page) => Boolean(page));
  const items = [homepage, aboutPage, discoveryPage, ...productPages].filter((page) => Boolean(page));

  return {
    startUrl,
    homepage,
    aboutPage,
    discoveryPage,
    productPages,
    items,
  };
}
// Public API of this module; every other function in the file stays private.
module.exports = {
scrapePage,
buildBrandContextPlan,
collectBrandContextPages,
};