107 lines
4.1 KiB
JavaScript
107 lines
4.1 KiB
JavaScript
|
import { Document } from "@langchain/core/documents";
|
||
|
import { chunkArray } from "@langchain/core/utils/chunk_array";
|
||
|
import { CheerioWebBaseLoader } from "./cheerio.js";
|
||
|
// Chunk size applied when the caller does not supply `chunkSize` in the
// loader params; `load` passes the resolved value to `chunkArray` to batch
// sitemap entries before scraping them.
const DEFAULT_CHUNK_SIZE = 300;
|
||
|
/**
 * Loader that reads a sitemap XML document, collects the `<loc>` of every
 * `<url>` and `<sitemap>` entry it lists, scrapes each of those locations
 * and returns one `Document` per location.
 */
export class SitemapLoader extends CheerioWebBaseLoader {
  /**
   * @param {string} webPath - Site root or direct sitemap URL. A single
   *   trailing "/" is dropped; unless the result ends with ".xml",
   *   "/sitemap.xml" is appended.
   * @param {object} [params] - Loader options, forwarded to the parent
   *   constructor. Read here: `filterUrls` (array of patterns handed to
   *   `new RegExp`) and `chunkSize` (entries scraped per batch, defaults to
   *   `DEFAULT_CHUNK_SIZE`).
   */
  constructor(webPath, params = {}) {
    // An explicit `chunkSize: undefined` (or null) must not override the
    // default, which is what a plain `{ chunkSize: DEFAULT, ...params }`
    // spread would let it do.
    const chunkSize =
      params.chunkSize === undefined || params.chunkSize === null
        ? DEFAULT_CHUNK_SIZE
        : params.chunkSize;
    const paramsWithDefaults = { ...params, chunkSize };

    const base = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
    // Allow for custom sitemap paths to be passed in with the url.
    const path = base.endsWith(".xml") ? base : `${base}/sitemap.xml`;
    super(path, paramsWithDefaults);

    // Own, enumerable, writable data properties holding their final values.
    const ownFields = {
      webPath: path,
      allowUrlPatterns: paramsWithDefaults.filterUrls,
      chunkSize,
    };
    for (const [name, value] of Object.entries(ownFields)) {
      Object.defineProperty(this, name, {
        enumerable: true,
        configurable: true,
        writable: true,
        value,
      });
    }
  }

  /**
   * Tests a URL against the configured patterns.
   *
   * NOTE(review): `parseSitemap` SKIPS a `<url>` entry when this returns
   * true, i.e. when the URL matches every pattern. The property name
   * `allowUrlPatterns` suggests an allow-list instead; confirm the intended
   * semantics before changing either side.
   *
   * @param {string} url - URL to test.
   * @returns {boolean} `false` when no patterns are configured; otherwise
   *   `true` only if every pattern matches `url` (so `true` for an empty
   *   pattern list).
   */
  _checkUrlPatterns(url) {
    if (!this.allowUrlPatterns) {
      return false;
    }
    // A fresh RegExp per test, so a caller-supplied pattern object is never
    // mutated by `.test`.
    return this.allowUrlPatterns.every((pattern) =>
      new RegExp(pattern).test(url)
    );
  }

  /**
   * Reads the sitemap fields of a single `<url>` or `<sitemap>` element.
   *
   * @param {Function} $ - Cheerio root of the parsed sitemap.
   * @param {object} element - The element to read.
   * @returns {{loc: string, changefreq: string, lastmod: string, priority: string} | null}
   *   The entry, or `null` when the element has no non-blank `<loc>` text.
   */
  _readSitemapEntry($, element) {
    const node = $(element);
    // Trim once up front so the empty check, the pattern test and the
    // scraper all see the same value that ends up in `metadata.source`.
    const loc = node.find("loc").text().trim();
    if (!loc) {
      return null;
    }
    return {
      loc,
      changefreq: node.find("changefreq").text(),
      lastmod: node.find("lastmod").text(),
      priority: node.find("priority").text(),
    };
  }

  /**
   * Fetches the sitemap at `this.webPath` in XML mode and lists its entries.
   *
   * `<url>` entries for which `_checkUrlPatterns` returns true are dropped;
   * `<sitemap>` entries are kept without pattern checks. All `<url>` entries
   * come before all `<sitemap>` entries in the result.
   *
   * @returns {Promise<Array<{loc: string, changefreq: string, lastmod: string, priority: string}>>}
   */
  async parseSitemap() {
    const $ = await CheerioWebBaseLoader._scrape(
      this.webPath,
      this.caller,
      this.timeout,
      this.textDecoder,
      {
        xmlMode: true,
        xml: true,
      }
    );

    const elements = [];
    $("url").each((_, element) => {
      const entry = this._readSitemapEntry($, element);
      if (entry === null || this._checkUrlPatterns(entry.loc)) {
        return;
      }
      elements.push(entry);
    });
    $("sitemap").each((_, element) => {
      const entry = this._readSitemapEntry($, element);
      if (entry !== null) {
        elements.push(entry);
      }
    });
    return elements;
  }

  /**
   * Scrapes the location of every given sitemap entry and builds documents.
   *
   * Relies on `CheerioWebBaseLoader.scrapeAll` returning results aligned by
   * index with the URLs it was given (presumably one Cheerio root per URL,
   * in order; verify against its implementation).
   *
   * @param {Array<{loc: string, changefreq: string, lastmod: string, priority: string}>} elements
   *   Entries as produced by `parseSitemap`.
   * @returns {Promise<Document[]>} One document per entry. `pageContent` is
   *   the text under `this.selector`; metadata carries the entry's sitemap
   *   fields plus `description`, `title`, `lang` read from the page's meta
   *   tags, and `source` (the trimmed location).
   * @throws {Error} If a scraped result has no matching entry.
   */
  async _loadSitemapUrls(elements) {
    const urls = elements.map((ele) => ele.loc);
    const pages = await CheerioWebBaseLoader.scrapeAll(
      urls,
      this.caller,
      this.timeout,
      this.textDecoder
    );

    return pages.map(($, i) => {
      const entry = elements[i];
      if (!entry) {
        throw new Error("Scraped docs and elements not in sync");
      }
      const { loc: source, ...metadata } = entry;
      return new Document({
        pageContent: $(this.selector).text(),
        metadata: {
          ...metadata,
          // extract page metadata
          description: $("meta[name='description']").attr("content"),
          title: $("meta[property='og:title']").attr("content"),
          lang: $("meta[property='og:locale']").attr("content"),
          source: source.trim(),
        },
      });
    });
  }

  /**
   * Parses the sitemap, then scrapes its entries in batches of
   * `this.chunkSize`, one batch after another.
   *
   * @returns {Promise<Document[]>} Documents for all entries, in batch order.
   */
  async load() {
    const elements = await this.parseSitemap();
    const chunks = chunkArray(elements, this.chunkSize);
    const documents = [];
    // `chunks` is a plain array, so a regular loop with an inner await is
    // enough to process batches sequentially.
    for (const chunk of chunks) {
      const chunkedDocuments = await this._loadSitemapUrls(chunk);
      documents.push(...chunkedDocuments);
    }
    return documents;
  }
}
|