agsamantha/node_modules/@langchain/community/dist/document_loaders/web/sitemap.js
2024-10-02 15:15:21 -05:00

106 lines
4.1 KiB
JavaScript

import { Document } from "@langchain/core/documents";
import { chunkArray } from "@langchain/core/utils/chunk_array";
import { CheerioWebBaseLoader } from "./cheerio.js";
// Number of sitemap entries handed to a single scrape batch when the caller
// does not supply `chunkSize` in the loader params.
const DEFAULT_CHUNK_SIZE = 300;
/**
 * Loader that reads a sitemap, scrapes every entry listed in it and returns
 * one `Document` per scraped page.
 *
 * Fetching is delegated to the static helpers of `CheerioWebBaseLoader`
 * (`_scrape` for the sitemap itself, `scrapeAll` for the listed pages).
 */
export class SitemapLoader extends CheerioWebBaseLoader {
    /**
     * @param {string} webPath Site root or full sitemap URL. When it does not
     *   end in `.xml`, `/sitemap.xml` is appended (after dropping one trailing
     *   slash).
     * @param {object} [params] Loader options, forwarded to the base class.
     * @param {(string|RegExp)[]} [params.filterUrls] Allow-list of patterns;
     *   when non-empty, only `<url>` entries matching at least one pattern
     *   are kept.
     * @param {number} [params.chunkSize] Number of entries scraped per batch.
     *   Anything that is not a positive number falls back to
     *   `DEFAULT_CHUNK_SIZE`.
     */
    constructor(webPath, params = {}) {
        const paramsWithDefaults = { chunkSize: DEFAULT_CHUNK_SIZE, ...params };
        let path = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
        // Allow for custom sitemap paths to be passed in with the url.
        path = path.endsWith(".xml") ? path : `${path}/sitemap.xml`;
        super(path, paramsWithDefaults);
        const { chunkSize, filterUrls } = paramsWithDefaults;
        // An explicit `chunkSize: undefined` (or 0 / NaN / negative) survives
        // the spread above and would make chunking produce no usable batches,
        // so guard against it here.
        const hasUsableChunkSize = typeof chunkSize === "number" && chunkSize > 0;
        Object.defineProperty(this, "webPath", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: path
        });
        Object.defineProperty(this, "allowUrlPatterns", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: filterUrls
        });
        Object.defineProperty(this, "chunkSize", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: hasUsableChunkSize ? chunkSize : DEFAULT_CHUNK_SIZE
        });
    }
    /**
     * Decides whether a sitemap URL must be skipped.
     *
     * @param {string} url URL taken from a `<loc>` element.
     * @returns {boolean} `true` when the URL should be skipped, i.e. patterns
     *   are configured and none of them matches; `false` when no patterns are
     *   configured (missing or empty list) or at least one matches.
     */
    _checkUrlPatterns(url) {
        const patterns = this.allowUrlPatterns;
        if (!patterns || patterns.length === 0) {
            return false;
        }
        // A fresh RegExp per test keeps caller-supplied /g or /y patterns from
        // carrying `lastIndex` state between URLs.
        const isAllowed = patterns.some((pattern) => new RegExp(pattern).test(url));
        return !isAllowed;
    }
    /**
     * Reads the fields of a single `<url>` or `<sitemap>` element.
     *
     * @param {Function} $ Query function returned by the scraper.
     * @param {*} element Element handed to the `.each` callback.
     * @returns {{loc: string, changefreq: string, lastmod: string, priority: string} | null}
     *   The entry, or `null` when the element has no usable `<loc>`.
     */
    _extractEntry($, element) {
        const node = $(element);
        // Pretty-printed sitemaps pad <loc> with whitespace; trim once here so
        // filtering and fetching both see the clean URL.
        const loc = node.find("loc").text().trim();
        if (!loc) {
            return null;
        }
        const changefreq = node.find("changefreq").text();
        const lastmod = node.find("lastmod").text();
        const priority = node.find("priority").text();
        return { loc, changefreq, lastmod, priority };
    }
    /**
     * Fetches the sitemap at `this.webPath` and lists its entries.
     *
     * `<url>` entries are subject to the `filterUrls` allow-list; `<sitemap>`
     * entries are always included. `<url>` entries come first in the result.
     *
     * @returns {Promise<Array<{loc: string, changefreq: string, lastmod: string, priority: string}>>}
     */
    async parseSitemap() {
        const $ = await CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
            xmlMode: true,
            xml: true,
        });
        const elements = [];
        $("url").each((_, element) => {
            const entry = this._extractEntry($, element);
            if (entry && !this._checkUrlPatterns(entry.loc)) {
                elements.push(entry);
            }
        });
        $("sitemap").each((_, element) => {
            const entry = this._extractEntry($, element);
            if (entry) {
                elements.push(entry);
            }
        });
        return elements;
    }
    /**
     * Scrapes every entry of one batch and converts the results to documents.
     *
     * @param {Array<{loc: string}>} elements Sitemap entries to scrape.
     * @returns {Promise<Document[]>} One document per entry, in input order.
     *   The sitemap fields other than `loc` are copied into the metadata next
     *   to the page's description, title and locale meta tags.
     * @throws {Error} When a scraped page has no sitemap entry at its index.
     */
    async _loadSitemapUrls(elements) {
        const all = await CheerioWebBaseLoader.scrapeAll(elements.map((ele) => ele.loc), this.caller, this.timeout, this.textDecoder);
        const documents = all.map(($, i) => {
            if (!elements[i]) {
                throw new Error("Scraped docs and elements not in sync");
            }
            const text = $(this.selector).text();
            const { loc: source, ...metadata } = elements[i];
            // extract page metadata
            const description = $("meta[name='description']").attr("content");
            const title = $("meta[property='og:title']").attr("content");
            const lang = $("meta[property='og:locale']").attr("content");
            return new Document({
                pageContent: text,
                metadata: {
                    ...metadata,
                    description,
                    title,
                    lang,
                    source: source.trim(),
                },
            });
        });
        return documents;
    }
    /**
     * Parses the sitemap and loads all listed pages, one batch of
     * `this.chunkSize` entries at a time (batches run sequentially).
     *
     * @returns {Promise<Document[]>} Documents for every sitemap entry.
     */
    async load() {
        const elements = await this.parseSitemap();
        const chunks = chunkArray(elements, this.chunkSize);
        const documents = [];
        // `chunks` is a plain array, so an ordinary loop is enough; each batch
        // is awaited before the next one starts.
        for (const chunk of chunks) {
            const chunkedDocuments = await this._loadSitemapUrls(chunk);
            documents.push(...chunkedDocuments);
        }
        return documents;
    }
}