agsamantha/node_modules/@langchain/community/dist/document_loaders/web/sitemap.d.ts
2024-10-02 15:15:21 -05:00

36 lines
1.4 KiB
TypeScript

import { Document, DocumentInterface } from "@langchain/core/documents";
import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js";
/**
* Options accepted when constructing a SitemapLoader.
*
* Extends WebBaseLoaderParams, so every option of the underlying Cheerio web
* loader is accepted in addition to the two sitemap-specific ones below.
*/
export interface SitemapLoaderParams extends WebBaseLoaderParams {
/**
* Optional list of URL filters. Only URLs that match one of the filters
* will be loaded.
*
* WARNING: every entry, including plain strings, is interpreted as a regular
* expression. Escape special characters (e.g. `.` or `?`) if a literal match
* is intended.
*/
filterUrls?: (string | RegExp)[];
/**
* The size to chunk the sitemap URLs into for scraping.
* @default {300}
*/
chunkSize?: number;
}
/**
* One sitemap entry, as returned by SitemapLoader.parseSitemap() and consumed
* by SitemapLoader._loadSitemapUrls().
*
* This alias is not exported; it is only visible through those method
* signatures. Only `loc` is required, and all values are kept as strings.
*
* NOTE(review): the field names match the child tags of a `<url>` entry in the
* sitemaps.org protocol (loc, changefreq, lastmod, priority) — presumably each
* holds the raw text of that tag; confirm against the implementation.
*/
type SiteMapElement = {
// Presumably the page URL of the entry — the only mandatory field.
loc: string;
// Optional; kept as an unparsed string (no enum/union is declared here).
changefreq?: string;
// Optional; kept as an unparsed string rather than a Date.
lastmod?: string;
// Optional; kept as an unparsed string rather than a number.
priority?: string;
};
/**
* Document loader built on CheerioWebBaseLoader that works from a sitemap.
*
* The class is declared to implement SitemapLoaderParams. Only the type
* surface is visible in this declaration file; notes on runtime behaviour
* below are marked as assumptions where the signatures alone do not settle it.
*/
export declare class SitemapLoader extends CheerioWebBaseLoader implements SitemapLoaderParams {
/** The path given to the constructor — presumably the sitemap's URL; confirm in the implementation. */
webPath: string;
/**
* URL patterns used for filtering, or undefined when none are set.
* NOTE(review): has the same element type as SitemapLoaderParams.filterUrls,
* so it is presumably populated from that option — confirm.
*/
allowUrlPatterns: (string | RegExp)[] | undefined;
/**
* Chunk size for scraping sitemap URLs. Always a number on the instance,
* even though the constructor option is optional (documented default: 300).
*/
chunkSize: number;
/**
* @param webPath - Stored as `webPath`; see the note on that property.
* @param params - Optional loader settings; may be omitted entirely.
*/
constructor(webPath: string, params?: SitemapLoaderParams);
/**
* Tests a single URL against the configured patterns.
* NOTE(review): the declaration does not show whether `true` means "keep"
* or "skip" — check the implementation before relying on the result.
* The leading underscore suggests this is intended for internal use.
*/
_checkUrlPatterns(url: string): boolean;
/**
* Asynchronously resolves to the list of sitemap entries.
* @returns A promise of the parsed SiteMapElement entries.
*/
parseSitemap(): Promise<SiteMapElement[]>;
/**
* Turns a list of sitemap entries into documents.
* The leading underscore suggests this is intended for internal use.
* @param elements - Sitemap entries, e.g. as produced by parseSitemap().
* @returns A promise of the resulting documents.
*/
_loadSitemapUrls(elements: Array<SiteMapElement>): Promise<DocumentInterface[]>;
/**
* Main loader entry point.
* @returns A promise of the loaded Document objects.
*/
load(): Promise<Document[]>;
}
// Empty export emitted alongside the non-exported SiteMapElement alias; it
// signals that only the declarations explicitly marked `export` are public.
export {};