// 33 lines · 1.3 KiB · JavaScript
import { Document } from "@langchain/core/documents";
import { CheerioWebBaseLoader } from "./cheerio.js";

/**
 * A loader for web pages from the IMSDB (Internet Movie Script Database)
 * website. It extends CheerioWebBaseLoader and narrows the loaded page
 * down to the text of the script table cell.
 */
export class IMSDBLoader extends CheerioWebBaseLoader {
  /**
   * Creates a loader for a single page.
   *
   * @param webPath - Location of the page to load. It is forwarded
   *   unchanged to the CheerioWebBaseLoader constructor and later
   *   reported as the `source` in the metadata of the loaded document.
   */
  constructor(webPath) {
    super(webPath);
    // Explicitly (re)define `webPath` as an own, enumerable, writable and
    // configurable data property, so that `load()` reads this value no
    // matter how the base class stored the constructor argument.
    Object.defineProperty(this, "webPath", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: webPath,
    });
  }

  /**
   * Loads the page through the `scrape()` method inherited from the base
   * class, then selects `td` elements whose `class` attribute is exactly
   * `scrtext` and extracts their text content with surrounding whitespace
   * trimmed. The text becomes the page content of a single Document whose
   * metadata records `webPath` as the source.
   *
   * @returns A promise that resolves to an array containing exactly one
   *   Document instance.
   */
  async load() {
    const $ = await this.scrape();
    // Attribute-equals selector: the class attribute must be exactly
    // "scrtext", not merely contain it among other class names.
    const text = $("td[class='scrtext']").text().trim();
    const metadata = { source: this.webPath };
    return [new Document({ pageContent: text, metadata })];
  }
}