import { Document } from "@langchain/core/documents";
import { CheerioWebBaseLoader } from "./cheerio.js";

/**
 * Loader for Hacker News pages, built on top of CheerioWebBaseLoader.
 *
 * Pages whose path contains "item" are treated as comment threads and
 * yield one Document per comment row. Every other page is treated as a
 * story listing and yields one Document per story row.
 */
export class HNLoader extends CheerioWebBaseLoader {
  /**
   * @param webPath Address of the Hacker News page to load.
   */
  constructor(webPath) {
    super(webPath);
    // Define webPath as an own, enumerable, writable data property on the
    // instance, regardless of what the parent constructor did with it.
    Object.defineProperty(this, "webPath", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: webPath,
    });
  }

  /**
   * Scrapes the page via `this.scrape()` and converts it into documents.
   *
   * The page kind is decided by a plain substring check: if `webPath`
   * contains "item" anywhere, the page is parsed with loadComments();
   * otherwise it is parsed with loadResults().
   *
   * @returns A Promise that resolves to an array of Document instances.
   */
  async load() {
    const $ = await this.scrape();
    if (this.webPath.includes("item")) {
      return this.loadComments($);
    }
    return this.loadResults($);
  }

  /**
   * Builds one Document per comment row of a comment-thread page.
   *
   * Comment rows are the `tr` elements whose class attribute is exactly
   * "athing comtr". The thread title is read from the `title` attribute of
   * the `tr` with id "pagespace" and is `undefined` when that row or
   * attribute is absent. Each Document uses the trimmed row text as its
   * page content and `{ source, title }` as metadata.
   *
   * @param $ A CheerioAPI instance.
   * @returns An array of Document instances (empty when no rows match).
   */
  loadComments($) {
    const title = $("tr[id='pagespace']").attr("title");
    return $("tr[class='athing comtr']")
      .toArray()
      .map(
        (comment) =>
          new Document({
            pageContent: $(comment).text().trim(),
            metadata: { source: this.webPath, title },
          })
      );
  }

  /**
   * Builds one Document per story row of a listing page.
   *
   * Story rows are the `tr` elements whose class attribute is exactly
   * "athing". For each row the ranking is the text of its "rank" span, the
   * link is the `href` of the first anchor inside its "titleline" span, and
   * the title is the trimmed text of that "titleline" span. The title is
   * used as the page content; `{ source, title, link, ranking }` is the
   * metadata.
   *
   * @param $ A CheerioAPI instance.
   * @returns An array of Document instances (empty when no rows match).
   */
  loadResults($) {
    return $("tr[class='athing']")
      .toArray()
      .map((item) => {
        const row = $(item);
        const ranking = row.find("span[class='rank']").text();
        const link = row.find("span[class='titleline'] a").attr("href");
        const title = row.find("span[class='titleline']").text().trim();
        return new Document({
          pageContent: title,
          metadata: {
            source: this.webPath,
            title,
            link,
            ranking,
          },
        });
      });
  }
}