83 lines
3.3 KiB
JavaScript
83 lines
3.3 KiB
JavaScript
"use strict";
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.HNLoader = void 0;
|
|
const documents_1 = require("@langchain/core/documents");
|
|
const cheerio_js_1 = require("./cheerio.cjs");
|
|
/**
|
|
* A class that extends the CheerioWebBaseLoader class. It represents a
|
|
* loader for loading web pages from the Hacker News website.
|
|
*/
|
|
class HNLoader extends cheerio_js_1.CheerioWebBaseLoader {
|
|
constructor(webPath) {
|
|
super(webPath);
|
|
Object.defineProperty(this, "webPath", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: webPath
|
|
});
|
|
}
|
|
/**
|
|
* An asynchronous method that loads the web page. If the webPath includes
|
|
* "item", it calls the loadComments() method to load the comments from
|
|
* the web page. Otherwise, it calls the loadResults() method to load the
|
|
* results from the web page.
|
|
* @returns A Promise that resolves to an array of Document instances.
|
|
*/
|
|
async load() {
|
|
const $ = await this.scrape();
|
|
if (this.webPath.includes("item")) {
|
|
return this.loadComments($);
|
|
}
|
|
return this.loadResults($);
|
|
}
|
|
/**
|
|
* A private method that loads the comments from the web page. It selects
|
|
* the elements with the class "athing comtr" using the $ function
|
|
* provided by Cheerio. It also extracts the title of the web page from
|
|
* the element with the id "pagespace". It creates Document instances for
|
|
* each comment, with the comment text as the page content and the source
|
|
* and title as metadata.
|
|
* @param $ A CheerioAPI instance.
|
|
* @returns An array of Document instances.
|
|
*/
|
|
loadComments($) {
|
|
const comments = $("tr[class='athing comtr']");
|
|
const title = $("tr[id='pagespace']").attr("title");
|
|
const documents = [];
|
|
comments.each((_index, comment) => {
|
|
const text = $(comment).text().trim();
|
|
const metadata = { source: this.webPath, title };
|
|
documents.push(new documents_1.Document({ pageContent: text, metadata }));
|
|
});
|
|
return documents;
|
|
}
|
|
/**
|
|
* A private method that loads the results from the web page. It selects
|
|
* the elements with the class "athing" using the $ function provided by
|
|
* Cheerio. It extracts the ranking, link, title, and other metadata from
|
|
* each result item. It creates Document instances for each result item,
|
|
* with the title as the page content and the source, title, link, and
|
|
* ranking as metadata.
|
|
* @param $ A CheerioAPI instance.
|
|
* @returns An array of Document instances.
|
|
*/
|
|
loadResults($) {
|
|
const items = $("tr[class='athing']");
|
|
const documents = [];
|
|
items.each((_index, item) => {
|
|
const ranking = $(item).find("span[class='rank']").text();
|
|
const link = $(item).find("span[class='titleline'] a").attr("href");
|
|
const title = $(item).find("span[class='titleline']").text().trim();
|
|
const metadata = {
|
|
source: this.webPath,
|
|
title,
|
|
link,
|
|
ranking,
|
|
};
|
|
documents.push(new documents_1.Document({ pageContent: title, metadata }));
|
|
});
|
|
return documents;
|
|
}
|
|
}
|
|
exports.HNLoader = HNLoader;
|