agsamantha/node_modules/langchain/dist/tools/webbrowser.cjs

256 lines
10 KiB
JavaScript
Raw Normal View History

2024-10-02 15:15:21 -05:00
"use strict";
// TypeScript-emitted interop helper: exposes property `k` of module `m` on
// object `o` under the name `k2` (which defaults to `k`).
// An existing `this.__createBinding` is reused when one is already defined.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Replace the descriptor with a live getter when: there is no own descriptor,
// an accessor descriptor belongs to a module not flagged `__esModule`, or a
// data descriptor is writable/configurable. Otherwise the original descriptor is reused.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when Object.create is unavailable: copy the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// TypeScript-emitted interop helper: stores `value` as the "default" export of
// `target`. An existing `this.__setModuleDefault` is reused when present;
// otherwise the implementation is picked once, based on Object.create support.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        // Defined property: enumerable, but neither writable nor configurable.
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    // Fallback: plain assignment.
    return function (target, value) {
        target["default"] = value;
    };
})();
// TypeScript-emitted interop helper for `import * as x`: modules flagged
// `__esModule` are returned untouched; anything else is wrapped in a fresh
// namespace object whose own non-"default" keys are bound via __createBinding
// and whose "default" is the module itself.
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            if (key !== "default" && Object.prototype.hasOwnProperty.call(mod, key)) {
                __createBinding(namespace, mod, key);
            }
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
// TypeScript-emitted interop helper for default imports: modules flagged
// `__esModule` are returned as-is; any other value is wrapped as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.WebBrowser = exports.getText = exports.parseInputs = void 0;
const documents_1 = require("@langchain/core/documents");
const axios_1 = __importDefault(require("axios"));
const cheerio = __importStar(require("cheerio"));
const env_1 = require("@langchain/core/utils/env");
const tools_1 = require("@langchain/core/tools");
const runnables_1 = require("@langchain/core/runnables");
const output_parsers_1 = require("@langchain/core/output_parsers");
const text_splitter_js_1 = require("../text_splitter.cjs");
const memory_js_1 = require("../vectorstores/memory.cjs");
const axios_fetch_adapter_js_1 = __importDefault(require("../util/axios-fetch-adapter.cjs"));
const document_js_1 = require("../util/document.cjs");
/**
 * Splits the tool's raw input string into a URL and an optional task.
 *
 * The expected shape is `"<url>","<task>"`, but unquoted input is accepted too.
 * The input is split exactly once, so commas inside the task are preserved:
 *  - if the input starts with a double quote, the split point is the first
 *    comma that directly follows a closing quote (so a quoted URL may itself
 *    contain commas);
 *  - otherwise the split point is the first comma.
 *
 * Each part is trimmed, stripped of one leading and one trailing double quote,
 * and stripped of one trailing slash.
 *
 * @param {string} inputs - Raw tool input.
 * @returns {[string, string | undefined]} `[baseUrl, task]`; `task` is
 *   `undefined` when the input contains no separator.
 */
const parseInputs = (inputs) => {
    const clean = (raw) => {
        let t = raw.trim();
        if (t.startsWith('"')) {
            t = t.slice(1);
        }
        if (t.endsWith('"')) {
            t = t.slice(0, -1);
        }
        // it likes to put / at the end of urls, wont matter for task
        if (t.endsWith("/")) {
            t = t.slice(0, -1);
        }
        return t.trim();
    };
    const leadingWhitespace = inputs.length - inputs.trimStart().length;
    let separatorIndex = -1;
    if (inputs[leadingWhitespace] === '"') {
        // Quoted URL: look for the closing quote followed by the separating comma.
        const closing = /"\s*,/.exec(inputs.slice(leadingWhitespace + 1));
        if (closing) {
            separatorIndex = leadingWhitespace + 1 + closing.index + closing[0].length - 1;
        }
    }
    if (separatorIndex === -1) {
        separatorIndex = inputs.indexOf(",");
    }
    if (separatorIndex === -1) {
        return [clean(inputs), undefined];
    }
    return [
        clean(inputs.slice(0, separatorIndex)),
        clean(inputs.slice(separatorIndex + 1)),
    ];
};
exports.parseInputs = parseInputs;
/**
 * Flattens an HTML document into one line of text, rendering anchors that
 * carry an `href` as markdown-style `[text](url)` links.
 *
 * @param {string} html - Markup to parse.
 * @param {string} baseUrl - Base used to resolve hrefs that do not start with "http".
 * @param {boolean} summary - When truthy, only descendants of `<body>` are visited;
 *   otherwise every element is visited.
 * @returns {string} Extracted text, trimmed, with newline runs collapsed to a space.
 */
const getText = (html, baseUrl, summary) => {
    // scriptingEnabled so noscript elements are parsed
    const $ = cheerio.load(html, { scriptingEnabled: true });
    // a summary only needs what sits under <body>
    const scope = summary ? "body " : "*";
    // Resolves an href against baseUrl unless it already starts with "http";
    // an href that cannot be resolved becomes an empty string.
    const resolveHref = (href) => {
        if (href.startsWith("http")) {
            return href;
        }
        try {
            return new URL(href, baseUrl).toString();
        }
        catch {
            // if this fails thats fine, just no url for this
            return "";
        }
    };
    const pieces = [];
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    $(`${scope}:not(style):not(script):not(svg)`).each((_index, node) => {
        const $node = $(node);
        // drop child elements from a clone so each element contributes only its own text
        const ownText = $node.clone().children().remove().end().text().trim();
        const rawHref = $node.attr("href");
        const isLink = $node.prop("tagName")?.toLowerCase() === "a" && rawHref;
        if (!isLink) {
            // plain element: emit its text when there is any
            if (ownText !== "") {
                pieces.push(` ${ownText}`);
            }
            return;
        }
        // anchor: emit label and url, using an image's alt text as extra label
        const altText = $node.find("img[alt]").attr("alt")?.trim();
        const label = altText ? `${ownText} ${altText}` : ownText;
        pieces.push(` [${label}](${resolveHref(rawHref)})`);
    });
    return pieces.join("").trim().replace(/\n+/g, " ");
};
exports.getText = getText;
/**
 * Fetches `baseUrl` with axios and returns the response body.
 *
 * @param {string} baseUrl - Absolute URL to request.
 * @param {object} h - Header template; it is copied, and the copy's `Host` and
 *   `Alt-Used` entries are overwritten with the URL's host.
 * @param {object} config - Extra axios request options, spread into the request.
 * @returns {Promise<*>} The `data` field of the axios response.
 * @throws {Error} `http response <status>` when axios reports an error that
 *   carries a response status (the axios error is attached as `cause`);
 *   `returned page was not utf8` when the response declares a media type
 *   outside the allow-list. Any other failure (including an invalid
 *   `baseUrl`) is rethrown as-is.
 */
const getHtml = async (baseUrl, h, config) => {
    const axios = ("default" in axios_1.default ? axios_1.default.default : axios_1.default);
    // `host` keeps a non-default port (`hostname` would drop it), which the
    // Host header needs in order to address the same origin as the URL.
    const { host } = new URL(baseUrl);
    const headers = { ...h };
    // these appear to be positional, which means they have to exist in the headers passed in
    headers.Host = host;
    headers["Alt-Used"] = host;
    let htmlResponse;
    try {
        htmlResponse = await axios.get(baseUrl, {
            ...config,
            headers,
        });
    }
    catch (e) {
        if (axios.isAxiosError(e) && e.response && e.response.status) {
            throw new Error(`http response ${e.response.status}`, { cause: e });
        }
        throw e;
    }
    const allowedContentTypes = [
        "text/html",
        "application/json",
        "application/xml",
        "application/javascript",
        "text/plain",
    ];
    const rawContentType = htmlResponse.headers["content-type"];
    // A missing header is treated like an empty one (allowed) instead of
    // crashing on `.split`; media types are compared case-insensitively.
    const mediaType = typeof rawContentType === "string"
        ? rawContentType.split(";")[0].trim().toLowerCase()
        : "";
    if (mediaType && !allowedContentTypes.includes(mediaType)) {
        throw new Error("returned page was not utf8");
    }
    return htmlResponse.data;
};
// Request headers used by WebBrowser when the caller supplies none.
// The values mimic a desktop Firefox navigation request.
// Key order is kept as-is on purpose: getHtml notes that the headers appear to
// be positional, so the placeholder keys below must stay present.
const DEFAULT_HEADERS = {
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-US,en;q=0.5",
// placeholder: getHtml overwrites this with a value derived from the request URL
"Alt-Used": "LEAVE-THIS-KEY-SET-BY-TOOL",
Connection: "keep-alive",
// placeholder: getHtml overwrites this with a value derived from the request URL
Host: "LEAVE-THIS-KEY-SET-BY-TOOL",
Referer: "https://www.google.com/",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "cross-site",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
};
/**
 * A class designed to interact with web pages, either to extract
 * information from them or to summarize their content. It uses the axios
 * library to send HTTP requests and the cheerio library to parse the
 * returned HTML.
 *
 * The tool input is a single string of the form `"<url>","<task>"`; an
 * empty or missing task requests a summary of the page.
 * @example
 * ```typescript
 * const browser = new WebBrowser({
 *   model: new ChatOpenAI({ temperature: 0 }),
 *   embeddings: new OpenAIEmbeddings({}),
 * });
 * const result = await browser.invoke('"https://exampleurl.com",""');
 * ```
 */
class WebBrowser extends tools_1.Tool {
    static lc_name() {
        return "WebBrowser";
    }
    get lc_namespace() {
        return [...super.lc_namespace, "webbrowser"];
    }
    /**
     * @param {object} fields
     * @param fields.model - Language model that produces the final answer.
     * @param fields.embeddings - Embeddings used to rank page chunks against the task.
     * @param [fields.headers] - Request headers; defaults to DEFAULT_HEADERS.
     * @param [fields.axiosConfig] - Extra axios options, merged over the defaults set here.
     * @param [fields.textSplitter] - Splitter for the page text; defaults to a
     *   RecursiveCharacterTextSplitter with chunkSize 2000 and chunkOverlap 200.
     */
    constructor({ model, headers, embeddings, axiosConfig, textSplitter, }) {
        super(...arguments);
        // Compiler-emitted field declarations: each field is defined on the
        // instance before it receives its value further down.
        Object.defineProperty(this, "model", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "embeddings", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "headers", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "axiosConfig", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "textSplitter", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "name", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "web-browser"
        });
        Object.defineProperty(this, "description", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: `useful for when you need to find something on or summarize a webpage. input should be a comma separated list of "ONE valid http URL including protocol","what you want to find on the page or empty string for a summary".`
        });
        this.model = model;
        this.embeddings = embeddings;
        this.headers = headers ?? DEFAULT_HEADERS;
        this.axiosConfig = {
            withCredentials: true,
            // outside Node the fetch-based adapter is used; in Node axios picks its own
            adapter: (0, env_1.isNode)() ? undefined : axios_fetch_adapter_js_1.default,
            // caller-supplied options win over the defaults above
            ...axiosConfig,
        };
        this.textSplitter =
            textSplitter ??
                new text_splitter_js_1.RecursiveCharacterTextSplitter({
                    chunkSize: 2000,
                    chunkOverlap: 200,
                });
    }
    /**
     * Fetches the page, selects up to four relevant chunks of its text and
     * asks the model for either a summary or the requested information.
     *
     * Failures while fetching or parsing the page are not thrown: their
     * string form is returned as the tool output.
     *
     * @param {string} inputs - Raw tool input, parsed by parseInputs.
     * @param [runManager] - Callback manager; child managers are handed to the
     *   vector search and to the model chain.
     * @returns {Promise<string>} The model's answer, or an error description.
     * @ignore
     */
    async _call(inputs, runManager) {
        const [baseUrl, task] = (0, exports.parseInputs)(inputs);
        // an empty or missing task means "summarize the page"
        const doSummary = !task;
        let text;
        try {
            const html = await getHtml(baseUrl, this.headers, this.axiosConfig);
            text = (0, exports.getText)(html, baseUrl, doSummary);
        }
        catch (e) {
            if (e) {
                return e.toString();
            }
            return "There was a problem connecting to the site";
        }
        const texts = await this.textSplitter.splitText(text);
        let context;
        // if we want a summary grab first 4
        if (doSummary) {
            context = texts.slice(0, 4).join("\n");
        }
        // search term well embed and grab top 4
        else {
            const docs = texts.map((pageContent) => new documents_1.Document({
                pageContent,
                // metadata is a key/value record, so pass an empty object rather than an array
                metadata: {},
            }));
            const vectorStore = await memory_js_1.MemoryVectorStore.fromDocuments(docs, this.embeddings);
            const results = await vectorStore.similaritySearch(task, 4, undefined, runManager?.getChild("vectorstore"));
            context = (0, document_js_1.formatDocumentsAsString)(results);
        }
        const input = `Text:${context}\n\nI need ${doSummary ? "a summary" : task} from the above text, also provide up to 5 markdown links from within that would be of interest (always including URL and text). Links should be provided, if present, in markdown syntax as a list under the heading "Relevant Links:".`;
        const chain = runnables_1.RunnableSequence.from([this.model, new output_parsers_1.StringOutputParser()]);
        return chain.invoke(input, runManager?.getChild());
    }
}
exports.WebBrowser = WebBrowser;