"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.WebBrowser = exports.getText = exports.parseInputs = void 0; const documents_1 = require("@langchain/core/documents"); const axios_1 = __importDefault(require("axios")); const cheerio = __importStar(require("cheerio")); const env_1 = require("@langchain/core/utils/env"); const tools_1 = require("@langchain/core/tools"); const runnables_1 = require("@langchain/core/runnables"); const output_parsers_1 = require("@langchain/core/output_parsers"); const text_splitter_js_1 = require("../text_splitter.cjs"); const memory_js_1 = require("../vectorstores/memory.cjs"); const axios_fetch_adapter_js_1 = __importDefault(require("../util/axios-fetch-adapter.cjs")); const document_js_1 = require("../util/document.cjs"); const parseInputs = (inputs) => { const [baseUrl, task] = inputs.split(",").map((input) => { let t = input.trim(); t = t.startsWith('"') ? t.slice(1) : t; t = t.endsWith('"') ? t.slice(0, -1) : t; // it likes to put / at the end of urls, wont matter for task t = t.endsWith("/") ? t.slice(0, -1) : t; return t.trim(); }); return [baseUrl, task]; }; exports.parseInputs = parseInputs; const getText = (html, baseUrl, summary) => { // scriptingEnabled so noscript elements are parsed const $ = cheerio.load(html, { scriptingEnabled: true }); let text = ""; // lets only get the body if its a summary, dont need to summarize header or footer etc const rootElement = summary ? "body " : "*"; // eslint-disable-next-line @typescript-eslint/no-explicit-any $(`${rootElement}:not(style):not(script):not(svg)`).each((_i, elem) => { // we dont want duplicated content as we drill down so remove children let content = $(elem).clone().children().remove().end().text().trim(); const $el = $(elem); // if its an ahref, print the content and url let href = $el.attr("href"); if ($el.prop("tagName")?.toLowerCase() === "a" && href) { if (!href.startsWith("http")) { try { href = new URL(href, baseUrl).toString(); } catch { // if this fails thats fine, just no url for this href = ""; } } const imgAlt = $el.find("img[alt]").attr("alt")?.trim(); if (imgAlt) { content += ` ${imgAlt}`; } text += ` [${content}](${href})`; } // otherwise just print the content else if (content !== "") { text += ` ${content}`; } }); return text.trim().replace(/\n+/g, " "); }; exports.getText = getText; const getHtml = async (baseUrl, h, config) => { const axios = ("default" in axios_1.default ? axios_1.default.default : axios_1.default); const domain = new URL(baseUrl).hostname; const headers = { ...h }; // these appear to be positional, which means they have to exist in the headers passed in headers.Host = domain; headers["Alt-Used"] = domain; let htmlResponse; try { htmlResponse = await axios.get(baseUrl, { ...config, headers, }); } catch (e) { if (axios.isAxiosError(e) && e.response && e.response.status) { throw new Error(`http response ${e.response.status}`); } throw e; } const allowedContentTypes = [ "text/html", "application/json", "application/xml", "application/javascript", "text/plain", ]; const contentType = htmlResponse.headers["content-type"]; const contentTypeArray = contentType.split(";"); if (contentTypeArray[0] && !allowedContentTypes.includes(contentTypeArray[0])) { throw new Error("returned page was not utf8"); } return htmlResponse.data; }; const DEFAULT_HEADERS = { Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate", "Accept-Language": "en-US,en;q=0.5", "Alt-Used": "LEAVE-THIS-KEY-SET-BY-TOOL", Connection: "keep-alive", Host: "LEAVE-THIS-KEY-SET-BY-TOOL", Referer: "https://www.google.com/", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "cross-site", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", }; /** * A class designed to interact with web pages, either to extract * information from them or to summarize their content. It uses the axios * library to send HTTP requests and the cheerio library to parse the * returned HTML. * @example * ```typescript * const browser = new WebBrowser({ * model: new ChatOpenAI({ temperature: 0 }), * embeddings: new OpenAIEmbeddings({}), * }); * const result = await browser.invoke("https:exampleurl.com"); * ``` */ class WebBrowser extends tools_1.Tool { static lc_name() { return "WebBrowser"; } get lc_namespace() { return [...super.lc_namespace, "webbrowser"]; } constructor({ model, headers, embeddings, axiosConfig, textSplitter, }) { super(...arguments); Object.defineProperty(this, "model", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "embeddings", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "headers", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "axiosConfig", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "textSplitter", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "name", { enumerable: true, configurable: true, writable: true, value: "web-browser" }); Object.defineProperty(this, "description", { enumerable: true, configurable: true, writable: true, value: `useful for when you need to find something on or summarize a webpage. input should be a comma separated list of "ONE valid http URL including protocol","what you want to find on the page or empty string for a summary".` }); this.model = model; this.embeddings = embeddings; this.headers = headers ?? DEFAULT_HEADERS; this.axiosConfig = { withCredentials: true, adapter: (0, env_1.isNode)() ? undefined : axios_fetch_adapter_js_1.default, ...axiosConfig, }; this.textSplitter = textSplitter ?? new text_splitter_js_1.RecursiveCharacterTextSplitter({ chunkSize: 2000, chunkOverlap: 200, }); } /** @ignore */ async _call(inputs, runManager) { const [baseUrl, task] = (0, exports.parseInputs)(inputs); const doSummary = !task; let text; try { const html = await getHtml(baseUrl, this.headers, this.axiosConfig); text = (0, exports.getText)(html, baseUrl, doSummary); } catch (e) { if (e) { return e.toString(); } return "There was a problem connecting to the site"; } const texts = await this.textSplitter.splitText(text); let context; // if we want a summary grab first 4 if (doSummary) { context = texts.slice(0, 4).join("\n"); } // search term well embed and grab top 4 else { const docs = texts.map((pageContent) => new documents_1.Document({ pageContent, metadata: [], })); const vectorStore = await memory_js_1.MemoryVectorStore.fromDocuments(docs, this.embeddings); const results = await vectorStore.similaritySearch(task, 4, undefined, runManager?.getChild("vectorstore")); context = (0, document_js_1.formatDocumentsAsString)(results); } const input = `Text:${context}\n\nI need ${doSummary ? "a summary" : task} from the above text, also provide up to 5 markdown links from within that would be of interest (always including URL and text). Links should be provided, if present, in markdown syntax as a list under the heading "Relevant Links:".`; const chain = runnables_1.RunnableSequence.from([this.model, new output_parsers_1.StringOutputParser()]); return chain.invoke(input, runManager?.getChild()); } } exports.WebBrowser = WebBrowser;