This commit is contained in:
The Ghost of FOSS' Past 2024-10-02 17:29:42 -05:00
parent a6b33dc8d1
commit 71ab0488f5
4 changed files with 23 additions and 4 deletions

View file

@ -9,6 +9,7 @@ import { promisify } from 'util';
import readline from 'node:readline';
import ollama from 'ollama'
import { Readability } from "@mozilla/readability";
// Promisify exec so it can be awaited with async/await
const execPromise = promisify(exec);
@ -148,12 +149,10 @@ async function siteCrawler(hostname) {
try {
const crawled = generateSearchTerm(hostname);
const compiledConvert = compile({ wordwrap: 130 });
const loader = new RecursiveUrlLoader(crawled, {
extractor: compiledConvert,
extractor: Readability(document).parse(),
maxDepth: maxDepthCount,
excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org"],
excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org", "*.css"],
});
const webContents = await loader.load();
webContents.forEach(content => contexts.push(content));

9
node_modules/.package-lock.json generated vendored
View file

@ -570,6 +570,15 @@
"@langchain/core": ">=0.2.21 <0.4.0"
}
},
"node_modules/@mozilla/readability": {
"version": "0.5.0",
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
"license": "Apache-2.0",
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/@selderee/plugin-htmlparser2": {
"version": "0.11.0",
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",

10
package-lock.json generated
View file

@ -11,6 +11,7 @@
"dependencies": {
"@langchain/community": "^0.3.0",
"@langchain/core": "^0.2.33",
"@mozilla/readability": "^0.5.0",
"html-to-text": "^9.0.5",
"ignore": "^5.3.2",
"jsdom": "^25.0.0",
@ -585,6 +586,15 @@
"@langchain/core": ">=0.2.21 <0.4.0"
}
},
"node_modules/@mozilla/readability": {
"version": "0.5.0",
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
"license": "Apache-2.0",
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/@selderee/plugin-htmlparser2": {
"version": "0.11.0",
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",

View file

@ -11,6 +11,7 @@
"dependencies": {
"@langchain/community": "^0.3.0",
"@langchain/core": "^0.2.33",
"@mozilla/readability": "^0.5.0",
"html-to-text": "^9.0.5",
"ignore": "^5.3.2",
"jsdom": "^25.0.0",