This commit is contained in:
The Ghost of FOSS' Past 2024-10-02 17:29:42 -05:00
parent a6b33dc8d1
commit 71ab0488f5
4 changed files with 23 additions and 4 deletions

View file

@ -9,6 +9,7 @@ import { promisify } from 'util';
import readline from 'node:readline'; import readline from 'node:readline';
import ollama from 'ollama' import ollama from 'ollama'
import { Readability } from "@mozilla/readability";
// Promisify exec for using async/await // Promisify exec for using async/await
const execPromise = promisify(exec); const execPromise = promisify(exec);
@ -148,12 +149,10 @@ async function siteCrawler(hostname) {
try { try {
const crawled = generateSearchTerm(hostname); const crawled = generateSearchTerm(hostname);
const compiledConvert = compile({ wordwrap: 130 });
const loader = new RecursiveUrlLoader(crawled, { const loader = new RecursiveUrlLoader(crawled, {
extractor: compiledConvert, extractor: Readability(document).parse(),
maxDepth: maxDepthCount, maxDepth: maxDepthCount,
excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org"], excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org", "*.css"],
}); });
const webContents = await loader.load(); const webContents = await loader.load();
webContents.forEach(content => contexts.push(content)); webContents.forEach(content => contexts.push(content));

9
node_modules/.package-lock.json generated vendored
View file

@ -570,6 +570,15 @@
"@langchain/core": ">=0.2.21 <0.4.0" "@langchain/core": ">=0.2.21 <0.4.0"
} }
}, },
"node_modules/@mozilla/readability": {
"version": "0.5.0",
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
"license": "Apache-2.0",
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/@selderee/plugin-htmlparser2": { "node_modules/@selderee/plugin-htmlparser2": {
"version": "0.11.0", "version": "0.11.0",
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz", "resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",

10
package-lock.json generated
View file

@ -11,6 +11,7 @@
"dependencies": { "dependencies": {
"@langchain/community": "^0.3.0", "@langchain/community": "^0.3.0",
"@langchain/core": "^0.2.33", "@langchain/core": "^0.2.33",
"@mozilla/readability": "^0.5.0",
"html-to-text": "^9.0.5", "html-to-text": "^9.0.5",
"ignore": "^5.3.2", "ignore": "^5.3.2",
"jsdom": "^25.0.0", "jsdom": "^25.0.0",
@ -585,6 +586,15 @@
"@langchain/core": ">=0.2.21 <0.4.0" "@langchain/core": ">=0.2.21 <0.4.0"
} }
}, },
"node_modules/@mozilla/readability": {
"version": "0.5.0",
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
"license": "Apache-2.0",
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/@selderee/plugin-htmlparser2": { "node_modules/@selderee/plugin-htmlparser2": {
"version": "0.11.0", "version": "0.11.0",
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz", "resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",

View file

@ -11,6 +11,7 @@
"dependencies": { "dependencies": {
"@langchain/community": "^0.3.0", "@langchain/community": "^0.3.0",
"@langchain/core": "^0.2.33", "@langchain/core": "^0.2.33",
"@mozilla/readability": "^0.5.0",
"html-to-text": "^9.0.5", "html-to-text": "^9.0.5",
"ignore": "^5.3.2", "ignore": "^5.3.2",
"jsdom": "^25.0.0", "jsdom": "^25.0.0",