bup
This commit is contained in:
parent
a6b33dc8d1
commit
71ab0488f5
4 changed files with 23 additions and 4 deletions
|
@ -9,6 +9,7 @@ import { promisify } from 'util';
|
|||
import readline from 'node:readline';
|
||||
|
||||
import ollama from 'ollama'
|
||||
import { Readability } from "@mozilla/readability";
|
||||
|
||||
// Promisify exec for using async/await
|
||||
const execPromise = promisify(exec);
|
||||
|
@ -148,12 +149,10 @@ async function siteCrawler(hostname) {
|
|||
try {
|
||||
const crawled = generateSearchTerm(hostname);
|
||||
|
||||
const compiledConvert = compile({ wordwrap: 130 });
|
||||
|
||||
const loader = new RecursiveUrlLoader(crawled, {
|
||||
extractor: compiledConvert,
|
||||
extractor: Readability(document).parse(),
|
||||
maxDepth: maxDepthCount,
|
||||
excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org"],
|
||||
excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org", "*.css"],
|
||||
});
|
||||
const webContents = await loader.load();
|
||||
webContents.forEach(content => contexts.push(content));
|
||||
|
|
9
node_modules/.package-lock.json
generated
vendored
9
node_modules/.package-lock.json
generated
vendored
|
@ -570,6 +570,15 @@
|
|||
"@langchain/core": ">=0.2.21 <0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@mozilla/readability": {
|
||||
"version": "0.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
|
||||
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@selderee/plugin-htmlparser2": {
|
||||
"version": "0.11.0",
|
||||
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",
|
||||
|
|
10
package-lock.json
generated
10
package-lock.json
generated
|
@ -11,6 +11,7 @@
|
|||
"dependencies": {
|
||||
"@langchain/community": "^0.3.0",
|
||||
"@langchain/core": "^0.2.33",
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"html-to-text": "^9.0.5",
|
||||
"ignore": "^5.3.2",
|
||||
"jsdom": "^25.0.0",
|
||||
|
@ -585,6 +586,15 @@
|
|||
"@langchain/core": ">=0.2.21 <0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@mozilla/readability": {
|
||||
"version": "0.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
|
||||
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@selderee/plugin-htmlparser2": {
|
||||
"version": "0.11.0",
|
||||
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
"dependencies": {
|
||||
"@langchain/community": "^0.3.0",
|
||||
"@langchain/core": "^0.2.33",
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"html-to-text": "^9.0.5",
|
||||
"ignore": "^5.3.2",
|
||||
"jsdom": "^25.0.0",
|
||||
|
|
Loading…
Reference in a new issue