bup
This commit is contained in:
parent
a6b33dc8d1
commit
71ab0488f5
4 changed files with 23 additions and 4 deletions
|
@ -9,6 +9,7 @@ import { promisify } from 'util';
|
||||||
import readline from 'node:readline';
|
import readline from 'node:readline';
|
||||||
|
|
||||||
import ollama from 'ollama'
|
import ollama from 'ollama'
|
||||||
|
import { Readability } from "@mozilla/readability";
|
||||||
|
|
||||||
// Promisify exec for using async/await
|
// Promisify exec for using async/await
|
||||||
const execPromise = promisify(exec);
|
const execPromise = promisify(exec);
|
||||||
|
@ -148,12 +149,10 @@ async function siteCrawler(hostname) {
|
||||||
try {
|
try {
|
||||||
const crawled = generateSearchTerm(hostname);
|
const crawled = generateSearchTerm(hostname);
|
||||||
|
|
||||||
const compiledConvert = compile({ wordwrap: 130 });
|
|
||||||
|
|
||||||
const loader = new RecursiveUrlLoader(crawled, {
|
const loader = new RecursiveUrlLoader(crawled, {
|
||||||
extractor: compiledConvert,
|
extractor: Readability(document).parse(),
|
||||||
maxDepth: maxDepthCount,
|
maxDepth: maxDepthCount,
|
||||||
excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org"],
|
excludeDirs: ["https://doubleclick.net", "https://paypal.com", "https://archive.org", "*.css"],
|
||||||
});
|
});
|
||||||
const webContents = await loader.load();
|
const webContents = await loader.load();
|
||||||
webContents.forEach(content => contexts.push(content));
|
webContents.forEach(content => contexts.push(content));
|
||||||
|
|
9
node_modules/.package-lock.json
generated
vendored
9
node_modules/.package-lock.json
generated
vendored
|
@ -570,6 +570,15 @@
|
||||||
"@langchain/core": ">=0.2.21 <0.4.0"
|
"@langchain/core": ">=0.2.21 <0.4.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@mozilla/readability": {
|
||||||
|
"version": "0.5.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
|
||||||
|
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@selderee/plugin-htmlparser2": {
|
"node_modules/@selderee/plugin-htmlparser2": {
|
||||||
"version": "0.11.0",
|
"version": "0.11.0",
|
||||||
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",
|
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",
|
||||||
|
|
10
package-lock.json
generated
10
package-lock.json
generated
|
@ -11,6 +11,7 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@langchain/community": "^0.3.0",
|
"@langchain/community": "^0.3.0",
|
||||||
"@langchain/core": "^0.2.33",
|
"@langchain/core": "^0.2.33",
|
||||||
|
"@mozilla/readability": "^0.5.0",
|
||||||
"html-to-text": "^9.0.5",
|
"html-to-text": "^9.0.5",
|
||||||
"ignore": "^5.3.2",
|
"ignore": "^5.3.2",
|
||||||
"jsdom": "^25.0.0",
|
"jsdom": "^25.0.0",
|
||||||
|
@ -585,6 +586,15 @@
|
||||||
"@langchain/core": ">=0.2.21 <0.4.0"
|
"@langchain/core": ">=0.2.21 <0.4.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@mozilla/readability": {
|
||||||
|
"version": "0.5.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
|
||||||
|
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@selderee/plugin-htmlparser2": {
|
"node_modules/@selderee/plugin-htmlparser2": {
|
||||||
"version": "0.11.0",
|
"version": "0.11.0",
|
||||||
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",
|
"resolved": "https://registry.npmjs.org/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz",
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@langchain/community": "^0.3.0",
|
"@langchain/community": "^0.3.0",
|
||||||
"@langchain/core": "^0.2.33",
|
"@langchain/core": "^0.2.33",
|
||||||
|
"@mozilla/readability": "^0.5.0",
|
||||||
"html-to-text": "^9.0.5",
|
"html-to-text": "^9.0.5",
|
||||||
"ignore": "^5.3.2",
|
"ignore": "^5.3.2",
|
||||||
"jsdom": "^25.0.0",
|
"jsdom": "^25.0.0",
|
||||||
|
|
Loading…
Reference in a new issue