132 lines
5.1 KiB
JavaScript
132 lines
5.1 KiB
JavaScript
|
import { getEnv } from "@langchain/core/utils/env";
|
||
|
import { BaseDocumentLoader } from "../base.js";
|
||
|
/**
 * Policies for files whose extension has no entry in a `DirectoryLoader`'s
 * `loaders` mapping.
 *
 * Declared as a plain object instead of a TypeScript enum because enums are
 * not tree-shakeable. See https://bargsten.org/jsts/enums/
 */
export const UnknownHandling = {
  /** Skip the file without any output. */
  Ignore: "ignore",
  /** Report the file via `console.warn` and continue. */
  Warn: "warn",
  /** Abort loading by throwing an `Error`. */
  Error: "error",
};
|
||
|
/**
 * A document loader that loads documents from a directory. It extends the
 * `BaseDocumentLoader` class and implements the `load()` method.
 * @example
 * ```typescript
 *
 * const directoryLoader = new DirectoryLoader(
 *   "src/document_loaders/example_data/",
 *   {
 *     ".pdf": (path: string) => new PDFLoader(path),
 *   },
 * );
 *
 * const docs = await directoryLoader.load();
 * console.log({ docs });
 *
 * ```
 */
export class DirectoryLoader extends BaseDocumentLoader {
  /**
   * @param directoryPath Directory whose entries are read by `load()`.
   * @param loaders Mapping from file extension (including the leading dot,
   * e.g. `".pdf"`) to a factory that receives the resolved file path and
   * returns an object with a `load()` method.
   * @param recursive Whether subdirectories are loaded as well. Defaults to
   * `true`.
   * @param unknown What to do with files that have no matching loader; one
   * of the `UnknownHandling` values. Defaults to `UnknownHandling.Warn`.
   * @throws Error if `loaders` has no keys, or if any key does not start
   * with a dot.
   */
  constructor(directoryPath, loaders, recursive = true, unknown = UnknownHandling.Warn) {
    super();
    // Define each field as an own, enumerable, configurable, writable data
    // property, in declaration order.
    const fields = { directoryPath, loaders, recursive, unknown };
    for (const [name, value] of Object.entries(fields)) {
      Object.defineProperty(this, name, {
        enumerable: true,
        configurable: true,
        writable: true,
        value,
      });
    }
    // Only own enumerable keys count as registered extensions.
    const extensions = Object.keys(loaders);
    if (extensions.length === 0) {
      throw new Error("Must provide at least one loader");
    }
    for (const extension of extensions) {
      if (extension[0] !== ".") {
        throw new Error(`Extension must start with a dot: ${extension}`);
      }
    }
  }

  /**
   * Loads the documents from the directory. If an entry is a directory and
   * `recursive` is `true`, it recursively loads documents from the
   * subdirectory. Otherwise it checks whether there is a loader factory for
   * the entry's extension in the `loaders` mapping. If there is, it loads the
   * documents. If there is none and `unknown` is set to `Warn`, it logs a
   * warning message. If `unknown` is set to `Error`, it throws an error.
   * @returns A promise that resolves to an array of loaded documents.
   * @throws Error if an entry has no matching loader and `unknown` is
   * `UnknownHandling.Error`, or if `unknown` is not a recognised value.
   */
  async load() {
    const { readdir, extname, resolve } = await DirectoryLoader.imports();
    const files = await readdir(this.directoryPath, { withFileTypes: true });
    const documents = [];
    // Append element by element. `documents.push(...docs)` passes every
    // document as a separate call argument and throws a RangeError once a
    // single loader returns more items than the engine's argument limit.
    const append = (docs) => {
      for (const doc of docs) {
        documents.push(doc);
      }
    };
    for (const file of files) {
      const fullPath = resolve(this.directoryPath, file.name);
      if (file.isDirectory()) {
        if (this.recursive) {
          const loader = new DirectoryLoader(fullPath, this.loaders, this.recursive, this.unknown);
          append(await loader.load());
        }
        continue;
      }
      // Some entries are neither directories nor regular files; they fall
      // through to the "unknown" handling below.
      const loaderFactory = this.loaders[extname(file.name)];
      if (loaderFactory) {
        const loader = loaderFactory(fullPath);
        append(await loader.load());
        continue;
      }
      switch (this.unknown) {
        case UnknownHandling.Ignore:
          break;
        case UnknownHandling.Warn:
          console.warn(`Unknown file type: ${file.name}`);
          break;
        case UnknownHandling.Error:
          throw new Error(`Unknown file type: ${file.name}`);
        default:
          throw new Error(`Unknown unknown handling: ${this.unknown}`);
      }
    }
    return documents;
  }

  /**
   * Imports the necessary functions from the `node:path` and
   * `node:fs/promises` modules. It is used to dynamically import the
   * functions when needed. If the import fails, it logs the underlying error
   * and throws an error indicating that the modules failed to load.
   * @returns A promise that resolves to an object containing `readdir`,
   * `extname` and `resolve`.
   * @throws Error if either module cannot be imported.
   */
  static async imports() {
    try {
      const { extname, resolve } = await import("node:path");
      const { readdir } = await import("node:fs/promises");
      return { readdir, extname, resolve };
    } catch (e) {
      // Log the original failure before replacing it with a friendlier error.
      console.error(e);
      throw new Error(`Failed to load fs/promises. DirectoryLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`);
    }
  }
}
|