368 lines
14 KiB
JavaScript
368 lines
14 KiB
JavaScript
"use strict";
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.UnknownHandling = exports.UnstructuredDirectoryLoader = exports.UnstructuredLoader = exports.UNSTRUCTURED_API_FILETYPES = void 0;
|
|
const documents_1 = require("@langchain/core/documents");
|
|
const env_1 = require("@langchain/core/utils/env");
|
|
const directory_1 = require("langchain/document_loaders/fs/directory");
|
|
Object.defineProperty(exports, "UnknownHandling", { enumerable: true, get: function () { return directory_1.UnknownHandling; } });
|
|
const base_1 = require("@langchain/core/document_loaders/base");
|
|
exports.UNSTRUCTURED_API_FILETYPES = [
|
|
".txt",
|
|
".text",
|
|
".pdf",
|
|
".docx",
|
|
".doc",
|
|
".jpg",
|
|
".jpeg",
|
|
".eml",
|
|
".html",
|
|
".htm",
|
|
".md",
|
|
".pptx",
|
|
".ppt",
|
|
".msg",
|
|
".rtf",
|
|
".xlsx",
|
|
".xls",
|
|
".odt",
|
|
".epub",
|
|
];
|
|
/**
 * A document loader that uses the Unstructured API to load unstructured
 * documents. It supports both the new syntax with an options object and the
 * legacy syntax for backward compatibility. The load() method sends a
 * partitioning request to the Unstructured API and retrieves the
 * partitioned elements. It creates a Document instance for each element
 * that has non-empty text and returns an array of Document instances.
 *
 * It accepts either a filepath or an object containing a buffer and a
 * filename as input.
 */
class UnstructuredLoader extends base_1.BaseDocumentLoader {
    /**
     * Supported call shapes:
     *  - new syntax:    (filePath, options?)
     *  - memory syntax: ({ buffer, fileName }, options?)
     *  - legacy syntax: (apiUrl, filePath)  -- second argument is a string
     *
     * @param filepathOrBufferOptions Path of the file to partition, an object
     *   carrying `buffer` and `fileName`, or (legacy syntax) the API URL.
     * @param unstructuredOptions Options object, or (legacy syntax) the file
     *   path as a string. Defaults to `{}`.
     */
    constructor(filepathOrBufferOptions, unstructuredOptions = {}) {
        super();
        // Instance fields and their initial values. The table is rebuilt for
        // every instance so the `ocrLanguages` array is never shared, and the
        // key order fixes the enumeration order of the resulting properties.
        const fieldDefaults = {
            filePath: undefined,
            buffer: undefined,
            fileName: undefined,
            apiUrl: "https://api.unstructured.io/general/v0/general",
            apiKey: undefined,
            strategy: "hi_res",
            encoding: undefined,
            ocrLanguages: [],
            coordinates: undefined,
            pdfInferTableStructure: undefined,
            xmlKeepTags: undefined,
            skipInferTableTypes: undefined,
            hiResModelName: undefined,
            includePageBreaks: undefined,
            chunkingStrategy: undefined,
            multiPageSections: undefined,
            combineUnderNChars: undefined,
            newAfterNChars: undefined,
            maxCharacters: undefined,
            extractImageBlockTypes: undefined,
            overlap: undefined,
            overlapAll: undefined,
        };
        for (const [name, value] of Object.entries(fieldDefaults)) {
            // Define (rather than assign) each field as an own, enumerable,
            // configurable and writable data property.
            Object.defineProperty(this, name, {
                enumerable: true,
                configurable: true,
                writable: true,
                value,
            });
        }
        // Temporary shim to avoid breaking existing users
        // Remove when API keys are enforced by Unstructured and existing code will break anyway
        const isLegacySyntax = typeof unstructuredOptions === "string";
        const isMemorySyntax = typeof filepathOrBufferOptions === "object";
        if (isMemorySyntax) {
            this.buffer = filepathOrBufferOptions.buffer;
            this.fileName = filepathOrBufferOptions.fileName;
        }
        else if (isLegacySyntax) {
            // Legacy argument order is (apiUrl, filePath).
            this.filePath = unstructuredOptions;
            this.apiUrl = filepathOrBufferOptions;
        }
        else {
            this.filePath = filepathOrBufferOptions;
        }
        if (!isLegacySyntax) {
            // An explicit `null` bypasses the parameter default; treat it like
            // "no options" instead of failing on the first property access.
            const options = unstructuredOptions ?? {};
            this.apiKey =
                options.apiKey ?? (0, env_1.getEnvironmentVariable)("UNSTRUCTURED_API_KEY");
            this.apiUrl =
                options.apiUrl ??
                    (0, env_1.getEnvironmentVariable)("UNSTRUCTURED_API_URL") ??
                    this.apiUrl;
            this.strategy = options.strategy ?? this.strategy;
            this.encoding = options.encoding;
            this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages;
            this.coordinates = options.coordinates;
            this.pdfInferTableStructure = options.pdfInferTableStructure;
            this.xmlKeepTags = options.xmlKeepTags;
            this.skipInferTableTypes = options.skipInferTableTypes;
            this.hiResModelName = options.hiResModelName;
            this.includePageBreaks = options.includePageBreaks;
            this.chunkingStrategy = options.chunkingStrategy;
            this.multiPageSections = options.multiPageSections;
            this.combineUnderNChars = options.combineUnderNChars;
            this.newAfterNChars = options.newAfterNChars;
            this.maxCharacters = options.maxCharacters;
            this.extractImageBlockTypes = options.extractImageBlockTypes;
            this.overlap = options.overlap;
            this.overlapAll = options.overlapAll ?? false;
        }
    }
    /**
     * Sends the file to the configured API URL as a multipart POST request and
     * returns the response elements whose `text` property is a string.
     *
     * @returns The filtered array of elements from the JSON response.
     * @throws Error if the response status is not OK or the JSON body is not
     *   an array.
     */
    async _partition() {
        let buffer = this.buffer;
        let fileName = this.fileName;
        if (!buffer) {
            const { readFile, basename } = await this.imports();
            // I'm aware this reads the file into memory first, but we have lots of work
            // to do on then consuming Documents in a streaming fashion anyway, so not
            // worried about this for now.
            buffer = await readFile(this.filePath);
            fileName = basename(this.filePath);
        }
        const formData = new FormData();
        formData.append("files", new Blob([buffer]), fileName);
        formData.append("strategy", this.strategy);
        this.ocrLanguages.forEach((language) => {
            formData.append("ocr_languages", language);
        });
        if (this.encoding) {
            formData.append("encoding", this.encoding);
        }
        // Boolean flags below are only sent when explicitly enabled.
        if (this.coordinates === true) {
            formData.append("coordinates", "true");
        }
        if (this.pdfInferTableStructure === true) {
            formData.append("pdf_infer_table_structure", "true");
        }
        if (this.xmlKeepTags === true) {
            formData.append("xml_keep_tags", "true");
        }
        if (this.skipInferTableTypes) {
            formData.append("skip_infer_table_types", JSON.stringify(this.skipInferTableTypes));
        }
        if (this.hiResModelName) {
            formData.append("hi_res_model_name", this.hiResModelName);
        }
        if (this.includePageBreaks) {
            formData.append("include_page_breaks", "true");
        }
        if (this.chunkingStrategy) {
            formData.append("chunking_strategy", this.chunkingStrategy);
        }
        // The remaining settings are sent whenever they were provided, so that
        // values such as `false` and `0` still reach the API.
        if (this.multiPageSections !== undefined) {
            formData.append("multipage_sections", this.multiPageSections ? "true" : "false");
        }
        if (this.combineUnderNChars !== undefined) {
            formData.append("combine_under_n_chars", String(this.combineUnderNChars));
        }
        if (this.newAfterNChars !== undefined) {
            formData.append("new_after_n_chars", String(this.newAfterNChars));
        }
        if (this.maxCharacters !== undefined) {
            formData.append("max_characters", String(this.maxCharacters));
        }
        if (this.extractImageBlockTypes !== undefined) {
            formData.append("extract_image_block_types", JSON.stringify(this.extractImageBlockTypes));
        }
        if (this.overlap !== undefined) {
            formData.append("overlap", String(this.overlap));
        }
        if (this.overlapAll === true) {
            formData.append("overlap_all", "true");
        }
        const headers = {
            "UNSTRUCTURED-API-KEY": this.apiKey ?? "",
        };
        const response = await fetch(this.apiUrl, {
            method: "POST",
            body: formData,
            headers,
        });
        if (!response.ok) {
            // `filePath` is unset when the loader was built from a buffer, so
            // fall back to the supplied file name to keep the message useful.
            const fileLabel = this.filePath ?? fileName;
            throw new Error(`Failed to partition file ${fileLabel} with error ${response.status} and message ${await response.text()}`);
        }
        const elements = await response.json();
        if (!Array.isArray(elements)) {
            // Serialise the payload; plain interpolation of an object would
            // only print "[object Object]".
            throw new Error(`Expected partitioning request to return an array, but got ${JSON.stringify(elements)}`);
        }
        return elements.filter((el) => typeof el.text === "string");
    }
    /**
     * Partitions the file and wraps every element that has non-empty text in
     * a Document. The element's `metadata` is spread into the Document
     * metadata and its `type` is stored under `category`.
     *
     * @returns An array of Document instances.
     */
    async load() {
        const elements = await this._partition();
        const documents = [];
        for (const element of elements) {
            const { metadata, text } = element;
            if (typeof text === "string" && text !== "") {
                documents.push(new documents_1.Document({
                    pageContent: text,
                    metadata: {
                        ...metadata,
                        category: element.type,
                    },
                }));
            }
        }
        return documents;
    }
    /**
     * Lazily imports the Node.js helpers used to read a file from disk. It is
     * only called by _partition() when no buffer was supplied.
     *
     * @returns An object with `readFile` (node:fs/promises) and `basename`
     *   (node:path).
     * @throws Error if either module cannot be imported.
     */
    async imports() {
        try {
            const { readFile } = await import("node:fs/promises");
            const { basename } = await import("node:path");
            return { readFile, basename };
        }
        catch (e) {
            console.error(e);
            // The options argument is ignored by runtimes that do not support
            // `cause`, so attaching it is safe everywhere.
            throw new Error(`Failed to load fs/promises. UnstructuredLoader can only read from a file path in the 'node' environment. It appears you are running environment '${(0, env_1.getEnv)()}'. Pass a { buffer, fileName } object to the constructor instead.`, { cause: e });
        }
    }
}
|
|
exports.UnstructuredLoader = UnstructuredLoader;
|
|
/**
 * A document loader that loads unstructured documents from a directory
 * using the UnstructuredLoader. It registers an UnstructuredLoader factory
 * for each supported file type and passes the resulting mapping to the
 * DirectoryLoader constructor.
 * @example
 * ```typescript
 * const loader = new UnstructuredDirectoryLoader("path/to/directory", {
 *   apiKey: "MY_API_KEY",
 * });
 * const docs = await loader.load();
 * ```
 */
class UnstructuredDirectoryLoader extends directory_1.DirectoryLoader {
    /**
     * Supported call shapes:
     *  - new syntax:    (directoryPath, options?)
     *  - legacy syntax: (apiUrl, directoryPath, recursive?, unknown?)
     *
     * @param directoryPathOrLegacyApiUrl Directory to load, or (legacy
     *   syntax) the API URL.
     * @param optionsOrLegacyDirectoryPath Options object, or (legacy syntax)
     *   the directory path as a string. May be omitted in the new syntax.
     * @param legacyOptionRecursive Legacy syntax only; defaults to `true`.
     * @param legacyOptionUnknown Legacy syntax only; defaults to
     *   `UnknownHandling.Warn`.
     */
    constructor(directoryPathOrLegacyApiUrl, optionsOrLegacyDirectoryPath, legacyOptionRecursive = true, legacyOptionUnknown = directory_1.UnknownHandling.Warn) {
        let directoryPath;
        let options;
        // Temporary shim to avoid breaking existing users
        // Remove when API keys are enforced by Unstructured and existing code will break anyway
        const isLegacySyntax = typeof optionsOrLegacyDirectoryPath === "string";
        if (isLegacySyntax) {
            directoryPath = optionsOrLegacyDirectoryPath;
            options = {
                apiUrl: directoryPathOrLegacyApiUrl,
                recursive: legacyOptionRecursive,
                unknown: legacyOptionUnknown,
            };
        }
        else {
            directoryPath = directoryPathOrLegacyApiUrl;
            // The options argument is optional in the new syntax; without this
            // fallback `options.recursive` below throws a TypeError.
            options = optionsOrLegacyDirectoryPath ?? {};
        }
        // Every supported extension shares the same factory, which forwards
        // the options to a new UnstructuredLoader for the given file path.
        const loader = (p) => new UnstructuredLoader(p, options);
        const loaders = exports.UNSTRUCTURED_API_FILETYPES.reduce((loadersObject, filetype) => {
            // eslint-disable-next-line no-param-reassign
            loadersObject[filetype] = loader;
            return loadersObject;
        }, {});
        super(directoryPath, loaders, options.recursive, options.unknown);
    }
}
|
|
exports.UnstructuredDirectoryLoader = UnstructuredDirectoryLoader;
|