"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.UnknownHandling = exports.UnstructuredDirectoryLoader = exports.UnstructuredLoader = exports.UNSTRUCTURED_API_FILETYPES = void 0; const documents_1 = require("@langchain/core/documents"); const env_1 = require("@langchain/core/utils/env"); const directory_1 = require("langchain/document_loaders/fs/directory"); Object.defineProperty(exports, "UnknownHandling", { enumerable: true, get: function () { return directory_1.UnknownHandling; } }); const base_1 = require("@langchain/core/document_loaders/base"); exports.UNSTRUCTURED_API_FILETYPES = [ ".txt", ".text", ".pdf", ".docx", ".doc", ".jpg", ".jpeg", ".eml", ".html", ".htm", ".md", ".pptx", ".ppt", ".msg", ".rtf", ".xlsx", ".xls", ".odt", ".epub", ]; /** * A document loader that uses the Unstructured API to load unstructured * documents. It supports both the new syntax with options object and the * legacy syntax for backward compatibility. The load() method sends a * partitioning request to the Unstructured API and retrieves the * partitioned elements. It creates a Document instance for each element * and returns an array of Document instances. * * It accepts either a filepath or an object containing a buffer and a filename * as input. */ class UnstructuredLoader extends base_1.BaseDocumentLoader { constructor(filepathOrBufferOptions, unstructuredOptions = {}) { super(); Object.defineProperty(this, "filePath", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "buffer", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "fileName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "apiUrl", { enumerable: true, configurable: true, writable: true, value: "https://api.unstructured.io/general/v0/general" }); Object.defineProperty(this, "apiKey", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "strategy", { enumerable: true, configurable: true, writable: true, value: "hi_res" }); Object.defineProperty(this, "encoding", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "ocrLanguages", { enumerable: true, configurable: true, writable: true, value: [] }); Object.defineProperty(this, "coordinates", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "pdfInferTableStructure", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "xmlKeepTags", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "skipInferTableTypes", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "hiResModelName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "includePageBreaks", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "chunkingStrategy", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "multiPageSections", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "combineUnderNChars", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "newAfterNChars", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "maxCharacters", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "extractImageBlockTypes", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "overlap", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "overlapAll", { enumerable: true, configurable: true, writable: true, value: void 0 }); // Temporary shim to avoid breaking existing users // Remove when API keys are enforced by Unstructured and existing code will break anyway const isLegacySyntax = typeof unstructuredOptions === "string"; const isMemorySyntax = typeof filepathOrBufferOptions === "object"; if (isMemorySyntax) { this.buffer = filepathOrBufferOptions.buffer; this.fileName = filepathOrBufferOptions.fileName; } else if (isLegacySyntax) { this.filePath = unstructuredOptions; this.apiUrl = filepathOrBufferOptions; } else { this.filePath = filepathOrBufferOptions; } if (!isLegacySyntax) { const options = unstructuredOptions; this.apiKey = options.apiKey ?? (0, env_1.getEnvironmentVariable)("UNSTRUCTURED_API_KEY"); this.apiUrl = options.apiUrl ?? (0, env_1.getEnvironmentVariable)("UNSTRUCTURED_API_URL") ?? this.apiUrl; this.strategy = options.strategy ?? this.strategy; this.encoding = options.encoding; this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages; this.coordinates = options.coordinates; this.pdfInferTableStructure = options.pdfInferTableStructure; this.xmlKeepTags = options.xmlKeepTags; this.skipInferTableTypes = options.skipInferTableTypes; this.hiResModelName = options.hiResModelName; this.includePageBreaks = options.includePageBreaks; this.chunkingStrategy = options.chunkingStrategy; this.multiPageSections = options.multiPageSections; this.combineUnderNChars = options.combineUnderNChars; this.newAfterNChars = options.newAfterNChars; this.maxCharacters = options.maxCharacters; this.extractImageBlockTypes = options.extractImageBlockTypes; this.overlap = options.overlap; this.overlapAll = options.overlapAll ?? false; } } async _partition() { let buffer = this.buffer; let fileName = this.fileName; if (!buffer) { const { readFile, basename } = await this.imports(); buffer = await readFile(this.filePath); fileName = basename(this.filePath); // I'm aware this reads the file into memory first, but we have lots of work // to do on then consuming Documents in a streaming fashion anyway, so not // worried about this for now. } const formData = new FormData(); formData.append("files", new Blob([buffer]), fileName); formData.append("strategy", this.strategy); this.ocrLanguages.forEach((language) => { formData.append("ocr_languages", language); }); if (this.encoding) { formData.append("encoding", this.encoding); } if (this.coordinates === true) { formData.append("coordinates", "true"); } if (this.pdfInferTableStructure === true) { formData.append("pdf_infer_table_structure", "true"); } if (this.xmlKeepTags === true) { formData.append("xml_keep_tags", "true"); } if (this.skipInferTableTypes) { formData.append("skip_infer_table_types", JSON.stringify(this.skipInferTableTypes)); } if (this.hiResModelName) { formData.append("hi_res_model_name", this.hiResModelName); } if (this.includePageBreaks) { formData.append("include_page_breaks", "true"); } if (this.chunkingStrategy) { formData.append("chunking_strategy", this.chunkingStrategy); } if (this.multiPageSections !== undefined) { formData.append("multipage_sections", this.multiPageSections ? "true" : "false"); } if (this.combineUnderNChars !== undefined) { formData.append("combine_under_n_chars", String(this.combineUnderNChars)); } if (this.newAfterNChars !== undefined) { formData.append("new_after_n_chars", String(this.newAfterNChars)); } if (this.maxCharacters !== undefined) { formData.append("max_characters", String(this.maxCharacters)); } if (this.extractImageBlockTypes !== undefined) { formData.append("extract_image_block_types", JSON.stringify(this.extractImageBlockTypes)); } if (this.overlap !== undefined) { formData.append("overlap", String(this.overlap)); } if (this.overlapAll === true) { formData.append("overlap_all", "true"); } const headers = { "UNSTRUCTURED-API-KEY": this.apiKey ?? "", }; const response = await fetch(this.apiUrl, { method: "POST", body: formData, headers, }); if (!response.ok) { throw new Error(`Failed to partition file ${this.filePath} with error ${response.status} and message ${await response.text()}`); } const elements = await response.json(); if (!Array.isArray(elements)) { throw new Error(`Expected partitioning request to return an array, but got ${elements}`); } return elements.filter((el) => typeof el.text === "string"); } async load() { const elements = await this._partition(); const documents = []; for (const element of elements) { const { metadata, text } = element; if (typeof text === "string" && text !== "") { documents.push(new documents_1.Document({ pageContent: text, metadata: { ...metadata, category: element.type, }, })); } } return documents; } async imports() { try { const { readFile } = await import("node:fs/promises"); const { basename } = await import("node:path"); return { readFile, basename }; } catch (e) { console.error(e); throw new Error(`Failed to load fs/promises. TextLoader available only on environment 'node'. It appears you are running environment '${(0, env_1.getEnv)()}'. See https:// for alternatives.`); } } } exports.UnstructuredLoader = UnstructuredLoader; /** * A document loader that loads unstructured documents from a directory * using the UnstructuredLoader. It creates a UnstructuredLoader instance * for each supported file type and passes it to the DirectoryLoader * constructor. * @example * ```typescript * const loader = new UnstructuredDirectoryLoader("path/to/directory", { * apiKey: "MY_API_KEY", * }); * const docs = await loader.load(); * ``` */ class UnstructuredDirectoryLoader extends directory_1.DirectoryLoader { constructor(directoryPathOrLegacyApiUrl, optionsOrLegacyDirectoryPath, legacyOptionRecursive = true, legacyOptionUnknown = directory_1.UnknownHandling.Warn) { let directoryPath; let options; // Temporary shim to avoid breaking existing users // Remove when API keys are enforced by Unstructured and existing code will break anyway const isLegacySyntax = typeof optionsOrLegacyDirectoryPath === "string"; if (isLegacySyntax) { directoryPath = optionsOrLegacyDirectoryPath; options = { apiUrl: directoryPathOrLegacyApiUrl, recursive: legacyOptionRecursive, unknown: legacyOptionUnknown, }; } else { directoryPath = directoryPathOrLegacyApiUrl; options = optionsOrLegacyDirectoryPath; } const loader = (p) => new UnstructuredLoader(p, options); const loaders = exports.UNSTRUCTURED_API_FILETYPES.reduce((loadersObject, filetype) => { // eslint-disable-next-line no-param-reassign loadersObject[filetype] = loader; return loadersObject; }, {}); super(directoryPath, loaders, options.recursive, options.unknown); } } exports.UnstructuredDirectoryLoader = UnstructuredDirectoryLoader;