// agsamantha/node_modules/@langchain/community/dist/document_loaders/fs/unstructured.d.ts

/// <reference types="node" resolution-mode="require"/>
/// <reference types="node" resolution-mode="require"/>
/// <reference types="node" resolution-mode="require"/>
import type { basename as BasenameT } from "node:path";
import type { readFile as ReadFileT } from "node:fs/promises";
import { Document } from "@langchain/core/documents";
import { StringWithAutocomplete } from "@langchain/core/utils/types";
import { DirectoryLoader, UnknownHandling } from "langchain/document_loaders/fs/directory";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
/**
 * File types associated with the Unstructured API. The concrete values are
 * defined in the implementation and are not visible in this declaration
 * file. Presumably these are the file types for which the directory loader
 * creates an UnstructuredLoader — TODO confirm against the implementation.
 */
export declare const UNSTRUCTURED_API_FILETYPES: string[];
/**
 * One element returned by the Unstructured API: the element's type, its
 * text content, and a string-keyed bag of metadata values whose individual
 * types are not known statically.
 */
type Element = {
    type: string;
    text: string;
    metadata: Record<string, unknown>;
};
/**
 * Named strategies available to the UnstructuredLoader:
 * "hi_res", "fast", "ocr_only", or "auto".
 */
export type UnstructuredLoaderStrategy =
    | "hi_res"
    | "fast"
    | "ocr_only"
    | "auto";
/**
 * Names of the hi-res models available to the UnstructuredLoader.
 * The only named model is "chipper".
 */
export type HiResModelName = "chipper";
/**
 * File types that can be listed in the `skipInferTableTypes` option of
 * UnstructuredLoaderOptions.
 *
 * That option is an array of file types for which table extraction is
 * skipped. For example, to skip table extraction for .docx and .doc files,
 * set `skipInferTableTypes` to ["docx", "doc"].
 *
 * An empty array names no file types, so table extraction is skipped for
 * none of them — NOTE(review): confirm against the Unstructured API
 * documentation for `skip_infer_table_types`.
 */
export type SkipInferTableTypes =
    | "txt"
    | "text"
    | "pdf"
    | "docx"
    | "doc"
    | "jpg"
    | "jpeg"
    | "eml"
    | "html"
    | "htm"
    | "md"
    | "pptx"
    | "ppt"
    | "msg"
    | "rtf"
    | "xlsx"
    | "xls"
    | "odt"
    | "epub";
/**
 * Named values for the chunking strategy, which controls whether text is
 * chunked into larger or smaller elements. "None" is the default;
 * "by_title" is the optional alternative.
 */
export type ChunkingStrategy =
    | "None"
    | "by_title";
/**
 * Options object accepted as the second argument of the UnstructuredLoader
 * constructor. Every field is optional. The field names match the private
 * fields declared on UnstructuredLoader; how each one is sent to the
 * Unstructured API is defined in the implementation, not in this
 * declaration file.
 */
export type UnstructuredLoaderOptions = {
    apiKey?: string;
    apiUrl?: string;
    /** See UnstructuredLoaderStrategy for the named values. */
    strategy?: StringWithAutocomplete<UnstructuredLoaderStrategy>;
    encoding?: string;
    ocrLanguages?: string[];
    coordinates?: boolean;
    pdfInferTableStructure?: boolean;
    xmlKeepTags?: boolean;
    /** File types for which table extraction is skipped; see SkipInferTableTypes. */
    skipInferTableTypes?: StringWithAutocomplete<SkipInferTableTypes>[];
    /** See HiResModelName for the named values. */
    hiResModelName?: StringWithAutocomplete<HiResModelName>;
    includePageBreaks?: boolean;
    /** See ChunkingStrategy for the named values. */
    chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>;
    multiPageSections?: boolean;
    combineUnderNChars?: number;
    newAfterNChars?: number;
    maxCharacters?: number;
    extractImageBlockTypes?: string[];
    overlap?: number;
    overlapAll?: boolean;
};
/**
 * Options accepted by the UnstructuredDirectoryLoader constructor: every
 * field of UnstructuredLoaderOptions plus two directory-level settings.
 */
export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
    // NOTE(review): presumably forwarded to DirectoryLoader to control
    // descent into sub-directories — confirm in the implementation.
    recursive?: boolean;
    // Uses DirectoryLoader's UnknownHandling type; presumably selects what
    // happens for files of an unsupported type — confirm in the implementation.
    unknown?: UnknownHandling;
};
/**
 * In-memory input for UnstructuredLoader, passed as the first constructor
 * argument in place of a file path: the document's file name together with
 * its contents as a Buffer.
 */
export type UnstructuredMemoryLoaderOptions = {
    /** File name associated with the buffered document. */
    fileName: string;
    /** The document contents. */
    buffer: Buffer;
};
/**
 * A document loader that uses the Unstructured API to load unstructured
 * documents. It supports both the new syntax with an options object and the
 * legacy syntax for backward compatibility. The load() method sends a
 * partitioning request to the Unstructured API and retrieves the
 * partitioned elements. It creates a Document instance for each element
 * and returns an array of Document instances.
 *
 * It accepts either a file path or an object containing a buffer and a
 * file name as input.
 */
export declare class UnstructuredLoader extends BaseDocumentLoader {
    // NOTE(review): the value of filePath when the loader is constructed
    // from an in-memory buffer is not visible in this declaration file.
    filePath: string;
    // Private state. Declaration output omits the types of private members.
    // `buffer` and `fileName` share their names with the fields of
    // UnstructuredMemoryLoaderOptions; every remaining name matches a field
    // of UnstructuredLoaderOptions. `apiUrl`, `strategy` and `ocrLanguages`
    // are declared non-optional, so presumably the constructor gives them
    // defaults — TODO confirm in the implementation.
    private buffer?;
    private fileName?;
    private apiUrl;
    private apiKey?;
    private strategy;
    private encoding?;
    private ocrLanguages;
    private coordinates?;
    private pdfInferTableStructure?;
    private xmlKeepTags?;
    private skipInferTableTypes?;
    private hiResModelName?;
    private includePageBreaks?;
    private chunkingStrategy?;
    private multiPageSections?;
    private combineUnderNChars?;
    private newAfterNChars?;
    private maxCharacters?;
    private extractImageBlockTypes?;
    private overlap?;
    private overlapAll?;
    /**
     * @param filepathOrBufferOptions Either a file path, or an object
     *   holding an in-memory buffer and its file name.
     * @param unstructuredOptions Loader options. A plain string is also
     *   accepted; presumably this is the legacy call form mentioned in the
     *   class description — TODO confirm what the string represents.
     */
    constructor(filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions, unstructuredOptions?: UnstructuredLoaderOptions | string);
    /**
     * Resolves to the elements for the input document. Presumably this is
     * where the partitioning request to the Unstructured API is made —
     * confirm in the implementation.
     */
    _partition(): Promise<Element[]>;
    /**
     * Resolves to an array of Document instances; per the class
     * description, one Document is created for each returned element.
     */
    load(): Promise<Document[]>;
    /**
     * Resolves to an object exposing `readFile` (typed as the function from
     * node:fs/promises) and `basename` (typed as the function from
     * node:path).
     */
    imports(): Promise<{
        readFile: typeof ReadFileT;
        basename: typeof BasenameT;
    }>;
}
/**
 * A document loader that loads unstructured documents from a directory
 * using the UnstructuredLoader. It creates an UnstructuredLoader instance
 * for each supported file type and passes it to the DirectoryLoader
 * constructor.
 * @example
 * ```typescript
 * const loader = new UnstructuredDirectoryLoader("path/to/directory", {
 *   apiKey: "MY_API_KEY",
 * });
 * const docs = await loader.load();
 * ```
 */
export declare class UnstructuredDirectoryLoader extends DirectoryLoader {
    /**
     * The parameter names indicate two call shapes; the exact dispatch
     * between them lives in the implementation (TODO confirm):
     *
     * - current: `(directoryPath, options)`, as in the class example;
     * - legacy: `(apiUrl, directoryPath, recursive?, unknown?)`.
     *
     * @param directoryPathOrLegacyApiUrl Directory path, or the API URL in
     *   the legacy form.
     * @param optionsOrLegacyDirectoryPath Options object, or the directory
     *   path in the legacy form.
     * @param legacyOptionRecursive Legacy-form counterpart of the
     *   `recursive` option.
     * @param legacyOptionUnknown Legacy-form counterpart of the `unknown`
     *   option.
     */
    constructor(directoryPathOrLegacyApiUrl: string, optionsOrLegacyDirectoryPath: UnstructuredDirectoryLoaderOptions | string, legacyOptionRecursive?: boolean, legacyOptionUnknown?: UnknownHandling);
}
export { UnknownHandling };