agsamantha/node_modules/@langchain/community/dist/document_loaders/fs/unstructured.d.ts
2024-10-02 15:15:21 -05:00

133 lines
5.2 KiB
TypeScript

/// <reference types="node" resolution-mode="require"/>
/// <reference types="node" resolution-mode="require"/>
/// <reference types="node" resolution-mode="require"/>
import type { basename as BasenameT } from "node:path";
import type { readFile as ReadFileT } from "node:fs/promises";
import { Document } from "@langchain/core/documents";
import { StringWithAutocomplete } from "@langchain/core/utils/types";
import { DirectoryLoader, UnknownHandling } from "langchain/document_loaders/fs/directory";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
export declare const UNSTRUCTURED_API_FILETYPES: string[];
/**
* Represents an element returned by the Unstructured API. It has
* properties for the element type, text content, and metadata.
*/
type Element = {
type: string;
text: string;
metadata: {
[key: string]: unknown;
};
};
/**
* Represents the available strategies for the UnstructuredLoader. It can
* be one of "hi_res", "fast", "ocr_only", or "auto".
*/
export type UnstructuredLoaderStrategy = "hi_res" | "fast" | "ocr_only" | "auto";
/**
* Represents the available hi-res models for the UnstructuredLoader. It can
* be one of "chipper".
*/
export type HiResModelName = "chipper";
/**
* To enable or disable table extraction for file types other than PDF, set
* the skipInferTableTypes property in the UnstructuredLoaderOptions object.
* The skipInferTableTypes property is an array of file types for which table
* extraction is disabled. For example, to disable table extraction for .docx
* and .doc files, set the skipInferTableTypes property to ["docx", "doc"].
* You can also disable table extraction for all file types other than PDF by
* setting the skipInferTableTypes property to [].
*/
export type SkipInferTableTypes = "txt" | "text" | "pdf" | "docx" | "doc" | "jpg" | "jpeg" | "eml" | "html" | "htm" | "md" | "pptx" | "ppt" | "msg" | "rtf" | "xlsx" | "xls" | "odt" | "epub";
/**
* Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title
*/
export type ChunkingStrategy = "None" | "by_title";
export type UnstructuredLoaderOptions = {
apiKey?: string;
apiUrl?: string;
strategy?: StringWithAutocomplete<UnstructuredLoaderStrategy>;
encoding?: string;
ocrLanguages?: Array<string>;
coordinates?: boolean;
pdfInferTableStructure?: boolean;
xmlKeepTags?: boolean;
skipInferTableTypes?: Array<StringWithAutocomplete<SkipInferTableTypes>>;
hiResModelName?: StringWithAutocomplete<HiResModelName>;
includePageBreaks?: boolean;
chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>;
multiPageSections?: boolean;
combineUnderNChars?: number;
newAfterNChars?: number;
maxCharacters?: number;
extractImageBlockTypes?: string[];
overlap?: number;
overlapAll?: boolean;
};
export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
recursive?: boolean;
unknown?: UnknownHandling;
};
export type UnstructuredMemoryLoaderOptions = {
buffer: Buffer;
fileName: string;
};
/**
* A document loader that uses the Unstructured API to load unstructured
* documents. It supports both the new syntax with options object and the
* legacy syntax for backward compatibility. The load() method sends a
* partitioning request to the Unstructured API and retrieves the
* partitioned elements. It creates a Document instance for each element
* and returns an array of Document instances.
*
* It accepts either a filepath or an object containing a buffer and a filename
* as input.
*/
export declare class UnstructuredLoader extends BaseDocumentLoader {
filePath: string;
private buffer?;
private fileName?;
private apiUrl;
private apiKey?;
private strategy;
private encoding?;
private ocrLanguages;
private coordinates?;
private pdfInferTableStructure?;
private xmlKeepTags?;
private skipInferTableTypes?;
private hiResModelName?;
private includePageBreaks?;
private chunkingStrategy?;
private multiPageSections?;
private combineUnderNChars?;
private newAfterNChars?;
private maxCharacters?;
private extractImageBlockTypes?;
private overlap?;
private overlapAll?;
constructor(filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions, unstructuredOptions?: UnstructuredLoaderOptions | string);
_partition(): Promise<Element[]>;
load(): Promise<Document[]>;
imports(): Promise<{
readFile: typeof ReadFileT;
basename: typeof BasenameT;
}>;
}
/**
* A document loader that loads unstructured documents from a directory
* using the UnstructuredLoader. It creates a UnstructuredLoader instance
* for each supported file type and passes it to the DirectoryLoader
* constructor.
* @example
* ```typescript
* const loader = new UnstructuredDirectoryLoader("path/to/directory", {
* apiKey: "MY_API_KEY",
* });
* const docs = await loader.load();
* ```
*/
export declare class UnstructuredDirectoryLoader extends DirectoryLoader {
constructor(directoryPathOrLegacyApiUrl: string, optionsOrLegacyDirectoryPath: UnstructuredDirectoryLoaderOptions | string, legacyOptionRecursive?: boolean, legacyOptionUnknown?: UnknownHandling);
}
export { UnknownHandling };