import * as uuid from "uuid";
import { Document } from "@langchain/core/documents";
import { MultiVectorRetriever, } from "./multi_vector.js";
/**
 * A type of document retriever that splits input documents into smaller chunks
 * while separately storing and preserving the original documents.
 * The small chunks are embedded, then on retrieval, the original
 * "parent" documents are retrieved.
 *
 * This strikes a balance between better targeted retrieval with small documents
 * and the more context-rich larger documents.
 * @example
 * ```typescript
 * const retriever = new ParentDocumentRetriever({
 *   vectorstore: new MemoryVectorStore(new OpenAIEmbeddings()),
 *   byteStore: new InMemoryStore(),
 *   parentSplitter: new RecursiveCharacterTextSplitter({
 *     chunkOverlap: 0,
 *     chunkSize: 500,
 *   }),
 *   childSplitter: new RecursiveCharacterTextSplitter({
 *     chunkOverlap: 0,
 *     chunkSize: 50,
 *   }),
 *   childK: 20,
 *   parentK: 5,
 * });
 *
 * const parentDocuments = await getDocuments();
 * await retriever.addDocuments(parentDocuments);
 * const retrievedDocs = await retriever.getRelevantDocuments("justice breyer");
 * ```
 */
export class ParentDocumentRetriever extends MultiVectorRetriever {
  static lc_name() {
    return "ParentDocumentRetriever";
  }

  /**
   * @param fields Configuration object. It is forwarded unchanged to the
   * `MultiVectorRetriever` constructor, and the following properties are
   * read from it here:
   * - `vectorstore`: store searched for child chunks and used to index them
   *   when no `childDocumentRetriever` is given.
   * - `childSplitter`: splitter used to cut each parent document into child chunks.
   * - `parentSplitter`: optional splitter applied to the input documents
   *   before they are stored as parents.
   * - `idKey`: metadata key under which the parent id is written on each
   *   child chunk. Defaults to `"doc_id"`.
   * - `childK`: passed as the second argument to `vectorstore.similaritySearch`.
   * - `parentK`: upper bound on the number of parent documents returned.
   * - `childDocumentRetriever`: optional retriever used instead of the
   *   vectorstore for both searching and adding child chunks.
   * - `documentCompressor`: optional compressor applied to retrieved child chunks.
   * - `documentCompressorFilteringFn`: optional function applied to the
   *   compressor output.
   */
  constructor(fields) {
    super(fields);
    // The explicit defineProperty calls (rather than plain assignments) and
    // their order are kept exactly as emitted, so each field is an own,
    // enumerable, writable property declared in the same sequence.
    Object.defineProperty(this, "lc_namespace", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: ["langchain", "retrievers", "parent_document"]
    });
    Object.defineProperty(this, "vectorstore", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    Object.defineProperty(this, "childSplitter", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    Object.defineProperty(this, "parentSplitter", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    Object.defineProperty(this, "idKey", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: "doc_id"
    });
    Object.defineProperty(this, "childK", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    Object.defineProperty(this, "parentK", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    Object.defineProperty(this, "childDocumentRetriever", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    Object.defineProperty(this, "documentCompressor", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    Object.defineProperty(this, "documentCompressorFilteringFn", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0
    });
    this.vectorstore = fields.vectorstore;
    this.childSplitter = fields.childSplitter;
    this.parentSplitter = fields.parentSplitter;
    // Only null/undefined fall back to the "doc_id" default declared above.
    this.idKey = fields.idKey ?? this.idKey;
    this.childK = fields.childK;
    this.parentK = fields.parentK;
    this.childDocumentRetriever = fields.childDocumentRetriever;
    this.documentCompressor = fields.documentCompressor;
    this.documentCompressorFilteringFn = fields.documentCompressorFilteringFn;
  }

  /**
   * Retrieves the parent documents whose child chunks match the query.
   *
   * Child chunks come from `childDocumentRetriever` when one is configured,
   * otherwise from `vectorstore.similaritySearch(query, childK)`. If a
   * `documentCompressor` is configured and at least one chunk was found, the
   * chunks are compressed and then optionally passed through
   * `documentCompressorFilteringFn`. The parent ids are read from
   * `metadata[idKey]` of the remaining chunks, de-duplicated in first-seen
   * order, and looked up in the docstore.
   *
   * @param query The query string to search for.
   * @returns The parent documents found in the docstore, in the order their
   * ids were first seen, truncated to `parentK` entries (all of them when
   * `parentK` is undefined).
   */
  async _getRelevantDocuments(query) {
    let subDocs = [];
    if (this.childDocumentRetriever) {
      subDocs = await this.childDocumentRetriever.getRelevantDocuments(query);
    } else {
      subDocs = await this.vectorstore.similaritySearch(query, this.childK);
    }
    if (this.documentCompressor && subDocs.length) {
      subDocs = await this.documentCompressor.compressDocuments(subDocs, query);
      if (this.documentCompressorFilteringFn) {
        subDocs = this.documentCompressorFilteringFn(subDocs);
      }
    }
    // A Set iterates in insertion order, so first-seen ordering is maintained
    // while avoiding a linear `includes` scan for every child chunk.
    const parentDocIds = new Set();
    for (const doc of subDocs) {
      const parentDocId = doc.metadata[this.idKey];
      // Chunks that carry no parent id cannot be resolved to a parent, so
      // they are skipped instead of sending an undefined key to the docstore.
      if (parentDocId !== undefined && parentDocId !== null) {
        parentDocIds.add(parentDocId);
      }
    }
    // NOTE(review): `this.docstore` is not assigned in this class; presumably
    // it is set up by the MultiVectorRetriever constructor - confirm.
    const storedParentDocs = await this.docstore.mget([...parentDocIds]);
    // Ids with no stored document come back as undefined and are dropped.
    const parentDocs = storedParentDocs.filter((doc) => doc !== undefined);
    return parentDocs.slice(0, this.parentK);
  }

  /**
   * Indexes the child chunks and optionally stores their parent document(s).
   *
   * @param parentDoc Object mapping parent document id to parent document.
   * Its entries are written to the docstore when `addToDocstore` is truthy.
   * @param childDocs Child chunks to add through `childDocumentRetriever`
   * when one is configured, otherwise through `vectorstore`.
   * @param addToDocstore Whether to write `parentDoc` to the docstore.
   */
  async _storeDocuments(parentDoc, childDocs, addToDocstore) {
    if (this.childDocumentRetriever) {
      await this.childDocumentRetriever.addDocuments(childDocs);
    } else {
      await this.vectorstore.addDocuments(childDocs);
    }
    if (addToDocstore) {
      await this.docstore.mset(Object.entries(parentDoc));
    }
  }

  /**
   * Adds documents to the docstore and vectorstores.
   * If a retriever is provided, it will be used to add documents instead of the vectorstore.
   * @param docs The documents to add. When a `parentSplitter` is configured
   * they are split first and the resulting pieces become the parent documents.
   * @param config.ids Optional list of ids for the parent documents. If provided, it
   * must be the same length as the list of parent documents (that is, the
   * list obtained after parent splitting when a `parentSplitter` is configured).
   * Can be provided if parent documents are already in the document store and
   * you don't want to re-add them to the docstore. If not provided, random
   * UUIDs will be used as ids.
   * @param config.addToDocstore Boolean of whether to add documents to docstore.
   * Defaults to `true`. This can be `false` if and only if `ids` are provided.
   * You may want to set this to `false` if the documents are already in the
   * docstore and you don't want to re-add them.
   * @param config.childDocChunkHeaderOptions Object with options for adding
   * contextual chunk headers; passed as the second argument to
   * `childSplitter.splitDocuments`. Defaults to `{}`.
   * @throws {Error} If `ids` is not provided while `addToDocstore` is false.
   * @throws {Error} If the number of ids differs from the number of parent documents.
   */
  async addDocuments(docs, config) {
    const {
      ids,
      addToDocstore = true,
      childDocChunkHeaderOptions = {},
    } = config ?? {};
    const parentDocs = this.parentSplitter
      ? await this.parentSplitter.splitDocuments(docs)
      : docs;
    let parentDocIds;
    if (ids === undefined) {
      // Freshly generated ids are unknown to the caller, so the parents must
      // be persisted here or they could never be looked up again.
      if (!addToDocstore) {
        throw new Error(`If ids are not passed in, "config.addToDocstore" MUST be true`);
      }
      parentDocIds = parentDocs.map((_doc) => uuid.v4());
    } else {
      parentDocIds = ids;
    }
    if (parentDocs.length !== parentDocIds.length) {
      throw new Error(`Got uneven list of documents and ids.\nIf "ids" is provided, should be same length as "documents".`);
    }
    // Parents are processed and stored one at a time, in input order.
    for (let i = 0; i < parentDocs.length; i += 1) {
      const parentDoc = parentDocs[i];
      const parentDocId = parentDocIds[i];
      const subDocs = await this.childSplitter.splitDocuments([parentDoc], childDocChunkHeaderOptions);
      // Tag every child chunk with its parent's id so retrieval can map a
      // matching chunk back to the stored parent document.
      const taggedSubDocs = subDocs.map((subDoc) => new Document({
        pageContent: subDoc.pageContent,
        metadata: { ...subDoc.metadata, [this.idKey]: parentDocId },
      }));
      await this._storeDocuments({ [parentDocId]: parentDoc }, taggedSubDocs, addToDocstore);
    }
  }
}