agsamantha/node_modules/langchain/dist/retrievers/parent_document.js
2024-10-02 15:15:21 -05:00

191 lines
7.4 KiB
JavaScript

import * as uuid from "uuid";
import { Document } from "@langchain/core/documents";
import { MultiVectorRetriever, } from "./multi_vector.js";
/**
* A type of document retriever that splits input documents into smaller chunks
* while separately storing and preserving the original documents.
* The small chunks are embedded, then on retrieval, the original
* "parent" documents are retrieved.
*
* This strikes a balance between better targeted retrieval with small documents
* and the more context-rich larger documents.
* @example
* ```typescript
* const retriever = new ParentDocumentRetriever({
* vectorstore: new MemoryVectorStore(new OpenAIEmbeddings()),
* byteStore: new InMemoryStore<Uint8Array>(),
* parentSplitter: new RecursiveCharacterTextSplitter({
* chunkOverlap: 0,
* chunkSize: 500,
* }),
* childSplitter: new RecursiveCharacterTextSplitter({
* chunkOverlap: 0,
* chunkSize: 50,
* }),
* childK: 20,
* parentK: 5,
* });
*
* const parentDocuments = await getDocuments();
* await retriever.addDocuments(parentDocuments);
* const retrievedDocs = await retriever.getRelevantDocuments("justice breyer");
* ```
*/
export class ParentDocumentRetriever extends MultiVectorRetriever {
    /**
     * @returns The fixed identifier string "ParentDocumentRetriever".
     */
    static lc_name() {
        return "ParentDocumentRetriever";
    }
    /**
     * @param fields Options object. It is passed unchanged to the
     * MultiVectorRetriever constructor, and the following keys are read here:
     * `vectorstore`, `childSplitter`, `parentSplitter`, `idKey` (falls back to
     * "doc_id" when null/undefined), `childK`, `parentK`,
     * `childDocumentRetriever`, `documentCompressor` and
     * `documentCompressorFilteringFn`.
     */
    constructor(fields) {
        super(fields);
        // Every field is (re)declared with the same descriptor as an own,
        // enumerable, writable property before it receives its value from `fields`.
        const defineField = (name, value) => {
            Object.defineProperty(this, name, {
                enumerable: true,
                configurable: true,
                writable: true,
                value,
            });
        };
        defineField("lc_namespace", ["langchain", "retrievers", "parent_document"]);
        defineField("vectorstore", undefined);
        defineField("childSplitter", undefined);
        defineField("parentSplitter", undefined);
        defineField("idKey", "doc_id");
        defineField("childK", undefined);
        defineField("parentK", undefined);
        defineField("childDocumentRetriever", undefined);
        defineField("documentCompressor", undefined);
        defineField("documentCompressorFilteringFn", undefined);
        this.vectorstore = fields.vectorstore;
        this.childSplitter = fields.childSplitter;
        this.parentSplitter = fields.parentSplitter;
        this.idKey = fields.idKey ?? this.idKey;
        this.childK = fields.childK;
        this.parentK = fields.parentK;
        this.childDocumentRetriever = fields.childDocumentRetriever;
        this.documentCompressor = fields.documentCompressor;
        this.documentCompressorFilteringFn = fields.documentCompressorFilteringFn;
    }
    /**
     * Finds child chunks matching `query`, then returns the parent documents
     * those chunks point to.
     *
     * Child chunks come from `childDocumentRetriever` when one is set, otherwise
     * from `vectorstore.similaritySearch(query, childK)`. When a
     * `documentCompressor` is set and there is at least one chunk, the chunks are
     * passed through it and then through `documentCompressorFilteringFn` (if set).
     *
     * @param query The search query.
     * @returns Parent documents in order of first appearance among the child
     * chunks, limited to the first `parentK` entries (all of them when `parentK`
     * is undefined). Ids with no stored parent document are skipped.
     */
    async _getRelevantDocuments(query) {
        let subDocs = this.childDocumentRetriever
            ? await this.childDocumentRetriever.getRelevantDocuments(query)
            : await this.vectorstore.similaritySearch(query, this.childK);
        if (this.documentCompressor && subDocs.length) {
            subDocs = await this.documentCompressor.compressDocuments(subDocs, query);
            if (this.documentCompressorFilteringFn) {
                subDocs = this.documentCompressorFilteringFn(subDocs);
            }
        }
        // A Set iterates in insertion order, so this keeps the first-seen order
        // of parent ids while deduplicating without rescanning a list per chunk.
        const parentDocIds = new Set();
        for (const doc of subDocs) {
            const parentDocId = doc.metadata[this.idKey];
            // A chunk that carries no parent id cannot be resolved; never hand
            // an undefined/null key to the docstore.
            if (parentDocId !== undefined && parentDocId !== null) {
                parentDocIds.add(parentDocId);
            }
        }
        if (parentDocIds.size === 0) {
            // Nothing to look up, so skip the docstore round trip entirely.
            return [];
        }
        // NOTE(review): `this.docstore` is not assigned in this class; it is
        // presumably provided by MultiVectorRetriever — confirm.
        const storedParentDocs = await this.docstore.mget([...parentDocIds]);
        const parentDocs = storedParentDocs.filter((doc) => doc !== undefined && doc !== null);
        return parentDocs.slice(0, this.parentK);
    }
    /**
     * Persists one batch of child chunks and, optionally, their parent documents.
     *
     * @param parentDoc Object whose entries (`id -> parent document`) are written
     * with `docstore.mset` when `addToDocstore` is truthy.
     * @param childDocs Child chunks to index. They are added through
     * `childDocumentRetriever` when one is set, otherwise through `vectorstore`.
     * @param addToDocstore Whether to write `parentDoc` to the docstore.
     */
    async _storeDocuments(parentDoc, childDocs, addToDocstore) {
        if (this.childDocumentRetriever) {
            await this.childDocumentRetriever.addDocuments(childDocs);
        }
        else {
            await this.vectorstore.addDocuments(childDocs);
        }
        if (addToDocstore) {
            await this.docstore.mset(Object.entries(parentDoc));
        }
    }
    /**
     * Adds documents to the docstore and vectorstores.
     * If a retriever is provided, it will be used to add documents instead of the vectorstore.
     * @param docs The documents to add. When a `parentSplitter` is set they are
     * split first and the resulting pieces are treated as the parent documents.
     * @param config.ids Optional list of ids for the parent documents. If provided
     * it must have the same length as the list of parent documents (that is,
     * after `parentSplitter` has been applied, when one is set). Can be provided
     * if parent documents are already in the document store and you don't want
     * to re-add them to the docstore. If not provided, random UUIDs are used as ids.
     * @param config.addToDocstore Boolean of whether to add documents to docstore
     * (defaults to true). This can be false only if `ids` are provided. You may
     * want to set this to false if the documents are already in the docstore
     * and you don't want to re-add them.
     * @param config.childDocChunkHeaderOptions Options object passed as the second
     * argument to `childSplitter.splitDocuments` (defaults to `{}`).
     * @throws Error if `ids` is omitted while `addToDocstore` is false, or if the
     * number of ids differs from the number of parent documents.
     */
    async addDocuments(docs, config) {
        const { ids, addToDocstore = true, childDocChunkHeaderOptions = {}, } = config ?? {};
        const parentDocs = this.parentSplitter
            ? await this.parentSplitter.splitDocuments(docs)
            : docs;
        let parentDocIds;
        if (ids === undefined) {
            // Generated ids are only discoverable through the docstore, so
            // skipping the docstore write would orphan the child chunks.
            if (!addToDocstore) {
                throw new Error(`If ids are not passed in, "config.addToDocstore" MUST be true`);
            }
            parentDocIds = parentDocs.map((_doc) => uuid.v4());
        }
        else {
            parentDocIds = ids;
        }
        if (parentDocs.length !== parentDocIds.length) {
            throw new Error(`Got uneven list of documents and ids.\nIf "ids" is provided, should be same length as "documents".`);
        }
        for (const [index, parentDoc] of parentDocs.entries()) {
            const parentDocId = parentDocIds[index];
            const subDocs = await this.childSplitter.splitDocuments([parentDoc], childDocChunkHeaderOptions);
            // Tag each child chunk with its parent's id so retrieval can map a
            // matching chunk back to the stored parent document.
            const taggedSubDocs = subDocs.map((subDoc) => new Document({
                pageContent: subDoc.pageContent,
                metadata: { ...subDoc.metadata, [this.idKey]: parentDocId },
            }));
            await this._storeDocuments({ [parentDocId]: parentDoc }, taggedSubDocs, addToDocstore);
        }
    }
}