218 lines
8.6 KiB
JavaScript
218 lines
8.6 KiB
JavaScript
"use strict";
|
|
/**
 * Module-interop helper: exposes property `key` of `source` on `target`
 * under the name `alias` (falls back to `key` when `alias` is undefined).
 *
 * An already-installed `this.__createBinding` is reused when present.
 * When `Object.create` exists, the property is installed with
 * `Object.defineProperty`; otherwise it is copied by plain assignment.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // No Object.create: fall back to a one-time value copy.
        return function (target, source, key, alias) {
            if (alias === undefined) alias = key;
            target[alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Decide whether the source descriptor can be reused as-is or must be
        // replaced by a getter that reads through to the source on each access.
        let needsLiveGetter;
        if (!descriptor) {
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            needsLiveGetter = !source.__esModule;
        }
        else {
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                },
            };
        }
        Object.defineProperty(target, alias, descriptor);
    };
})();
|
|
/**
 * Module-interop helper: stores `defaultValue` on `namespace` under the
 * key "default".
 *
 * An already-installed `this.__setModuleDefault` is reused when present.
 * When `Object.create` exists the property is defined as enumerable via
 * `Object.defineProperty`; otherwise it is set by plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, defaultValue) {
        const descriptor = { enumerable: true, value: defaultValue };
        Object.defineProperty(namespace, "default", descriptor);
    }
    : function (namespace, defaultValue) {
        namespace["default"] = defaultValue;
    });
|
|
/**
 * Module-interop helper: returns a namespace-style object for `mod`.
 *
 * - If `mod` is truthy and flagged `__esModule`, it is returned unchanged.
 * - Otherwise a fresh object is built: every own enumerable key of `mod`
 *   except "default" is bound via `__createBinding`, and `mod` itself is
 *   installed as the "default" entry via `__setModuleDefault`.
 *
 * An already-installed `this.__importStar` is reused when present.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    const namespace = {};
    if (mod != null) {
        for (const key in mod) {
            if (key === "default") {
                continue;
            }
            // for...in also walks inherited keys; keep own properties only.
            if (!Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.ParentDocumentRetriever = void 0;
|
|
const uuid = __importStar(require("uuid"));
|
|
const documents_1 = require("@langchain/core/documents");
|
|
const multi_vector_js_1 = require("./multi_vector.cjs");
|
|
/**
 * A type of document retriever that splits input documents into smaller chunks
 * while separately storing and preserving the original documents.
 * The small chunks are embedded, then on retrieval, the original
 * "parent" documents are retrieved.
 *
 * This strikes a balance between better targeted retrieval with small documents
 * and the more context-rich larger documents.
 * @example
 * ```typescript
 * const retriever = new ParentDocumentRetriever({
 *   vectorstore: new MemoryVectorStore(new OpenAIEmbeddings()),
 *   byteStore: new InMemoryStore<Uint8Array>(),
 *   parentSplitter: new RecursiveCharacterTextSplitter({
 *     chunkOverlap: 0,
 *     chunkSize: 500,
 *   }),
 *   childSplitter: new RecursiveCharacterTextSplitter({
 *     chunkOverlap: 0,
 *     chunkSize: 50,
 *   }),
 *   childK: 20,
 *   parentK: 5,
 * });
 *
 * const parentDocuments = await getDocuments();
 * await retriever.addDocuments(parentDocuments);
 * const retrievedDocs = await retriever.getRelevantDocuments("justice breyer");
 * ```
 */
class ParentDocumentRetriever extends multi_vector_js_1.MultiVectorRetriever {
    static lc_name() {
        return "ParentDocumentRetriever";
    }
    /**
     * @param fields Configuration object. It is forwarded to the parent
     *   constructor, and the following keys are read here: `vectorstore`,
     *   `childSplitter`, `parentSplitter`, `idKey` (defaults to "doc_id"),
     *   `childK`, `parentK`, `childDocumentRetriever`, `documentCompressor`
     *   and `documentCompressorFilteringFn`.
     */
    constructor(fields) {
        super(fields);
        // Own properties are defined explicitly (enumerable, configurable,
        // writable) and in this exact order, before any value from `fields`
        // is assigned.
        const fieldDefaults = [
            ["lc_namespace", ["langchain", "retrievers", "parent_document"]],
            ["vectorstore", void 0],
            ["childSplitter", void 0],
            ["parentSplitter", void 0],
            ["idKey", "doc_id"],
            ["childK", void 0],
            ["parentK", void 0],
            ["childDocumentRetriever", void 0],
            ["documentCompressor", void 0],
            ["documentCompressorFilteringFn", void 0],
        ];
        for (const [name, value] of fieldDefaults) {
            Object.defineProperty(this, name, {
                enumerable: true,
                configurable: true,
                writable: true,
                value,
            });
        }
        this.vectorstore = fields.vectorstore;
        this.childSplitter = fields.childSplitter;
        this.parentSplitter = fields.parentSplitter;
        this.idKey = fields.idKey ?? this.idKey;
        this.childK = fields.childK;
        this.parentK = fields.parentK;
        this.childDocumentRetriever = fields.childDocumentRetriever;
        this.documentCompressor = fields.documentCompressor;
        this.documentCompressorFilteringFn = fields.documentCompressorFilteringFn;
    }
    /**
     * Finds child chunks matching `query`, then returns the parent documents
     * they point to.
     *
     * Child chunks come from `childDocumentRetriever` when one is configured,
     * otherwise from `vectorstore.similaritySearch(query, childK)`. When a
     * `documentCompressor` is configured and there are child chunks, they are
     * passed through it (and then through `documentCompressorFilteringFn`, if
     * set). Parent ids are read from `metadata[idKey]` of each child chunk,
     * de-duplicated in first-seen order, looked up in the docstore, and the
     * result is truncated to `parentK` entries.
     *
     * @param query The query string.
     * @returns The parent documents found in the docstore, at most `parentK`.
     */
    async _getRelevantDocuments(query) {
        let subDocs = this.childDocumentRetriever
            ? await this.childDocumentRetriever.getRelevantDocuments(query)
            : await this.vectorstore.similaritySearch(query, this.childK);
        if (this.documentCompressor && subDocs.length) {
            subDocs = await this.documentCompressor.compressDocuments(subDocs, query);
            if (this.documentCompressorFilteringFn) {
                subDocs = this.documentCompressorFilteringFn(subDocs);
            }
        }
        // A Set keeps first-seen order while de-duplicating in linear time.
        const parentDocIds = new Set();
        for (const doc of subDocs) {
            const parentDocId = doc.metadata?.[this.idKey];
            // Child chunks without a parent id cannot be resolved; skip them
            // instead of sending an undefined key to the docstore.
            if (parentDocId !== undefined && parentDocId !== null) {
                parentDocIds.add(parentDocId);
            }
        }
        if (parentDocIds.size === 0) {
            return [];
        }
        const storedParentDocs = await this.docstore.mget([...parentDocIds]);
        // Ids missing from the docstore come back as undefined; drop them.
        return storedParentDocs
            .filter((doc) => doc !== undefined)
            .slice(0, this.parentK);
    }
    /**
     * Persists one parent document and its child chunks.
     *
     * @param parentDoc Object mapping the parent id to the parent document;
     *   its entries are written to the docstore when `addToDocstore` is truthy.
     * @param childDocs Child chunks, added through `childDocumentRetriever`
     *   when one is configured, otherwise through `vectorstore`.
     * @param addToDocstore Whether to write `parentDoc` to the docstore.
     */
    async _storeDocuments(parentDoc, childDocs, addToDocstore) {
        if (this.childDocumentRetriever) {
            await this.childDocumentRetriever.addDocuments(childDocs);
        }
        else {
            await this.vectorstore.addDocuments(childDocs);
        }
        if (addToDocstore) {
            await this.docstore.mset(Object.entries(parentDoc));
        }
    }
    /**
     * Adds documents to the docstore and vectorstores.
     * If a retriever is provided, it will be used to add documents instead of the vectorstore.
     * @param docs The documents to add
     * @param config.ids Optional list of ids for documents. If provided should be the same
     *   length as the list of documents (after splitting with `parentSplitter`,
     *   when one is configured). Can be provided if parent documents
     *   are already in the document store and you don't want to re-add
     *   to the docstore. If not provided, random UUIDs will be used as ids.
     * @param config.addToDocstore Boolean of whether to add documents to docstore.
     *   This can be false if and only if `ids` are provided. You may want
     *   to set this to false if the documents are already in the docstore
     *   and you don't want to re-add them. Defaults to true.
     * @param config.childDocChunkHeaderOptions Object with options for adding contextual
     *   chunk headers; passed as the second argument to `childSplitter.splitDocuments`.
     * @throws Error if `ids` is omitted while `addToDocstore` is false, or if the
     *   number of ids differs from the number of parent documents.
     */
    async addDocuments(docs, config) {
        const { ids, addToDocstore = true, childDocChunkHeaderOptions = {}, } = config ?? {};
        const parentDocs = this.parentSplitter
            ? await this.parentSplitter.splitDocuments(docs)
            : docs;
        let parentDocIds;
        if (ids === undefined) {
            if (!addToDocstore) {
                throw new Error(`If ids are not passed in, "config.addToDocstore" MUST be true`);
            }
            parentDocIds = parentDocs.map((_doc) => uuid.v4());
        }
        else {
            parentDocIds = ids;
        }
        if (parentDocs.length !== parentDocIds.length) {
            throw new Error(`Got uneven list of documents and ids.\nIf "ids" is provided, should be same length as "documents".`);
        }
        // Parents are processed one at a time so chunks are stored in input order.
        for (let i = 0; i < parentDocs.length; i += 1) {
            const parentDoc = parentDocs[i];
            const parentDocId = parentDocIds[i];
            const subDocs = await this.childSplitter.splitDocuments([parentDoc], childDocChunkHeaderOptions);
            // Tag every child chunk with its parent's id so retrieval can map back.
            const taggedSubDocs = subDocs.map((subDoc) => new documents_1.Document({
                pageContent: subDoc.pageContent,
                metadata: { ...subDoc.metadata, [this.idKey]: parentDocId },
            }));
            await this._storeDocuments({ [parentDocId]: parentDoc }, taggedSubDocs, addToDocstore);
        }
    }
}
|
|
exports.ParentDocumentRetriever = ParentDocumentRetriever;
|