import * as uuid from "uuid"; import { VectorStore } from "@langchain/core/vectorstores"; import { Document } from "@langchain/core/documents"; /** * Class for interacting with an Elasticsearch database. It extends the * VectorStore base class and provides methods for adding documents and * vectors to the Elasticsearch database, performing similarity searches, * deleting documents, and more. */ export class ElasticVectorSearch extends VectorStore { _vectorstoreType() { return "elasticsearch"; } constructor(embeddings, args) { super(embeddings, args); Object.defineProperty(this, "client", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "indexName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "engine", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "similarity", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "efConstruction", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "m", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "candidates", { enumerable: true, configurable: true, writable: true, value: void 0 }); this.engine = args.vectorSearchOptions?.engine ?? "hnsw"; this.similarity = args.vectorSearchOptions?.similarity ?? "l2_norm"; this.m = args.vectorSearchOptions?.m ?? 16; this.efConstruction = args.vectorSearchOptions?.efConstruction ?? 100; this.candidates = args.vectorSearchOptions?.candidates ?? 200; this.client = args.client.child({ headers: { "user-agent": "langchain-js-vs/0.0.1" }, }); this.indexName = args.indexName ?? "documents"; } /** * Method to add documents to the Elasticsearch database. It first * converts the documents to vectors using the embeddings, then adds the * vectors to the database. * @param documents The documents to add to the database. * @param options Optional parameter that can contain the IDs for the documents. * @returns A promise that resolves with the IDs of the added documents. */ async addDocuments(documents, options) { const texts = documents.map(({ pageContent }) => pageContent); return this.addVectors(await this.embeddings.embedDocuments(texts), documents, options); } /** * Method to add vectors to the Elasticsearch database. It ensures the * index exists, then adds the vectors and their corresponding documents * to the database. * @param vectors The vectors to add to the database. * @param documents The documents corresponding to the vectors. * @param options Optional parameter that can contain the IDs for the documents. * @returns A promise that resolves with the IDs of the added documents. */ async addVectors(vectors, documents, options) { await this.ensureIndexExists(vectors[0].length, this.engine, this.similarity, this.efConstruction, this.m); const documentIds = options?.ids ?? Array.from({ length: vectors.length }, () => uuid.v4()); const operations = vectors.flatMap((embedding, idx) => [ { index: { _id: documentIds[idx], _index: this.indexName, }, }, { embedding, metadata: documents[idx].metadata, text: documents[idx].pageContent, }, ]); const results = await this.client.bulk({ refresh: true, operations }); if (results.errors) { const reasons = results.items.map((result) => result.index?.error?.reason); throw new Error(`Failed to insert documents:\n${reasons.join("\n")}`); } return documentIds; } /** * Method to perform a similarity search in the Elasticsearch database * using a vector. It returns the k most similar documents along with * their similarity scores. * @param query The query vector. * @param k The number of most similar documents to return. * @param filter Optional filter to apply to the search. * @returns A promise that resolves with an array of tuples, where each tuple contains a Document and its similarity score. */ async similaritySearchVectorWithScore(query, k, filter) { const result = await this.client.search({ index: this.indexName, size: k, knn: { field: "embedding", query_vector: query, filter: { bool: this.buildMetadataTerms(filter) }, k, num_candidates: this.candidates, }, }); // eslint-disable-next-line @typescript-eslint/no-explicit-any return result.hits.hits.map((hit) => [ new Document({ pageContent: hit._source.text, metadata: hit._source.metadata, }), hit._score, ]); } /** * Method to delete documents from the Elasticsearch database. * @param params Object containing the IDs of the documents to delete. * @returns A promise that resolves when the deletion is complete. */ async delete(params) { const operations = params.ids.map((id) => ({ delete: { _id: id, _index: this.indexName, }, })); if (operations.length > 0) await this.client.bulk({ refresh: true, operations }); } /** * Static method to create an ElasticVectorSearch instance from texts. It * creates Document instances from the texts and their corresponding * metadata, then calls the fromDocuments method to create the * ElasticVectorSearch instance. * @param texts The texts to create the ElasticVectorSearch instance from. * @param metadatas The metadata corresponding to the texts. * @param embeddings The embeddings to use for the documents. * @param args The arguments to create the Elasticsearch client. * @returns A promise that resolves with the created ElasticVectorSearch instance. */ static fromTexts(texts, metadatas, embeddings, args) { const documents = texts.map((text, idx) => { const metadata = Array.isArray(metadatas) ? metadatas[idx] : metadatas; return new Document({ pageContent: text, metadata }); }); return ElasticVectorSearch.fromDocuments(documents, embeddings, args); } /** * Static method to create an ElasticVectorSearch instance from Document * instances. It adds the documents to the Elasticsearch database, then * returns the ElasticVectorSearch instance. * @param docs The Document instances to create the ElasticVectorSearch instance from. * @param embeddings The embeddings to use for the documents. * @param dbConfig The configuration for the Elasticsearch database. * @returns A promise that resolves with the created ElasticVectorSearch instance. */ static async fromDocuments(docs, embeddings, dbConfig) { const store = new ElasticVectorSearch(embeddings, dbConfig); await store.addDocuments(docs).then(() => store); return store; } /** * Static method to create an ElasticVectorSearch instance from an * existing index in the Elasticsearch database. It checks if the index * exists, then returns the ElasticVectorSearch instance if it does. * @param embeddings The embeddings to use for the documents. * @param dbConfig The configuration for the Elasticsearch database. * @returns A promise that resolves with the created ElasticVectorSearch instance if the index exists, otherwise it throws an error. */ static async fromExistingIndex(embeddings, dbConfig) { const store = new ElasticVectorSearch(embeddings, dbConfig); const exists = await store.doesIndexExist(); if (exists) { return store; } throw new Error(`The index ${store.indexName} does not exist.`); } async ensureIndexExists(dimension, engine = "hnsw", similarity = "l2_norm", efConstruction = 100, m = 16) { const request = { index: this.indexName, mappings: { dynamic_templates: [ { // map all metadata properties to be keyword except loc metadata_except_loc: { match_mapping_type: "*", match: "metadata.*", unmatch: "metadata.loc", mapping: { type: "keyword" }, }, }, ], properties: { text: { type: "text" }, metadata: { type: "object", properties: { loc: { type: "object" }, // explicitly define loc as an object }, }, embedding: { type: "dense_vector", dims: dimension, index: true, similarity, index_options: { type: engine, m, ef_construction: efConstruction, }, }, }, }, }; const indexExists = await this.doesIndexExist(); if (indexExists) return; await this.client.indices.create(request); } buildMetadataTerms(filter) { if (filter == null) return { must: [], must_not: [] }; const filters = Array.isArray(filter) ? filter : Object.entries(filter).map(([key, value]) => ({ operator: "term", field: key, value, })); const must = []; const must_not = []; const should = []; for (const condition of filters) { const metadataField = `metadata.${condition.field}`; if (condition.operator === "exists") { must.push({ [condition.operator]: { field: metadataField, }, }); } else if (condition.operator === "exclude") { const toExclude = { [metadataField]: condition.value }; must_not.push({ ...(Array.isArray(condition.value) ? { terms: toExclude } : { term: toExclude }), }); } else if (condition.operator === "or") { should.push({ term: { [metadataField]: condition.value, }, }); } else { must.push({ [condition.operator]: { [metadataField]: condition.value, }, }); } } const result = { must, must_not }; if (should.length > 0) { result.should = should; result.minimum_should_match = 1; } return result; } /** * Method to check if an index exists in the Elasticsearch database. * @returns A promise that resolves with a boolean indicating whether the index exists. */ async doesIndexExist() { return await this.client.indices.exists({ index: this.indexName }); } /** * Method to delete an index from the Elasticsearch database if it exists. * @returns A promise that resolves when the deletion is complete. */ async deleteIfExists() { const indexExists = await this.doesIndexExist(); if (!indexExists) return; await this.client.indices.delete({ index: this.indexName }); } }