agsamantha/node_modules/@langchain/community/dist/vectorstores/elasticsearch.js
2024-10-02 15:15:21 -05:00

316 lines
12 KiB
JavaScript

import * as uuid from "uuid";
import { VectorStore } from "@langchain/core/vectorstores";
import { Document } from "@langchain/core/documents";
/**
 * Vector store backed by an Elasticsearch index. It extends the VectorStore
 * base class and provides methods for adding documents and vectors,
 * running kNN similarity searches with optional metadata filters, deleting
 * documents by ID, and checking for / removing the underlying index.
 */
export class ElasticVectorSearch extends VectorStore {
    /**
     * Identifier for this vector store implementation.
     * @returns The string "elasticsearch".
     */
    _vectorstoreType() {
        return "elasticsearch";
    }
    /**
     * @param embeddings The embeddings instance forwarded to the VectorStore base class.
     * @param args Configuration object. Reads `client` (required; `client.child(...)` is
     * called on it), `indexName` (defaults to "documents") and the optional
     * `vectorSearchOptions` with `engine`, `similarity`, `m`, `efConstruction`
     * and `candidates`.
     */
    constructor(embeddings, args) {
        super(embeddings, args);
        // Declare every instance field up front as an own, enumerable, writable
        // property, in a fixed order, before any value is assigned.
        for (const field of [
            "client",
            "indexName",
            "engine",
            "similarity",
            "efConstruction",
            "m",
            "candidates",
        ]) {
            Object.defineProperty(this, field, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: void 0,
            });
        }
        const searchOptions = args.vectorSearchOptions;
        this.engine = searchOptions?.engine ?? "hnsw";
        this.similarity = searchOptions?.similarity ?? "l2_norm";
        this.m = searchOptions?.m ?? 16;
        this.efConstruction = searchOptions?.efConstruction ?? 100;
        this.candidates = searchOptions?.candidates ?? 200;
        // Use a child client so requests from this store carry their own user-agent header.
        this.client = args.client.child({
            headers: { "user-agent": "langchain-js-vs/0.0.1" },
        });
        this.indexName = args.indexName ?? "documents";
    }
    /**
     * Method to add documents to the Elasticsearch database. It first
     * converts the documents to vectors using the embeddings, then adds the
     * vectors to the database.
     * @param documents The documents to add to the database.
     * @param options Optional parameter that can contain the IDs for the documents.
     * @returns A promise that resolves with the IDs of the added documents.
     */
    async addDocuments(documents, options) {
        const texts = documents.map(({ pageContent }) => pageContent);
        const vectors = await this.embeddings.embedDocuments(texts);
        return this.addVectors(vectors, documents, options);
    }
    /**
     * Method to add vectors to the Elasticsearch database. It ensures the
     * index exists (sized from the first vector's length), then bulk-indexes
     * the vectors together with their documents' text and metadata.
     * @param vectors The vectors to add to the database.
     * @param documents The documents corresponding to the vectors, by position.
     * @param options Optional parameter that can contain the IDs for the documents;
     * when `options.ids` is absent, a UUID v4 is generated per vector.
     * @returns A promise that resolves with the IDs of the added documents
     * (an empty array when no vectors were supplied).
     * @throws Error when the bulk response reports errors.
     */
    async addVectors(vectors, documents, options) {
        // Nothing to index. This also avoids reading `vectors[0]` on an empty batch,
        // which would otherwise throw a TypeError.
        if (vectors.length === 0) {
            return [];
        }
        await this.ensureIndexExists(vectors[0].length, this.engine, this.similarity, this.efConstruction, this.m);
        const documentIds = options?.ids ?? Array.from({ length: vectors.length }, () => uuid.v4());
        // The bulk body alternates an action line with the document source it applies to.
        const operations = vectors.flatMap((embedding, idx) => [
            {
                index: {
                    _id: documentIds[idx],
                    _index: this.indexName,
                },
            },
            {
                embedding,
                metadata: documents[idx].metadata,
                text: documents[idx].pageContent,
            },
        ]);
        const results = await this.client.bulk({ refresh: true, operations });
        if (results.errors) {
            // Items without an index error yield no reason; drop them so the
            // message lists failures only instead of containing blank lines.
            const reasons = results.items
                .map((item) => item.index?.error?.reason)
                .filter((reason) => reason != null);
            throw new Error(`Failed to insert documents:\n${reasons.join("\n")}`);
        }
        return documentIds;
    }
    /**
     * Method to perform a similarity search in the Elasticsearch database
     * using a vector. It issues a kNN search on the "embedding" field and
     * returns the hits along with their scores.
     * @param query The query vector.
     * @param k The number of most similar documents to return.
     * @param filter Optional metadata filter; see buildMetadataTerms for the accepted shapes.
     * @returns A promise that resolves with an array of tuples, where each tuple contains a Document and its similarity score.
     */
    async similaritySearchVectorWithScore(query, k, filter) {
        const result = await this.client.search({
            index: this.indexName,
            size: k,
            knn: {
                field: "embedding",
                query_vector: query,
                filter: { bool: this.buildMetadataTerms(filter) },
                k,
                num_candidates: this.candidates,
            },
        });
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        return result.hits.hits.map((hit) => [
            new Document({
                pageContent: hit._source.text,
                metadata: hit._source.metadata,
            }),
            hit._score,
        ]);
    }
    /**
     * Method to delete documents from the Elasticsearch database.
     * @param params Object containing the IDs of the documents to delete.
     * @returns A promise that resolves when the deletion is complete. No
     * request is sent when `params.ids` is empty.
     */
    async delete(params) {
        const operations = params.ids.map((id) => ({
            delete: {
                _id: id,
                _index: this.indexName,
            },
        }));
        if (operations.length > 0) {
            await this.client.bulk({ refresh: true, operations });
        }
    }
    /**
     * Static method to create an ElasticVectorSearch instance from texts. It
     * creates Document instances from the texts and their corresponding
     * metadata, then calls the fromDocuments method to create the
     * ElasticVectorSearch instance.
     * @param texts The texts to create the ElasticVectorSearch instance from.
     * @param metadatas The metadata corresponding to the texts: either an array
     * (matched to texts by position) or a single object shared by every text.
     * @param embeddings The embeddings to use for the documents.
     * @param args The arguments to create the Elasticsearch client.
     * @returns A promise that resolves with the created ElasticVectorSearch instance.
     */
    static fromTexts(texts, metadatas, embeddings, args) {
        const documents = texts.map((text, idx) => {
            const metadata = Array.isArray(metadatas) ? metadatas[idx] : metadatas;
            return new Document({ pageContent: text, metadata });
        });
        return ElasticVectorSearch.fromDocuments(documents, embeddings, args);
    }
    /**
     * Static method to create an ElasticVectorSearch instance from Document
     * instances. It adds the documents to the Elasticsearch database, then
     * returns the ElasticVectorSearch instance.
     * @param docs The Document instances to create the ElasticVectorSearch instance from.
     * @param embeddings The embeddings to use for the documents.
     * @param dbConfig The configuration for the Elasticsearch database.
     * @returns A promise that resolves with the created ElasticVectorSearch instance.
     */
    static async fromDocuments(docs, embeddings, dbConfig) {
        const store = new ElasticVectorSearch(embeddings, dbConfig);
        await store.addDocuments(docs);
        return store;
    }
    /**
     * Static method to create an ElasticVectorSearch instance from an
     * existing index in the Elasticsearch database. It checks if the index
     * exists, then returns the ElasticVectorSearch instance if it does.
     * @param embeddings The embeddings to use for the documents.
     * @param dbConfig The configuration for the Elasticsearch database.
     * @returns A promise that resolves with the created ElasticVectorSearch instance if the index exists, otherwise it throws an error.
     */
    static async fromExistingIndex(embeddings, dbConfig) {
        const store = new ElasticVectorSearch(embeddings, dbConfig);
        const exists = await store.doesIndexExist();
        if (!exists) {
            throw new Error(`The index ${store.indexName} does not exist.`);
        }
        return store;
    }
    /**
     * Creates the index named by `this.indexName` if it does not exist yet.
     * The mapping defines `text` as a text field, `metadata` as an object
     * (with `metadata.loc` explicitly an object and all other metadata
     * properties mapped to keyword), and `embedding` as an indexed
     * dense_vector.
     * @param dimension The number of dimensions of the stored vectors.
     * @param engine The dense_vector index_options type.
     * @param similarity The dense_vector similarity setting.
     * @param efConstruction The index_options `ef_construction` value.
     * @param m The index_options `m` value.
     * @returns A promise that resolves once the index exists.
     */
    async ensureIndexExists(dimension, engine = "hnsw", similarity = "l2_norm", efConstruction = 100, m = 16) {
        // Check first so the mapping is only built when it is actually needed.
        const indexExists = await this.doesIndexExist();
        if (indexExists) {
            return;
        }
        const request = {
            index: this.indexName,
            mappings: {
                dynamic_templates: [
                    {
                        // map all metadata properties to be keyword except loc
                        metadata_except_loc: {
                            match_mapping_type: "*",
                            match: "metadata.*",
                            unmatch: "metadata.loc",
                            mapping: { type: "keyword" },
                        },
                    },
                ],
                properties: {
                    text: { type: "text" },
                    metadata: {
                        type: "object",
                        properties: {
                            loc: { type: "object" }, // explicitly define loc as an object
                        },
                    },
                    embedding: {
                        type: "dense_vector",
                        dims: dimension,
                        index: true,
                        similarity,
                        index_options: {
                            type: engine,
                            m,
                            ef_construction: efConstruction,
                        },
                    },
                },
            },
        };
        await this.client.indices.create(request);
    }
    /**
     * Translates a metadata filter into the body of a bool query.
     *
     * The filter is either an array of `{ operator, field, value }` conditions
     * or a plain object, in which case each `key: value` entry is treated as a
     * "term" condition on that key. Every field is prefixed with "metadata.".
     *
     * - "exists" adds an exists clause to `must`.
     * - "exclude" adds a term clause (terms when the value is an array) to `must_not`.
     * - "or" adds a term clause to `should`; when any are present,
     *   `minimum_should_match` is set to 1.
     * - any other operator is used verbatim as the query type of a `must` clause.
     *
     * @param filter The filter to translate; null/undefined yields empty clauses.
     * @returns An object with `must` and `must_not` arrays, plus `should` and
     * `minimum_should_match` when at least one "or" condition was given.
     */
    buildMetadataTerms(filter) {
        if (filter == null) {
            return { must: [], must_not: [] };
        }
        const filters = Array.isArray(filter)
            ? filter
            : Object.entries(filter).map(([key, value]) => ({
                operator: "term",
                field: key,
                value,
            }));
        const must = [];
        const must_not = [];
        const should = [];
        for (const condition of filters) {
            const metadataField = `metadata.${condition.field}`;
            if (condition.operator === "exists") {
                must.push({
                    [condition.operator]: {
                        field: metadataField,
                    },
                });
            }
            else if (condition.operator === "exclude") {
                const toExclude = { [metadataField]: condition.value };
                must_not.push({
                    ...(Array.isArray(condition.value)
                        ? { terms: toExclude }
                        : { term: toExclude }),
                });
            }
            else if (condition.operator === "or") {
                should.push({
                    term: {
                        [metadataField]: condition.value,
                    },
                });
            }
            else {
                must.push({
                    [condition.operator]: {
                        [metadataField]: condition.value,
                    },
                });
            }
        }
        const result = { must, must_not };
        if (should.length > 0) {
            result.should = should;
            result.minimum_should_match = 1;
        }
        return result;
    }
    /**
     * Method to check if an index exists in the Elasticsearch database.
     * @returns A promise that resolves with a boolean indicating whether the index exists.
     */
    async doesIndexExist() {
        return await this.client.indices.exists({ index: this.indexName });
    }
    /**
     * Method to delete an index from the Elasticsearch database if it exists.
     * @returns A promise that resolves when the deletion is complete.
     */
    async deleteIfExists() {
        const indexExists = await this.doesIndexExist();
        if (!indexExists) {
            return;
        }
        await this.client.indices.delete({ index: this.indexName });
    }
}