agsamantha/node_modules/@langchain/community/dist/vectorstores/astradb.js

287 lines
12 KiB
JavaScript
Raw Normal View History

2024-10-02 15:15:21 -05:00
/* eslint-disable @typescript-eslint/no-explicit-any */
import * as uuid from "uuid";
import { DataAPIClient, } from "@datastax/astra-db-ts";
import { AsyncCaller, } from "@langchain/core/utils/async_caller";
import { Document } from "@langchain/core/documents";
import { maximalMarginalRelevance } from "@langchain/core/utils/math";
import { VectorStore, } from "@langchain/core/vectorstores";
export class AstraDBVectorStore extends VectorStore {
_vectorstoreType() {
return "astradb";
}
constructor(embeddings, args) {
super(embeddings, args);
Object.defineProperty(this, "astraDBClient", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "collectionName", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "collection", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "collectionOptions", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "idKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "contentKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
}); // if undefined the entirety of the content aside from the id and embedding will be stored as content
Object.defineProperty(this, "caller", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "skipCollectionProvisioning", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
const { token, endpoint, collection, collectionOptions, namespace, idKey, contentKey, skipCollectionProvisioning, ...callerArgs } = args;
const dataAPIClient = new DataAPIClient(token, { caller: ["langchainjs"] });
this.astraDBClient = dataAPIClient.db(endpoint, { namespace });
this.skipCollectionProvisioning = skipCollectionProvisioning ?? false;
if (this.skipCollectionProvisioning && collectionOptions) {
throw new Error("If 'skipCollectionProvisioning' has been set to true, 'collectionOptions' must not be defined");
}
this.collectionName = collection;
this.collectionOptions =
AstraDBVectorStore.applyCollectionOptionsDefaults(collectionOptions);
this.idKey = idKey ?? "_id";
this.contentKey = contentKey ?? "text";
this.caller = new AsyncCaller(callerArgs);
if (args.batchSize) {
console.warn("[WARNING]: `batchSize` is deprecated, and no longer has any effect.\n`astra-db-ts` > 1.0.0 handles this internally.");
}
}
static applyCollectionOptionsDefaults(fromUser) {
const copy = fromUser ? { ...fromUser } : {};
if (copy.checkExists === undefined) {
copy.checkExists = false;
}
if (copy.indexing === undefined) {
// same default as langchain python AstraDBVectorStore.
// this enables to create the collection in python/ts and use it in ts/python with default options.
copy.indexing = { allow: ["metadata"] };
}
return copy;
}
/**
* Create a new collection in your Astra DB vector database and then connects to it.
* If the collection already exists, it will connect to it as well.
*
* @returns Promise that resolves if connected to the collection.
*/
async initialize() {
if (!this.skipCollectionProvisioning) {
await this.astraDBClient.createCollection(this.collectionName, this.collectionOptions);
}
this.collection = await this.astraDBClient.collection(this.collectionName);
console.debug("Connected to Astra DB collection");
}
/**
* Method to save vectors to AstraDB.
*
* @param vectors Vectors to save.
* @param documents The documents associated with the vectors.
* @returns Promise that resolves when the vectors have been added.
*/
async addVectors(vectors, documents, options) {
if (!this.collection) {
throw new Error("Must connect to a collection before adding vectors");
}
const docs = vectors.map((embedding, idx) => ({
[this.idKey]: options?.[idx] ?? uuid.v4(),
[this.contentKey]: documents[idx].pageContent,
$vector: embedding,
...documents[idx].metadata,
}));
let insertResults;
const isInsertManyError = (error) => error.name === "InsertManyError";
try {
insertResults = await this.collection.insertMany(docs, {
ordered: false,
});
}
catch (error) {
if (isInsertManyError(error)) {
insertResults = error.partialResult;
}
else {
throw error;
}
}
const insertedIds = insertResults.insertedIds;
if (insertedIds.length !== docs.length) {
const missingDocs = docs.filter((doc) => !insertedIds.includes(doc[this.idKey]));
for (let i = 0; i < missingDocs.length; i += 1) {
await this.caller.call(async () => {
await this.collection?.replaceOne({ [this.idKey]: missingDocs[i][this.idKey] }, missingDocs[i]);
});
}
}
}
/**
* Method that adds documents to AstraDB.
*
* @param documents Array of documents to add to AstraDB.
* @param options Optional ids for the documents.
* @returns Promise that resolves the documents have been added.
*/
async addDocuments(documents, options) {
if (!this.collection) {
throw new Error("Must connect to a collection before adding vectors");
}
return this.addVectors(await this.embeddings.embedDocuments(documents.map((d) => d.pageContent)), documents, options);
}
/**
* Method that deletes documents from AstraDB.
*
* @param params AstraDeleteParameters for the delete.
* @returns Promise that resolves when the documents have been deleted.
*/
async delete(params) {
if (!this.collection) {
throw new Error("Must connect to a collection before deleting");
}
await this.collection.deleteMany({ [this.idKey]: { $in: params.ids } });
console.log(`Deleted ${params.ids.length} documents`);
}
/**
* Method that performs a similarity search in AstraDB and returns and similarity scores.
*
* @param query Query vector for the similarity search.
* @param k Number of top results to return.
* @param filter Optional filter to apply to the search.
* @returns Promise that resolves with an array of documents and their scores.
*/
async similaritySearchVectorWithScore(query, k, filter) {
if (!this.collection) {
throw new Error("Must connect to a collection before adding vectors");
}
const cursor = await this.collection.find(filter ?? {}, {
sort: { $vector: query },
limit: k,
includeSimilarity: true,
});
const results = [];
for await (const row of cursor) {
const { $similarity: similarity, [this.contentKey]: content, ...metadata } = row;
const doc = new Document({
pageContent: content,
metadata,
});
results.push([doc, similarity]);
}
return results;
}
/**
* Return documents selected using the maximal marginal relevance.
* Maximal marginal relevance optimizes for similarity to the query AND diversity
* among selected documents.
*
* @param {string} query - Text to look up documents similar to.
* @param {number} options.k - Number of documents to return.
* @param {number} options.fetchK - Number of documents to fetch before passing to the MMR algorithm.
* @param {number} options.lambda - Number between 0 and 1 that determines the degree of diversity among the results,
* where 0 corresponds to maximum diversity and 1 to minimum diversity.
* @param {CollectionFilter} options.filter - Optional filter
*
* @returns {Promise<Document[]>} - List of documents selected by maximal marginal relevance.
*/
async maxMarginalRelevanceSearch(query, options) {
if (!this.collection) {
throw new Error("Must connect to a collection before adding vectors");
}
const queryEmbedding = await this.embeddings.embedQuery(query);
const cursor = await this.collection.find(options.filter ?? {}, {
sort: { $vector: queryEmbedding },
limit: options.k,
includeSimilarity: true,
});
const results = (await cursor.toArray()) ?? [];
const embeddingList = results.map((row) => row.$vector);
const mmrIndexes = maximalMarginalRelevance(queryEmbedding, embeddingList, options.lambda, options.k);
const topMmrMatches = mmrIndexes.map((idx) => results[idx]);
const docs = [];
topMmrMatches.forEach((match) => {
const { [this.contentKey]: content, ...metadata } = match;
const doc = {
pageContent: content,
metadata,
};
docs.push(doc);
});
return docs;
}
/**
* Static method to create an instance of AstraDBVectorStore from texts.
*
* @param texts The texts to use.
* @param metadatas The metadata associated with the texts.
* @param embeddings The embeddings to use.
* @param dbConfig The arguments for the AstraDBVectorStore.
* @returns Promise that resolves with a new instance of AstraDBVectorStore.
*/
static async fromTexts(texts, metadatas, embeddings, dbConfig) {
const docs = [];
for (let i = 0; i < texts.length; i += 1) {
const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
const doc = new Document({
pageContent: texts[i],
metadata,
});
docs.push(doc);
}
return AstraDBVectorStore.fromDocuments(docs, embeddings, dbConfig);
}
/**
* Static method to create an instance of AstraDBVectorStore from documents.
*
* @param docs The Documents to use.
* @param embeddings The embeddings to use.
* @param dbConfig The arguments for the AstraDBVectorStore.
* @returns Promise that resolves with a new instance of AstraDBVectorStore.
*/
static async fromDocuments(docs, embeddings, dbConfig) {
const instance = new this(embeddings, dbConfig);
await instance.initialize();
await instance.addDocuments(docs);
return instance;
}
/**
* Static method to create an instance of AstraDBVectorStore from an existing index.
*
* @param embeddings The embeddings to use.
* @param dbConfig The arguments for the AstraDBVectorStore.
* @returns Promise that resolves with a new instance of AstraDBVectorStore.
*/
static async fromExistingIndex(embeddings, dbConfig) {
const instance = new this(embeddings, dbConfig);
await instance.initialize();
return instance;
}
}