/* eslint-disable prefer-template */
import { v4 as uuidv4 } from "uuid";
import { VectorStore } from "@langchain/core/vectorstores";
import { Document } from "@langchain/core/documents";
import { maximalMarginalRelevance } from "@langchain/core/utils/math";
import { CassandraTable } from "../utils/cassandra.js";

/**
 * Class for interacting with the Cassandra database. It extends the
 * VectorStore class and provides methods for adding vectors and
 * documents, searching for similar vectors, and creating instances from
 * texts or documents.
 */
export class CassandraStore extends VectorStore {
  /**
   * Identifier of this vector store implementation.
   * @returns The string "cassandra".
   */
  _vectorstoreType() {
    return "cassandra";
  }

  /**
   * Normalizes the user supplied arguments: fills in the default primary
   * key, the default metadata column (with its indices), the text and
   * vector columns, and a default index on the vector column.
   * The caller's `args` object and the arrays it holds are not modified.
   * @param args The raw arguments passed to the constructor.
   * @returns A copy of `args` with `vectorType`, `primaryKey`,
   *   `nonKeyColumns`, `metadataColumns` and `indices` replaced by their
   *   normalized values.
   * @throws Error if `table` or `dimensions` is missing.
   */
  _cleanArgs(args) {
    const {
      table,
      dimensions,
      primaryKey,
      nonKeyColumns,
      indices,
      metadataColumns,
      vectorType = "cosine",
    } = args;
    if (!table || !dimensions) {
      throw new Error("Missing required arguments");
    }

    // Utility function to ensure the argument is treated as an array
    function _toArray(value) {
      return Array.isArray(value) ? value : [value];
    }

    // Work on a copy: default indices are appended below and must not leak
    // into the array owned by the caller.
    const indicesArg = [..._toArray(indices || [])];

    // Use the primary key if provided, else default to a single auto-generated UUID column
    let primaryKeyArg;
    if (primaryKey) {
      primaryKeyArg = _toArray(primaryKey);
    } else {
      primaryKeyArg = [
        { name: this.idColumnAutoName, type: "uuid", partition: true },
      ];
    }

    // The combined nonKeyColumns and metadataColumns, de-duped by name
    // (the first occurrence of a name wins).
    const combinedColumns = [
      ..._toArray(nonKeyColumns || []),
      ..._toArray(metadataColumns || []),
    ];
    const deduplicatedColumns = combinedColumns.filter(
      (col, index, self) =>
        self.findIndex((c) => c.name === col.name) === index
    );
    const nonKeyColumnsArg = [...deduplicatedColumns];

    // If no metadata columns are specified, add a default metadata column consistent with Langchain Python
    if (nonKeyColumnsArg.length === 0) {
      nonKeyColumnsArg.push({
        name: this.metadataColumnDefaultName,
        type: "map",
      });
      indicesArg.push({
        name: `idx_${this.metadataColumnDefaultName}_${table}_keys`,
        value: `(keys(${this.metadataColumnDefaultName}))`,
      });
      indicesArg.push({
        name: `idx_${this.metadataColumnDefaultName}_${table}_entries`,
        value: `(entries(${this.metadataColumnDefaultName}))`,
      });
    }

    // Appends a column only when no column with the same name exists yet.
    const addDefaultNonKeyColumnIfNeeded = (defaultColumn) => {
      const column = nonKeyColumnsArg.find(
        (col) => col.name === defaultColumn.name
      );
      if (!column) {
        nonKeyColumnsArg.push(defaultColumn);
      }
    };
    addDefaultNonKeyColumnIfNeeded({ name: this.textColumnName, type: "text" });
    addDefaultNonKeyColumnIfNeeded({
      name: this.vectorColumnName,
      type: "VECTOR",
      alias: this.embeddingColumnAlias,
    });

    // If no index is specified for the vector column, add a default index.
    // The pattern is built once instead of once per inspected index.
    const vectorIndexPattern = new RegExp(
      `\\(\\s*${this.vectorColumnName.toLowerCase()}\\s*\\)`
    );
    const hasVectorIndex = indicesArg.some((index) =>
      vectorIndexPattern.test(index.value.toLowerCase())
    );
    if (!hasVectorIndex) {
      indicesArg.push({
        name: `idx_${this.vectorColumnName}_${table}`,
        value: `(${this.vectorColumnName})`,
        options: `{'similarity_function': '${vectorType.toLowerCase()}'}`,
      });
    }

    // Metadata the user will see excludes vector column and text column
    const metadataColumnsArg = [...primaryKeyArg, ...nonKeyColumnsArg].filter(
      (column) =>
        column.name !== this.vectorColumnName &&
        column.name !== this.textColumnName
    );

    return {
      ...args,
      vectorType,
      primaryKey: primaryKeyArg,
      nonKeyColumns: nonKeyColumnsArg,
      metadataColumns: metadataColumnsArg,
      indices: indicesArg,
    };
  }

  /**
   * Looks up a column definition by name.
   * @param columns A single column or an array of columns.
   * @param columnName The name to look for.
   * @returns The first column whose `name` equals `columnName`.
   * @throws Error if no such column exists.
   */
  _getColumnByName(columns, columnName) {
    const columnsArray = Array.isArray(columns) ? columns : [columns];
    const column = columnsArray.find((col) => col.name === columnName);
    if (!column) {
      throw new Error(`Column ${columnName} not found`);
    }
    return column;
  }

  /**
   * Creates the store and the CassandraTable wrapper it operates on.
   * @param embeddings The embeddings implementation used for documents and queries.
   * @param args The store arguments; `table` and `dimensions` are required.
   * @throws Error if required arguments are missing or the text/vector
   *   columns cannot be resolved after argument normalization.
   */
  constructor(embeddings, args) {
    super(embeddings, args);
    Object.defineProperty(this, "table", { enumerable: true, configurable: true, writable: true, value: void 0 });
    Object.defineProperty(this, "idColumnAutoName", { enumerable: true, configurable: true, writable: true, value: "id" });
    Object.defineProperty(this, "idColumnAutoGenerated", { enumerable: true, configurable: true, writable: true, value: void 0 });
    Object.defineProperty(this, "vectorColumnName", { enumerable: true, configurable: true, writable: true, value: "vector" });
    Object.defineProperty(this, "vectorColumn", { enumerable: true, configurable: true, writable: true, value: void 0 });
    Object.defineProperty(this, "textColumnName", { enumerable: true, configurable: true, writable: true, value: "text" });
    Object.defineProperty(this, "textColumn", { enumerable: true, configurable: true, writable: true, value: void 0 });
    Object.defineProperty(this, "metadataColumnDefaultName", { enumerable: true, configurable: true, writable: true, value: "metadata" });
    Object.defineProperty(this, "metadataColumns", { enumerable: true, configurable: true, writable: true, value: void 0 });
    Object.defineProperty(this, "similarityColumn", { enumerable: true, configurable: true, writable: true, value: void 0 });
    Object.defineProperty(this, "embeddingColumnAlias", { enumerable: true, configurable: true, writable: true, value: "embedding" });

    const cleanedArgs = this._cleanArgs(args);

    // This check here to help the compiler understand that nonKeyColumns will always
    // have values after the _cleanArgs call. It is the cleanest way to handle the fact
    // that the compiler is not able to make this determination, no matter how hard we try!
    if (!cleanedArgs.nonKeyColumns || cleanedArgs.nonKeyColumns.length === 0) {
      throw new Error("No non-key columns provided");
    }

    this.vectorColumn = this._getColumnByName(
      cleanedArgs.nonKeyColumns,
      this.vectorColumnName
    );
    this.textColumn = this._getColumnByName(
      cleanedArgs.nonKeyColumns,
      this.textColumnName
    );
    // Pseudo-column selecting the similarity score; the `?` is bound to the
    // query vector at search time.
    this.similarityColumn = {
      name: `similarity_${cleanedArgs.vectorType}(${this.vectorColumnName},?)`,
      alias: "similarity_score",
      type: "",
    };
    // Ids are generated by this class only when the caller did not define a primary key.
    this.idColumnAutoGenerated = !args.primaryKey;
    this.metadataColumns = cleanedArgs.metadataColumns;
    this.table = new CassandraTable(cleanedArgs);
  }

  /**
   * Method to save vectors to the Cassandra database.
   * @param vectors Vectors to save.
   * @param documents The documents associated with the vectors, in the same order.
   * @returns Promise that resolves when the vectors have been added.
   */
  async addVectors(vectors, documents) {
    if (vectors.length === 0) {
      return;
    }

    // Prepare the values for upsert
    const values = vectors.map((vector, index) => {
      const document = documents[index];
      const docMetadata = document.metadata || {};

      // If idColumnAutoGenerated is true and ID is not provided, generate a UUID.
      // When the document carries its own metadata object, the generated id is
      // written onto that same object.
      if (
        this.idColumnAutoGenerated &&
        (docMetadata[this.idColumnAutoName] === undefined ||
          docMetadata[this.idColumnAutoName] === "")
      ) {
        docMetadata[this.idColumnAutoName] = uuidv4();
      }

      // Construct the row
      const row = [];

      // Add values for each metadata column. Only missing (null/undefined)
      // values become null, so 0, false and "" are kept as provided.
      this.metadataColumns.forEach((col) => {
        row.push(docMetadata[col.name] ?? null);
      });

      // Add the text content and vector
      row.push(document.pageContent);
      row.push(new Float32Array(vector));
      return row;
    });

    // Column order must match the order in which each row was filled above.
    const columns = [
      ...this.metadataColumns,
      { name: this.textColumnName, type: "" },
      { name: this.vectorColumnName, type: "" },
    ];
    return this.table.upsert(values, columns);
  }

  /**
   * Accessor for the underlying table wrapper.
   * @returns The CassandraTable instance created by the constructor.
   */
  getCassandraTable() {
    return this.table;
  }

  /**
   * Method to add documents to the Cassandra database.
   * @param documents The documents to add.
   * @returns Promise that resolves when the documents have been added.
   */
  async addDocuments(documents) {
    return this.addVectors(
      await this.embeddings.embedDocuments(documents.map((d) => d.pageContent)),
      documents
    );
  }

  /**
   * Helper method to search for vectors that are similar to a given query vector.
   * @param query The query vector.
   * @param k The number of similar Documents to return.
   * @param filter Optional filter to be applied as a WHERE clause.
   * @param includeEmbedding Whether to include the embedding vectors in the results.
   * @returns Promise that resolves with an array of tuples, each containing a
   *   Document and a score. The array is empty when the query yields no result set.
   */
  async search(query, k, filter, includeEmbedding) {
    const vectorAsFloat32Array = new Float32Array(query);
    const similarityColumnWithBinds = {
      ...this.similarityColumn,
      binds: [vectorAsFloat32Array],
    };
    const queryCols = [
      ...this.metadataColumns,
      this.textColumn,
      similarityColumnWithBinds,
    ];
    if (includeEmbedding) {
      queryCols.push(this.vectorColumn);
    }
    const orderBy = {
      name: this.vectorColumnName,
      operator: "ANN OF",
      value: [vectorAsFloat32Array],
    };
    const queryResultSet = await this.table.select(
      queryCols,
      filter,
      [orderBy],
      k
    );

    // Always hand back an array so callers can iterate without a null check.
    const rows = queryResultSet?.rows ?? [];
    return rows.map((row) => {
      const textContent = row[this.textColumnName];
      // Everything except the text and the score becomes document metadata;
      // columns whose value is null are left out.
      const sanitizedRow = { ...row };
      delete sanitizedRow[this.textColumnName];
      delete sanitizedRow.similarity_score;
      Object.keys(sanitizedRow).forEach((key) => {
        if (sanitizedRow[key] === null) {
          delete sanitizedRow[key];
        }
      });
      return [
        new Document({ pageContent: textContent, metadata: sanitizedRow }),
        row.similarity_score,
      ];
    });
  }

  /**
   * Method to search for vectors that are similar to a given query vector.
   * @param query The query vector.
   * @param k The number of similar Documents to return.
   * @param filter Optional filter to be applied as a WHERE clause.
   * @returns Promise that resolves with an array of tuples, each containing a Document and a score.
   */
  async similaritySearchVectorWithScore(query, k, filter) {
    return this.search(query, k, filter, false);
  }

  /**
   * Method to search for vectors that are similar to a given query vector, but with
   * the results selected using the maximal marginal relevance.
   * @param query The query string.
   * @param options.k The number of similar Documents to return.
   * @param options.fetchK=4*k The number of records to fetch before passing to the MMR algorithm.
   * @param options.lambda=0.5 The degree of diversity among the results between 0 (maximum diversity) and 1 (minimum diversity).
   * @param options.filter Optional filter to be applied as a WHERE clause.
   * @returns List of documents selected by maximal marginal relevance.
   */
  async maxMarginalRelevanceSearch(query, options) {
    const { k, fetchK = 4 * k, lambda = 0.5, filter } = options;
    const queryEmbedding = await this.embeddings.embedQuery(query);
    const queryResults = await this.search(
      queryEmbedding,
      fetchK,
      filter,
      true
    );
    if (queryResults.length === 0) {
      return [];
    }
    const embeddingList = queryResults.map(
      (doc) => doc[0].metadata[this.embeddingColumnAlias]
    );
    const mmrIndexes = maximalMarginalRelevance(
      queryEmbedding,
      embeddingList,
      lambda,
      k
    );
    return mmrIndexes.map((idx) => {
      const doc = queryResults[idx][0];
      // The embedding was only fetched for the MMR computation; strip it
      // from the metadata handed back to the caller.
      delete doc.metadata[this.embeddingColumnAlias];
      return doc;
    });
  }

  /**
   * Static method to create an instance of CassandraStore from texts.
   * @param texts The texts to use.
   * @param metadatas The metadata associated with the texts: either one entry
   *   per text, or a single object shared by all texts.
   * @param embeddings The embeddings to use.
   * @param args The arguments for the CassandraStore.
   * @returns Promise that resolves with a new instance of CassandraStore.
   */
  static async fromTexts(texts, metadatas, embeddings, args) {
    const docs = [];
    for (let index = 0; index < texts.length; index += 1) {
      const metadata = Array.isArray(metadatas) ? metadatas[index] : metadatas;
      const doc = new Document({
        pageContent: texts[index],
        metadata,
      });
      docs.push(doc);
    }
    return CassandraStore.fromDocuments(docs, embeddings, args);
  }

  /**
   * Static method to create an instance of CassandraStore from documents.
   * @param docs The documents to use.
   * @param embeddings The embeddings to use.
   * @param args The arguments for the CassandraStore.
   * @returns Promise that resolves with a new instance of CassandraStore.
   */
  static async fromDocuments(docs, embeddings, args) {
    const instance = new this(embeddings, args);
    await instance.addDocuments(docs);
    return instance;
  }

  /**
   * Static method to create an instance of CassandraStore from an existing
   * index.
   * @param embeddings The embeddings to use.
   * @param args The arguments for the CassandraStore.
   * @returns Promise that resolves with a new instance of CassandraStore.
   */
  static async fromExistingIndex(embeddings, args) {
    const instance = new this(embeddings, args);
    return instance;
  }
}