agsamantha/node_modules/@langchain/community/dist/vectorstores/cassandra.js
2024-10-02 15:15:21 -05:00

353 lines
15 KiB
JavaScript

/* eslint-disable prefer-template */
import { v4 as uuidv4 } from "uuid";
import { VectorStore, } from "@langchain/core/vectorstores";
import { Document } from "@langchain/core/documents";
import { maximalMarginalRelevance } from "@langchain/core/utils/math";
import { CassandraTable, } from "../utils/cassandra.js";
/**
* Class for interacting with the Cassandra database. It extends the
* VectorStore class and provides methods for adding vectors and
* documents, searching for similar vectors, and creating instances from
* texts or documents.
*/
export class CassandraStore extends VectorStore {
    /**
     * Identifies the kind of vector store implemented by this class.
     * @returns The string "cassandra".
     */
    _vectorstoreType() {
        return "cassandra";
    }
    /**
     * Validates the store arguments and fills in defaults for the primary key,
     * the non-key columns, the metadata columns and the indices.
     * The caller's `args` object and the arrays it holds are not modified.
     * @param args Store arguments; `table` and `dimensions` are required.
     * @returns A copy of `args` whose `vectorType`, `primaryKey`, `nonKeyColumns`,
     * `metadataColumns` and `indices` entries hold the normalized values.
     * @throws Error if `table` or `dimensions` is missing.
     */
    _cleanArgs(args) {
        const { table, dimensions, primaryKey, nonKeyColumns, indices, metadataColumns, vectorType = "cosine", } = args;
        if (!table || !dimensions) {
            throw new Error("Missing required arguments");
        }
        // Utility function to ensure the argument is treated as an array
        const _toArray = (value) => (Array.isArray(value) ? value : [value]);
        // Work on a copy: default indices are pushed below and must not leak into
        // the array owned by the caller (which may be reused for another store).
        const indicesArg = [...(indices || [])];
        // Use the primary key if provided, else default to a single auto-generated UUID column
        let primaryKeyArg;
        if (primaryKey) {
            primaryKeyArg = _toArray(primaryKey);
        }
        else {
            primaryKeyArg = [
                { name: this.idColumnAutoName, type: "uuid", partition: true },
            ];
        }
        // The combined nonKeyColumns and metadataColumns, de-duped by name (first occurrence wins)
        const combinedColumns = [
            ..._toArray(nonKeyColumns || []),
            ..._toArray(metadataColumns || []),
        ];
        const deduplicatedColumns = combinedColumns.filter((col, index, self) => self.findIndex((c) => c.name === col.name) === index);
        const nonKeyColumnsArg = [...deduplicatedColumns];
        // If no metadata columns are specified, add a default metadata column consistent with Langchain Python
        if (nonKeyColumnsArg.length === 0) {
            nonKeyColumnsArg.push({
                name: this.metadataColumnDefaultName,
                type: "map<text, text>",
            });
            indicesArg.push({
                name: `idx_${this.metadataColumnDefaultName}_${table}_keys`,
                value: `(keys(${this.metadataColumnDefaultName}))`,
            });
            indicesArg.push({
                name: `idx_${this.metadataColumnDefaultName}_${table}_entries`,
                value: `(entries(${this.metadataColumnDefaultName}))`,
            });
        }
        // Appends a column only when no column with the same name is already present
        const addDefaultNonKeyColumnIfNeeded = (defaultColumn) => {
            const column = nonKeyColumnsArg.find((col) => col.name === defaultColumn.name);
            if (!column) {
                nonKeyColumnsArg.push(defaultColumn);
            }
        };
        addDefaultNonKeyColumnIfNeeded({ name: this.textColumnName, type: "text" });
        addDefaultNonKeyColumnIfNeeded({
            name: this.vectorColumnName,
            type: `VECTOR<FLOAT,${dimensions}>`,
            alias: this.embeddingColumnAlias,
        });
        // If no index is specified for the vector column, add a default index.
        // The pattern matches an index value of the form "(vector)", ignoring case
        // and whitespace inside the parentheses.
        const vectorIndexPattern = new RegExp(`\\(\\s*${this.vectorColumnName.toLowerCase()}\\s*\\)`);
        if (!indicesArg.some((index) => vectorIndexPattern.test(index.value.toLowerCase()))) {
            indicesArg.push({
                name: `idx_${this.vectorColumnName}_${table}`,
                value: `(${this.vectorColumnName})`,
                options: `{'similarity_function': '${vectorType.toLowerCase()}'}`,
            });
        }
        // Metadata the user will see excludes vector column and text column
        const metadataColumnsArg = [...primaryKeyArg, ...nonKeyColumnsArg].filter((column) => column.name !== this.vectorColumnName &&
            column.name !== this.textColumnName);
        return {
            ...args,
            vectorType,
            primaryKey: primaryKeyArg,
            nonKeyColumns: nonKeyColumnsArg,
            metadataColumns: metadataColumnsArg,
            indices: indicesArg,
        };
    }
    /**
     * Looks up a column definition by name.
     * @param columns A single column or an array of columns.
     * @param columnName The name to look for.
     * @returns The first column whose `name` equals `columnName`.
     * @throws Error if no column has that name.
     */
    _getColumnByName(columns, columnName) {
        const columnsArray = Array.isArray(columns) ? columns : [columns];
        const column = columnsArray.find((col) => col.name === columnName);
        if (!column) {
            throw new Error(`Column ${columnName} not found`);
        }
        return column;
    }
    /**
     * Creates a store, normalizing `args` through `_cleanArgs` and building the
     * underlying `CassandraTable` from the normalized arguments.
     * @param embeddings The embeddings to use.
     * @param args The arguments for the CassandraStore.
     * @throws Error if required arguments are missing or the vector/text column cannot be resolved.
     */
    constructor(embeddings, args) {
        super(embeddings, args);
        Object.defineProperty(this, "table", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "idColumnAutoName", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "id"
        });
        Object.defineProperty(this, "idColumnAutoGenerated", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "vectorColumnName", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "vector"
        });
        Object.defineProperty(this, "vectorColumn", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "textColumnName", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "text"
        });
        Object.defineProperty(this, "textColumn", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "metadataColumnDefaultName", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "metadata"
        });
        Object.defineProperty(this, "metadataColumns", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "similarityColumn", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "embeddingColumnAlias", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "embedding"
        });
        const cleanedArgs = this._cleanArgs(args);
        // This check here to help the compiler understand that nonKeyColumns will always
        // have values after the _cleanArgs call. It is the cleanest way to handle the fact
        // that the compiler is not able to make this determination, no matter how hard we try!
        if (!cleanedArgs.nonKeyColumns || cleanedArgs.nonKeyColumns.length === 0) {
            throw new Error("No non-key columns provided");
        }
        this.vectorColumn = this._getColumnByName(cleanedArgs.nonKeyColumns, this.vectorColumnName);
        this.textColumn = this._getColumnByName(cleanedArgs.nonKeyColumns, this.textColumnName);
        // Pseudo-column selecting the similarity score; the "?" is bound to the query vector in search()
        this.similarityColumn = {
            name: `similarity_${cleanedArgs.vectorType}(${this.vectorColumnName},?)`,
            alias: "similarity_score",
            type: "",
        };
        // IDs are generated by this class only when the caller did not define a primary key
        this.idColumnAutoGenerated = !args.primaryKey;
        this.metadataColumns = cleanedArgs.metadataColumns;
        this.table = new CassandraTable(cleanedArgs);
    }
    /**
     * Method to save vectors to the Cassandra database.
     * @param vectors Vectors to save.
     * @param documents The documents associated with the vectors.
     * @returns Promise that resolves when the vectors have been added.
     */
    async addVectors(vectors, documents) {
        if (vectors.length === 0) {
            return;
        }
        // Prepare the values for upsert
        const values = vectors.map((vector, index) => {
            const document = documents[index];
            const docMetadata = document.metadata || {};
            // If idColumnAutoGenerated is true and ID is not provided, generate a UUID.
            // When the document has a metadata object, the generated ID is written into it.
            if (this.idColumnAutoGenerated &&
                (docMetadata[this.idColumnAutoName] === undefined ||
                    docMetadata[this.idColumnAutoName] === "")) {
                docMetadata[this.idColumnAutoName] = uuidv4();
            }
            // Construct the row
            const row = [];
            // Add values for each metadata column. Only missing values (undefined/null)
            // become null; legitimate falsy values such as 0 or false are stored as-is.
            this.metadataColumns.forEach((col) => {
                row.push(docMetadata[col.name] ?? null);
            });
            // Add the text content and vector
            row.push(document.pageContent);
            row.push(new Float32Array(vector));
            return row;
        });
        // Column order must match the order in which each row was filled above
        const columns = [
            ...this.metadataColumns,
            { name: this.textColumnName, type: "" },
            { name: this.vectorColumnName, type: "" },
        ];
        return this.table.upsert(values, columns);
    }
    /**
     * Gives access to the underlying table object.
     * @returns The CassandraTable instance created by the constructor.
     */
    getCassandraTable() {
        return this.table;
    }
    /**
     * Method to add documents to the Cassandra database.
     * @param documents The documents to add.
     * @returns Promise that resolves when the documents have been added.
     */
    async addDocuments(documents) {
        return this.addVectors(await this.embeddings.embedDocuments(documents.map((d) => d.pageContent)), documents);
    }
    /**
     * Helper method to search for vectors that are similar to a given query vector.
     * @param query The query vector.
     * @param k The number of similar Documents to return.
     * @param filter Optional filter to be applied as a WHERE clause.
     * @param includeEmbedding Whether to include the embedding vectors in the results.
     * @returns Promise that resolves with an array of tuples, each containing a Document and a score.
     * The array is empty when the query yields no result set or no rows.
     */
    async search(query, k, filter, includeEmbedding) {
        const vectorAsFloat32Array = new Float32Array(query);
        const similarityColumnWithBinds = {
            ...this.similarityColumn,
            binds: [vectorAsFloat32Array],
        };
        const queryCols = [
            ...this.metadataColumns,
            this.textColumn,
            similarityColumnWithBinds,
        ];
        if (includeEmbedding) {
            queryCols.push(this.vectorColumn);
        }
        const orderBy = {
            name: this.vectorColumnName,
            operator: "ANN OF",
            value: [vectorAsFloat32Array],
        };
        const queryResultSet = await this.table.select(queryCols, filter, [orderBy], k);
        // Always hand back an array so callers can iterate without a null check
        const rows = queryResultSet?.rows ?? [];
        return rows.map((row) => {
            const textContent = row[this.textColumnName];
            // Metadata is the row minus the text, the score and any null-valued columns
            const sanitizedRow = { ...row };
            delete sanitizedRow[this.textColumnName];
            delete sanitizedRow.similarity_score;
            Object.keys(sanitizedRow).forEach((key) => {
                if (sanitizedRow[key] === null) {
                    delete sanitizedRow[key];
                }
            });
            return [
                new Document({ pageContent: textContent, metadata: sanitizedRow }),
                row.similarity_score,
            ];
        });
    }
    /**
     * Method to search for vectors that are similar to a given query vector.
     * @param query The query vector.
     * @param k The number of similar Documents to return.
     * @param filter Optional filter to be applied as a WHERE clause.
     * @returns Promise that resolves with an array of tuples, each containing a Document and a score.
     */
    async similaritySearchVectorWithScore(query, k, filter) {
        return this.search(query, k, filter, false);
    }
    /**
     * Method to search for vectors that are similar to a given query vector, but with
     * the results selected using the maximal marginal relevance.
     * @param query The query string.
     * @param options.k The number of similar Documents to return.
     * @param options.fetchK=4*k The number of records to fetch before passing to the MMR algorithm.
     * @param options.lambda=0.5 The degree of diversity among the results between 0 (maximum diversity) and 1 (minimum diversity).
     * @param options.filter Optional filter to be applied as a WHERE clause.
     * @returns List of documents selected by maximal marginal relevance.
     */
    async maxMarginalRelevanceSearch(query, options) {
        const { k, fetchK = 4 * k, lambda = 0.5, filter } = options;
        const queryEmbedding = await this.embeddings.embedQuery(query);
        const queryResults = await this.search(queryEmbedding, fetchK, filter, true);
        // The embedding is read from each document's metadata under the embedding alias
        const embeddingList = queryResults.map((doc) => doc[0].metadata[this.embeddingColumnAlias]);
        const mmrIndexes = maximalMarginalRelevance(queryEmbedding, embeddingList, lambda, k);
        return mmrIndexes.map((idx) => {
            const doc = queryResults[idx][0];
            // The embedding was only fetched for the MMR computation; strip it from the returned metadata
            delete doc.metadata[this.embeddingColumnAlias];
            return doc;
        });
    }
    /**
     * Static method to create an instance of CassandraStore from texts.
     * @param texts The texts to use.
     * @param metadatas The metadata associated with the texts: either an array with one
     * entry per text, or a single object shared by every text.
     * @param embeddings The embeddings to use.
     * @param args The arguments for the CassandraStore.
     * @returns Promise that resolves with a new instance of CassandraStore.
     */
    static async fromTexts(texts, metadatas, embeddings, args) {
        const docs = [];
        for (let index = 0; index < texts.length; index += 1) {
            const metadata = Array.isArray(metadatas) ? metadatas[index] : metadatas;
            const doc = new Document({
                pageContent: texts[index],
                metadata,
            });
            docs.push(doc);
        }
        return CassandraStore.fromDocuments(docs, embeddings, args);
    }
    /**
     * Static method to create an instance of CassandraStore from documents.
     * @param docs The documents to use.
     * @param embeddings The embeddings to use.
     * @param args The arguments for the CassandraStore.
     * @returns Promise that resolves with a new instance of CassandraStore.
     */
    static async fromDocuments(docs, embeddings, args) {
        const instance = new this(embeddings, args);
        await instance.addDocuments(docs);
        return instance;
    }
    /**
     * Static method to create an instance of CassandraStore from an existing
     * index.
     * @param embeddings The embeddings to use.
     * @param args The arguments for the CassandraStore.
     * @returns Promise that resolves with a new instance of CassandraStore.
     */
    static async fromExistingIndex(embeddings, args) {
        const instance = new this(embeddings, args);
        return instance;
    }
}