266 lines
11 KiB
JavaScript
266 lines
11 KiB
JavaScript
|
/* eslint-disable no-process-env */
|
||
|
import * as uuid from "uuid";
|
||
|
import flatten from "flat";
|
||
|
import { VectorStore, } from "@langchain/core/vectorstores";
|
||
|
import { Document } from "@langchain/core/documents";
|
||
|
import { AsyncCaller, } from "@langchain/core/utils/async_caller";
|
||
|
import { maximalMarginalRelevance } from "@langchain/core/utils/math";
|
||
|
import { chunkArray } from "@langchain/core/utils/chunk_array";
|
||
|
/**
|
||
|
* @deprecated Install and import from @langchain/pinecone instead.
|
||
|
* Class that extends the VectorStore class and provides methods to
|
||
|
* interact with the Pinecone vector database.
|
||
|
*/
|
||
|
export class PineconeStore extends VectorStore {
  /**
   * Identifies this vector store implementation.
   * @returns {string} The literal string "pinecone".
   */
  _vectorstoreType() {
    return "pinecone";
  }

  /**
   * @param embeddings Embeddings instance used to embed documents and queries.
   * @param args Store configuration. `namespace`, `pineconeIndex`, `textKey`
   * (defaults to "text") and `filter` are stored on the instance; every
   * remaining property is handed to the `AsyncCaller` constructor.
   */
  constructor(embeddings, args) {
    super(embeddings, args);
    // Define each field as an own, enumerable, writable data property before
    // assigning it, in this exact order, so the instance shape is stable.
    for (const field of ["textKey", "namespace", "pineconeIndex", "filter", "caller"]) {
      Object.defineProperty(this, field, {
        enumerable: true,
        configurable: true,
        writable: true,
        value: undefined,
      });
    }
    this.embeddings = embeddings;
    const { namespace, pineconeIndex, textKey, filter, ...asyncCallerArgs } = args;
    this.namespace = namespace;
    this.pineconeIndex = pineconeIndex;
    this.textKey = textKey ?? "text";
    this.filter = filter;
    this.caller = new AsyncCaller(asyncCallerArgs);
  }

  /**
   * Method that adds documents to the Pinecone database. The page content of
   * every document is embedded and the work is delegated to `addVectors`.
   * @param documents Array of documents to add to the Pinecone database.
   * @param options Optional ids for the documents (an array of ids, or an
   * object with an `ids` property).
   * @returns Promise that resolves with the ids of the added documents.
   */
  async addDocuments(documents, options) {
    const texts = documents.map(({ pageContent }) => pageContent);
    const vectors = await this.embeddings.embedDocuments(texts);
    return this.addVectors(vectors, documents, options);
  }

  /**
   * Method that adds vectors to the Pinecone database.
   * @param vectors Array of vectors to add to the Pinecone database.
   * @param documents Array of documents associated with the vectors;
   * `documents[i]` supplies the metadata and page content for `vectors[i]`.
   * @param options Optional ids for the vectors (an array of ids, or an
   * object with an `ids` property). Random v4 UUIDs are generated when absent.
   * @returns Promise that resolves with the ids of the added vectors.
   */
  async addVectors(vectors, documents, options) {
    const ids = Array.isArray(options) ? options : options?.ids;
    const documentIds = ids ?? documents.map(() => uuid.v4());

    const pineconeVectors = vectors.map((values, idx) => {
      const { metadata: rawMetadata, pageContent } = documents[idx];
      // Pinecone doesn't support nested objects, so we flatten them.
      // Work on a shallow copy so the document's own metadata is untouched.
      const nested = { ...rawMetadata };
      // Preserve string arrays, which are allowed, by keeping them out of
      // the flattening step.
      const stringArrays = {};
      for (const [key, value] of Object.entries(nested)) {
        if (Array.isArray(value) && value.every((el) => typeof el === "string")) {
          stringArrays[key] = value;
          delete nested[key];
        }
      }
      const metadata = {
        ...flatten(nested),
        ...stringArrays,
        [this.textKey]: pageContent,
      };
      // Pinecone doesn't support null values, so we remove them. Values that
      // are objects without any keys (this includes empty arrays) are
      // removed as well.
      for (const [key, value] of Object.entries(metadata)) {
        const isEmptyObject =
          value != null &&
          typeof value === "object" &&
          Object.keys(value).length === 0;
        if (value == null || isEmptyObject) {
          delete metadata[key];
        }
      }
      return {
        id: documentIds[idx],
        metadata,
        values,
      };
    });

    const namespace = this.pineconeIndex.namespace(this.namespace ?? "");
    // Pinecone recommends a limit of 100 vectors per upsert request
    const chunkSize = 100;
    const chunkedVectors = chunkArray(pineconeVectors, chunkSize);
    // Every chunk is scheduled through the AsyncCaller; all upserts are
    // awaited together before the ids are returned.
    const batchRequests = chunkedVectors.map((chunk) =>
      this.caller.call(async () => namespace.upsert(chunk))
    );
    await Promise.all(batchRequests);
    return documentIds;
  }

  /**
   * Method that deletes vectors from the Pinecone database. Exactly one
   * strategy is applied, checked in this order: `deleteAll`, `ids`, `filter`.
   * @param params Parameters for the delete operation.
   * @param params.deleteAll When truthy, everything in the namespace is deleted.
   * @param params.ids Ids to delete; sent in sequential batches of 1000.
   * @param params.filter Filter passed as-is to `deleteMany`.
   * @returns Promise that resolves when the delete operation is complete.
   * @throws {Error} When none of `deleteAll`, `ids` or `filter` is provided.
   */
  async delete(params) {
    const { deleteAll, ids, filter } = params;
    const namespace = this.pineconeIndex.namespace(this.namespace ?? "");

    if (deleteAll) {
      await namespace.deleteAll();
      return;
    }
    if (ids) {
      const batchSize = 1000;
      for (let start = 0; start < ids.length; start += batchSize) {
        await namespace.deleteMany(ids.slice(start, start + batchSize));
      }
      return;
    }
    if (filter) {
      await namespace.deleteMany(filter);
      return;
    }
    throw new Error("Either ids, filter, or deleteAll must be provided.");
  }

  /**
   * Runs a vector query against the configured namespace.
   * @param query Query vector.
   * @param k Number of top results to request (`topK`).
   * @param filter Optional per-call filter. May not be combined with a
   * filter configured on the instance.
   * @param options Extra query fields; spread last, so they override the
   * fields set here.
   * @returns Promise that resolves with the raw query response.
   * @throws {Error} When both `filter` and `this.filter` are set.
   */
  async _runPineconeQuery(query, k, filter, options) {
    if (filter && this.filter) {
      throw new Error("cannot provide both `filter` and `this.filter`");
    }
    const namespace = this.pineconeIndex.namespace(this.namespace ?? "");
    return namespace.query({
      includeMetadata: true,
      topK: k,
      vector: query,
      filter: filter ?? this.filter,
      ...options,
    });
  }

  /**
   * Method that performs a similarity search in the Pinecone database and
   * returns the results along with their scores.
   * @param query Query vector for the similarity search.
   * @param k Number of top results to return.
   * @param filter Optional filter to apply to the search.
   * @returns Promise that resolves with an array of `[document, score]`
   * pairs. Matches that carry no score at all are skipped.
   */
  async similaritySearchVectorWithScore(query, k, filter) {
    const results = await this._runPineconeQuery(query, k, filter);
    const scoredDocuments = [];
    for (const match of results.matches ?? []) {
      // The stored text lives under `textKey`; everything else is metadata.
      const { [this.textKey]: pageContent, ...metadata } = match.metadata ?? {};
      // Null check instead of truthiness: a score of exactly 0 is a
      // legitimate value and must not cause the match to be dropped.
      if (match.score != null) {
        scoredDocuments.push([new Document({ metadata, pageContent }), match.score]);
      }
    }
    return scoredDocuments;
  }

  /**
   * Return documents selected using the maximal marginal relevance.
   * Maximal marginal relevance optimizes for similarity to the query AND diversity
   * among selected documents.
   *
   * @param {string} query - Text to look up documents similar to.
   * @param {number} options.k - Number of documents to return.
   * @param {number} options.fetchK=20 - Number of documents to fetch before passing to the MMR algorithm.
   * @param {number} options.lambda - Number between 0 and 1 that determines the degree of diversity among the results,
   *                 where 0 corresponds to maximum diversity and 1 to minimum diversity.
   *                 Passed through unchanged; when omitted, the default of `maximalMarginalRelevance` applies.
   * @param {PineconeMetadata} options.filter - Optional filter to apply to the search.
   *
   * @returns {Promise<Document[]>} - List of documents selected by maximal marginal relevance.
   */
  async maxMarginalRelevanceSearch(query, options) {
    const queryEmbedding = await this.embeddings.embedQuery(query);
    // The stored vectors are requested as well because the MMR step needs
    // the embedding of every candidate.
    const results = await this._runPineconeQuery(
      queryEmbedding,
      options.fetchK ?? 20,
      options.filter,
      { includeValues: true }
    );
    const matches = results?.matches ?? [];
    const embeddingList = matches.map((match) => match.values);
    const mmrIndexes = maximalMarginalRelevance(
      queryEmbedding,
      embeddingList,
      options.lambda,
      options.k
    );

    const selectedDocuments = [];
    for (const idx of mmrIndexes) {
      const match = matches[idx];
      const { [this.textKey]: pageContent, ...metadata } = match.metadata ?? {};
      // Same rule as similaritySearchVectorWithScore: only matches without
      // any score are skipped; a score of 0 is kept.
      if (match.score != null) {
        selectedDocuments.push(new Document({ metadata, pageContent }));
      }
    }
    return selectedDocuments;
  }

  /**
   * Static method that creates a new instance of the PineconeStore class
   * from texts.
   * @param texts Array of texts to add to the Pinecone database.
   * @param metadatas Metadata associated with the texts: either one entry
   * per text (array) or a single object shared by all texts.
   * @param embeddings Embeddings to use for the texts.
   * @param dbConfig Configuration for the Pinecone database. Only
   * `pineconeIndex`, `textKey` and `namespace` are forwarded.
   * @returns Promise that resolves with a new instance of the PineconeStore class.
   */
  static async fromTexts(texts, metadatas, embeddings, dbConfig) {
    const docs = texts.map(
      (text, i) =>
        new Document({
          pageContent: text,
          metadata: Array.isArray(metadatas) ? metadatas[i] : metadatas,
        })
    );
    const { pineconeIndex, textKey, namespace } = dbConfig;
    return PineconeStore.fromDocuments(docs, embeddings, {
      pineconeIndex,
      textKey,
      namespace,
    });
  }

  /**
   * Static method that creates a new instance of the PineconeStore class
   * from documents.
   * @param docs Array of documents to add to the Pinecone database.
   * @param embeddings Embeddings to use for the documents.
   * @param dbConfig Configuration for the Pinecone database. It is copied,
   * not modified; `textKey` defaults to "text" in the copy.
   * @returns Promise that resolves with a new instance of the PineconeStore class.
   */
  static async fromDocuments(docs, embeddings, dbConfig) {
    // Shallow copy so the default textKey is not written back into the
    // caller's configuration object.
    const args = { ...dbConfig, textKey: dbConfig.textKey ?? "text" };
    const instance = new this(embeddings, args);
    await instance.addDocuments(docs);
    return instance;
  }

  /**
   * Static method that creates a new instance of the PineconeStore class
   * from an existing index. Nothing is written to the index.
   * @param embeddings Embeddings to use for the documents.
   * @param dbConfig Configuration for the Pinecone database.
   * @returns Promise that resolves with a new instance of the PineconeStore class.
   */
  static async fromExistingIndex(embeddings, dbConfig) {
    return new this(embeddings, dbConfig);
  }
}
|