agsamantha/node_modules/@langchain/community/dist/vectorstores/vectara.js
2024-10-02 15:15:21 -05:00

437 lines
17 KiB
JavaScript

import * as uuid from "uuid";
import { Document } from "@langchain/core/documents";
import { getEnvironmentVariable } from "@langchain/core/utils/env";
import { VectorStore } from "@langchain/core/vectorstores";
import { FakeEmbeddings } from "@langchain/core/utils/testing";
export const DEFAULT_FILTER = {
start: 0,
filter: "",
lambda: 0.0,
contextConfig: {
sentencesBefore: 2,
sentencesAfter: 2,
startTag: "<b>",
endTag: "</b>",
},
mmrConfig: {
enabled: false,
mmrTopK: 0,
diversityBias: 0.0,
},
};
/**
* Class for interacting with the Vectara API. Extends the VectorStore
* class.
*/
export class VectaraStore extends VectorStore {
get lc_secrets() {
return {
apiKey: "VECTARA_API_KEY",
corpusId: "VECTARA_CORPUS_ID",
customerId: "VECTARA_CUSTOMER_ID",
};
}
get lc_aliases() {
return {
apiKey: "vectara_api_key",
corpusId: "vectara_corpus_id",
customerId: "vectara_customer_id",
};
}
_vectorstoreType() {
return "vectara";
}
constructor(args) {
// Vectara doesn't need embeddings, but we need to pass something to the parent constructor
// The embeddings are abstracted out from the user in Vectara.
super(new FakeEmbeddings(), args);
Object.defineProperty(this, "apiEndpoint", {
enumerable: true,
configurable: true,
writable: true,
value: "api.vectara.io"
});
Object.defineProperty(this, "apiKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "corpusId", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "customerId", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "verbose", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "source", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "vectaraApiTimeoutSeconds", {
enumerable: true,
configurable: true,
writable: true,
value: 60
});
const apiKey = args.apiKey ?? getEnvironmentVariable("VECTARA_API_KEY");
if (!apiKey) {
throw new Error("Vectara api key is not provided.");
}
this.apiKey = apiKey;
this.source = args.source ?? "langchainjs";
const corpusId = args.corpusId ??
getEnvironmentVariable("VECTARA_CORPUS_ID")
?.split(",")
.map((id) => {
const num = Number(id);
if (Number.isNaN(num))
throw new Error("Vectara corpus id is not a number.");
return num;
});
if (!corpusId) {
throw new Error("Vectara corpus id is not provided.");
}
if (typeof corpusId === "number") {
this.corpusId = [corpusId];
}
else {
if (corpusId.length === 0)
throw new Error("Vectara corpus id is not provided.");
this.corpusId = corpusId;
}
const customerId = args.customerId ?? getEnvironmentVariable("VECTARA_CUSTOMER_ID");
if (!customerId) {
throw new Error("Vectara customer id is not provided.");
}
this.customerId = customerId;
this.verbose = args.verbose ?? false;
}
/**
* Returns a header for Vectara API calls.
* @returns A Promise that resolves to a VectaraCallHeader object.
*/
async getJsonHeader() {
return {
headers: {
"x-api-key": this.apiKey,
"Content-Type": "application/json",
"customer-id": this.customerId.toString(),
"X-Source": this.source,
},
};
}
/**
* Throws an error, as this method is not implemented. Use addDocuments
* instead.
* @param _vectors Not used.
* @param _documents Not used.
* @returns Does not return a value.
*/
async addVectors(_vectors, _documents) {
throw new Error("Method not implemented. Please call addDocuments instead.");
}
/**
* Method to delete data from the Vectara corpus.
* @param params an array of document IDs to be deleted
* @returns Promise that resolves when the deletion is complete.
*/
async deleteDocuments(ids) {
if (ids && ids.length > 0) {
const headers = await this.getJsonHeader();
for (const id of ids) {
const data = {
customer_id: this.customerId,
corpus_id: this.corpusId[0],
document_id: id,
};
try {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), this.vectaraApiTimeoutSeconds * 1000);
const response = await fetch(`https://${this.apiEndpoint}/v1/delete-doc`, {
method: "POST",
headers: headers?.headers,
body: JSON.stringify(data),
signal: controller.signal,
});
clearTimeout(timeout);
if (response.status !== 200) {
throw new Error(`Vectara API returned status code ${response.status} when deleting document ${id}`);
}
}
catch (e) {
const error = new Error(`Error ${e.message}`);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
error.code = 500;
throw error;
}
}
}
else {
throw new Error(`no "ids" specified for deletion`);
}
}
/**
* Adds documents to the Vectara store.
* @param documents An array of Document objects to add to the Vectara store.
* @returns A Promise that resolves to an array of document IDs indexed in Vectara.
*/
async addDocuments(documents) {
if (this.corpusId.length > 1)
throw new Error("addDocuments does not support multiple corpus ids");
const headers = await this.getJsonHeader();
const doc_ids = [];
let countAdded = 0;
for (const document of documents) {
const doc_id = document.metadata?.document_id ?? uuid.v4();
const data = {
customer_id: this.customerId,
corpus_id: this.corpusId[0],
document: {
document_id: doc_id,
title: document.metadata?.title ?? "",
metadata_json: JSON.stringify(document.metadata ?? {}),
section: [
{
text: document.pageContent,
},
],
},
};
try {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), this.vectaraApiTimeoutSeconds * 1000);
const response = await fetch(`https://${this.apiEndpoint}/v1/index`, {
method: "POST",
headers: headers?.headers,
body: JSON.stringify(data),
signal: controller.signal,
});
clearTimeout(timeout);
const result = await response.json();
if (result.status?.code !== "OK" &&
result.status?.code !== "ALREADY_EXISTS") {
const error = new Error(`Vectara API returned status code ${result.status?.code}: ${JSON.stringify(result.message)}`);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
error.code = 500;
throw error;
}
else {
countAdded += 1;
doc_ids.push(doc_id);
}
}
catch (e) {
const error = new Error(`Error ${e.message} while adding document`);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
error.code = 500;
throw error;
}
}
if (this.verbose) {
console.log(`Added ${countAdded} documents to Vectara`);
}
return doc_ids;
}
/**
* Vectara provides a way to add documents directly via their API. This API handles
* pre-processing and chunking internally in an optimal manner. This method is a wrapper
* to utilize that API within LangChain.
*
* @param files An array of VectaraFile objects representing the files and their respective file names to be uploaded to Vectara.
* @param metadata Optional. An array of metadata objects corresponding to each file in the `filePaths` array.
* @returns A Promise that resolves to the number of successfully uploaded files.
*/
async addFiles(files, metadatas = undefined) {
if (this.corpusId.length > 1)
throw new Error("addFiles does not support multiple corpus ids");
const doc_ids = [];
for (const [index, file] of files.entries()) {
const md = metadatas ? metadatas[index] : {};
const data = new FormData();
data.append("file", file.blob, file.fileName);
data.append("doc-metadata", JSON.stringify(md));
const response = await fetch(`https://api.vectara.io/v1/upload?c=${this.customerId}&o=${this.corpusId[0]}&d=true`, {
method: "POST",
headers: {
"x-api-key": this.apiKey,
"X-Source": this.source,
},
body: data,
});
const { status } = response;
if (status === 409) {
throw new Error(`File at index ${index} already exists in Vectara`);
}
else if (status !== 200) {
throw new Error(`Vectara API returned status code ${status}`);
}
else {
const result = await response.json();
const doc_id = result.document.documentId;
doc_ids.push(doc_id);
}
}
if (this.verbose) {
console.log(`Uploaded ${files.length} files to Vectara`);
}
return doc_ids;
}
/**
* Performs a Vectara API call based on the arguments provided.
* @param query The query string for the similarity search.
* @param k Optional. The number of results to return. Default is 10.
* @param filter Optional. A VectaraFilter object to refine the search results.
* @returns A Promise that resolves to an array of tuples, each containing a Document and its score.
*/
async vectaraQuery(query, k, vectaraFilterObject, summary = {
enabled: false,
maxSummarizedResults: 0,
responseLang: "eng",
}) {
const headers = await this.getJsonHeader();
const { start, filter, lambda, contextConfig, mmrConfig } = vectaraFilterObject;
const corpusKeys = this.corpusId.map((corpusId) => ({
customerId: this.customerId,
corpusId,
metadataFilter: filter,
lexicalInterpolationConfig: { lambda },
}));
const data = {
query: [
{
query,
start,
numResults: mmrConfig?.enabled ? mmrConfig.mmrTopK : k,
contextConfig,
...(mmrConfig?.enabled
? {
rerankingConfig: {
rerankerId: 272725718,
mmrConfig: { diversityBias: mmrConfig.diversityBias },
},
}
: {}),
corpusKey: corpusKeys,
...(summary?.enabled ? { summary: [summary] } : {}),
},
],
};
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), this.vectaraApiTimeoutSeconds * 1000);
const response = await fetch(`https://${this.apiEndpoint}/v1/query`, {
method: "POST",
headers: headers?.headers,
body: JSON.stringify(data),
signal: controller.signal,
});
clearTimeout(timeout);
if (response.status !== 200) {
throw new Error(`Vectara API returned status code ${response.status}`);
}
const result = await response.json();
const responses = result.responseSet[0].response;
const documents = result.responseSet[0].document;
for (let i = 0; i < responses.length; i += 1) {
const responseMetadata = responses[i].metadata;
const documentMetadata = documents[responses[i].documentIndex].metadata;
const combinedMetadata = {};
responseMetadata.forEach((item) => {
combinedMetadata[item.name] = item.value;
});
documentMetadata.forEach((item) => {
combinedMetadata[item.name] = item.value;
});
responses[i].metadata = combinedMetadata;
}
const res = {
documents: responses.map((response) => new Document({
pageContent: response.text,
metadata: response.metadata,
})),
scores: responses.map((response) => response.score),
summary: result.responseSet[0].summary[0]?.text ?? "",
};
return res;
}
/**
* Performs a similarity search and returns documents along with their
* scores.
* @param query The query string for the similarity search.
* @param k Optional. The number of results to return. Default is 10.
* @param filter Optional. A VectaraFilter object to refine the search results.
* @returns A Promise that resolves to an array of tuples, each containing a Document and its score.
*/
async similaritySearchWithScore(query, k, filter) {
const summaryResult = await this.vectaraQuery(query, k || 10, filter || DEFAULT_FILTER);
const res = summaryResult.documents.map((document, index) => [document, summaryResult.scores[index]]);
return res;
}
/**
* Performs a similarity search and returns documents.
* @param query The query string for the similarity search.
* @param k Optional. The number of results to return. Default is 10.
* @param filter Optional. A VectaraFilter object to refine the search results.
* @returns A Promise that resolves to an array of Document objects.
*/
async similaritySearch(query, k, filter) {
const documents = await this.similaritySearchWithScore(query, k || 10, filter || DEFAULT_FILTER);
return documents.map((result) => result[0]);
}
/**
* Throws an error, as this method is not implemented. Use
* similaritySearch or similaritySearchWithScore instead.
* @param _query Not used.
* @param _k Not used.
* @param _filter Not used.
* @returns Does not return a value.
*/
async similaritySearchVectorWithScore(_query, _k, _filter) {
throw new Error("Method not implemented. Please call similaritySearch or similaritySearchWithScore instead.");
}
/**
* Creates a VectaraStore instance from texts.
* @param texts An array of text strings.
* @param metadatas Metadata for the texts. Can be a single object or an array of objects.
* @param _embeddings Not used.
* @param args A VectaraLibArgs object for initializing the VectaraStore instance.
* @returns A Promise that resolves to a VectaraStore instance.
*/
static fromTexts(texts, metadatas, _embeddings, args) {
const docs = [];
for (let i = 0; i < texts.length; i += 1) {
const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
const newDoc = new Document({
pageContent: texts[i],
metadata,
});
docs.push(newDoc);
}
return VectaraStore.fromDocuments(docs, new FakeEmbeddings(), args);
}
/**
* Creates a VectaraStore instance from documents.
* @param docs An array of Document objects.
* @param _embeddings Not used.
* @param args A VectaraLibArgs object for initializing the VectaraStore instance.
* @returns A Promise that resolves to a VectaraStore instance.
*/
static async fromDocuments(docs, _embeddings, args) {
const instance = new this(args);
await instance.addDocuments(docs);
return instance;
}
}