// Listing metadata from the code viewer: 437 lines, 17 KiB, JavaScript.
import * as uuid from "uuid";
|
|
import { Document } from "@langchain/core/documents";
|
|
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
import { VectorStore } from "@langchain/core/vectorstores";
|
|
import { FakeEmbeddings } from "@langchain/core/utils/testing";
|
|
/**
 * Query options used by `similaritySearch` / `similaritySearchWithScore`
 * when the caller does not pass a filter object.
 *
 * The fields are consumed by `VectaraStore.vectaraQuery`:
 * - `start`         is sent as the query's `start` value.
 * - `filter`        is sent as each corpus key's `metadataFilter`.
 * - `lambda`        is sent as `lexicalInterpolationConfig.lambda`.
 * - `contextConfig` is forwarded unchanged as the query's `contextConfig`.
 * - `mmrConfig`     when `enabled` is true, `mmrTopK` replaces the requested
 *                   result count and `diversityBias` is sent in the
 *                   reranking configuration.
 */
export const DEFAULT_FILTER = {
  start: 0,
  filter: "",
  lambda: 0.0,
  contextConfig: {
    sentencesBefore: 2,
    sentencesAfter: 2,
    startTag: "<b>",
    endTag: "</b>",
  },
  mmrConfig: {
    enabled: false,
    mmrTopK: 0,
    diversityBias: 0.0,
  },
};
|
|
/**
 * Class for interacting with the Vectara API. Extends the VectorStore
 * class.
 *
 * No client-side embeddings are computed: the constructor hands a
 * `FakeEmbeddings` instance to the parent class, and the vector-based entry
 * points (`addVectors`, `similaritySearchVectorWithScore`) always throw.
 */
export class VectaraStore extends VectorStore {
  /**
   * Maps the credential fields to the environment variable names that the
   * constructor also reads as fallbacks.
   */
  get lc_secrets() {
    return {
      apiKey: "VECTARA_API_KEY",
      corpusId: "VECTARA_CORPUS_ID",
      customerId: "VECTARA_CUSTOMER_ID",
    };
  }

  /**
   * Maps the credential fields to their snake_case aliases.
   */
  get lc_aliases() {
    return {
      apiKey: "vectara_api_key",
      corpusId: "vectara_corpus_id",
      customerId: "vectara_customer_id",
    };
  }

  /**
   * @returns The identifier string for this vector store type.
   */
  _vectorstoreType() {
    return "vectara";
  }

  /**
   * @param args Configuration object. Reads `apiKey`, `corpusId` (a number
   *   or an array of numbers), `customerId`, `source` and `verbose`.
   *   `apiKey`, `corpusId` and `customerId` fall back to the
   *   `VECTARA_API_KEY`, `VECTARA_CORPUS_ID` (comma-separated numbers) and
   *   `VECTARA_CUSTOMER_ID` environment variables.
   * @throws Error if the API key, corpus id or customer id cannot be
   *   resolved, or if an environment-supplied corpus id is not numeric.
   */
  constructor(args) {
    // Vectara doesn't need embeddings, but we need to pass something to the parent constructor.
    // The embeddings are abstracted out from the user in Vectara.
    super(new FakeEmbeddings(), args);

    // Define every instance field up front, in a fixed order, as plain
    // enumerable/configurable/writable data properties.
    const fieldDefaults = {
      apiEndpoint: "api.vectara.io",
      apiKey: undefined,
      corpusId: undefined,
      customerId: undefined,
      verbose: undefined,
      source: undefined,
      vectaraApiTimeoutSeconds: 60,
    };
    for (const [name, value] of Object.entries(fieldDefaults)) {
      Object.defineProperty(this, name, {
        enumerable: true,
        configurable: true,
        writable: true,
        value,
      });
    }

    const apiKey = args.apiKey ?? getEnvironmentVariable("VECTARA_API_KEY");
    if (!apiKey) {
      throw new Error("Vectara api key is not provided.");
    }
    this.apiKey = apiKey;
    this.source = args.source ?? "langchainjs";

    let corpusId = args.corpusId;
    if (corpusId === undefined || corpusId === null) {
      const rawCorpusIds = getEnvironmentVariable("VECTARA_CORPUS_ID");
      // Blank entries (empty variable, trailing comma, stray spaces) are
      // dropped: Number("") is 0, which would otherwise be silently accepted
      // as a corpus id.
      corpusId = rawCorpusIds
        ?.split(",")
        .map((entry) => entry.trim())
        .filter((entry) => entry !== "")
        .map((entry) => {
          const num = Number(entry);
          if (Number.isNaN(num)) {
            throw new Error("Vectara corpus id is not a number.");
          }
          return num;
        });
    }
    if (!corpusId) {
      throw new Error("Vectara corpus id is not provided.");
    }
    if (typeof corpusId === "number") {
      // Internally the corpus id is always stored as an array.
      this.corpusId = [corpusId];
    } else {
      if (corpusId.length === 0) {
        throw new Error("Vectara corpus id is not provided.");
      }
      this.corpusId = corpusId;
    }

    const customerId =
      args.customerId ?? getEnvironmentVariable("VECTARA_CUSTOMER_ID");
    if (!customerId) {
      throw new Error("Vectara customer id is not provided.");
    }
    this.customerId = customerId;
    this.verbose = args.verbose ?? false;
  }

  /**
   * Returns a header for Vectara API calls.
   * @returns A Promise that resolves to an object whose `headers` property
   *   holds the API key, JSON content type, customer id and source headers.
   */
  async getJsonHeader() {
    return {
      headers: {
        "x-api-key": this.apiKey,
        "Content-Type": "application/json",
        "customer-id": this.customerId.toString(),
        "X-Source": this.source,
      },
    };
  }

  /**
   * Internal helper: runs `fetch` with an abort signal that fires after
   * `vectaraApiTimeoutSeconds`. The timer is cleared in a `finally` block so
   * it is released even when the request rejects.
   * @param url The URL to request.
   * @param init The fetch options; `signal` is supplied by this helper.
   * @returns A Promise that resolves to the fetch Response.
   */
  async _fetchWithTimeout(url, init) {
    const controller = new AbortController();
    const timer = setTimeout(
      () => controller.abort(),
      this.vectaraApiTimeoutSeconds * 1000
    );
    try {
      return await fetch(url, { ...init, signal: controller.signal });
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Throws an error, as this method is not implemented. Use addDocuments
   * instead.
   * @param _vectors Not used.
   * @param _documents Not used.
   * @returns Does not return a value.
   */
  async addVectors(_vectors, _documents) {
    throw new Error("Method not implemented. Please call addDocuments instead.");
  }

  /**
   * Method to delete data from the Vectara corpus. Documents are deleted one
   * request at a time, always from the first configured corpus id.
   * @param ids An array of document IDs to be deleted.
   * @returns Promise that resolves when the deletion is complete.
   * @throws Error if `ids` is missing or empty, or (with `code` 500 and the
   *   underlying error as `cause`) if any delete request fails.
   */
  async deleteDocuments(ids) {
    if (!(ids && ids.length > 0)) {
      throw new Error(`no "ids" specified for deletion`);
    }
    const jsonHeader = await this.getJsonHeader();
    for (const id of ids) {
      const payload = {
        customer_id: this.customerId,
        corpus_id: this.corpusId[0],
        document_id: id,
      };
      try {
        const response = await this._fetchWithTimeout(
          `https://${this.apiEndpoint}/v1/delete-doc`,
          {
            method: "POST",
            headers: jsonHeader?.headers,
            body: JSON.stringify(payload),
          }
        );
        if (response.status !== 200) {
          throw new Error(`Vectara API returned status code ${response.status} when deleting document ${id}`);
        }
      } catch (e) {
        const error = new Error(`Error ${e.message}`, { cause: e });
        error.code = 500;
        throw error;
      }
    }
  }

  /**
   * Adds documents to the Vectara store, one index request per document.
   * A document's id is taken from `metadata.document_id` when present,
   * otherwise a UUID v4 is generated.
   * @param documents An array of Document objects to add to the Vectara store.
   * @returns A Promise that resolves to an array of document IDs indexed in Vectara.
   * @throws Error if more than one corpus id is configured, or (with `code`
   *   500 and the underlying error as `cause`) if a request fails or the API
   *   reports a status other than "OK" or "ALREADY_EXISTS".
   */
  async addDocuments(documents) {
    if (this.corpusId.length > 1) {
      throw new Error("addDocuments does not support multiple corpus ids");
    }
    const jsonHeader = await this.getJsonHeader();
    const doc_ids = [];
    for (const document of documents) {
      const doc_id = document.metadata?.document_id ?? uuid.v4();
      const payload = {
        customer_id: this.customerId,
        corpus_id: this.corpusId[0],
        document: {
          document_id: doc_id,
          title: document.metadata?.title ?? "",
          metadata_json: JSON.stringify(document.metadata ?? {}),
          section: [
            {
              text: document.pageContent,
            },
          ],
        },
      };
      try {
        const response = await this._fetchWithTimeout(
          `https://${this.apiEndpoint}/v1/index`,
          {
            method: "POST",
            headers: jsonHeader?.headers,
            body: JSON.stringify(payload),
          }
        );
        const result = await response.json();
        const statusCode = result.status?.code;
        if (statusCode !== "OK" && statusCode !== "ALREADY_EXISTS") {
          // Thrown inside the try on purpose: the catch below wraps it like
          // any transport failure, so callers see one error shape.
          const apiError = new Error(`Vectara API returned status code ${statusCode}: ${JSON.stringify(result.message)}`);
          apiError.code = 500;
          throw apiError;
        }
        doc_ids.push(doc_id);
      } catch (e) {
        const error = new Error(`Error ${e.message} while adding document`, { cause: e });
        error.code = 500;
        throw error;
      }
    }
    if (this.verbose) {
      console.log(`Added ${doc_ids.length} documents to Vectara`);
    }
    return doc_ids;
  }

  /**
   * Vectara provides a way to add documents directly via their API. This API handles
   * pre-processing and chunking internally in an optimal manner. This method is a wrapper
   * to utilize that API within LangChain.
   *
   * No request timeout is applied to uploads.
   *
   * @param files An array of file objects; each must provide `blob` (the file contents) and `fileName`.
   * @param metadatas Optional. An array of metadata objects, matched to `files` by index. A missing entry is sent as `{}`.
   * @returns A Promise that resolves to an array with the document ID reported by Vectara for each uploaded file.
   * @throws Error if more than one corpus id is configured, if a file already
   *   exists (HTTP 409), or if any other non-200 status is returned.
   */
  async addFiles(files, metadatas = undefined) {
    if (this.corpusId.length > 1) {
      throw new Error("addFiles does not support multiple corpus ids");
    }
    const uploadUrl =
      `https://${this.apiEndpoint}/v1/upload` +
      `?c=${encodeURIComponent(this.customerId)}` +
      `&o=${encodeURIComponent(this.corpusId[0])}&d=true`;
    const doc_ids = [];
    for (const [index, file] of files.entries()) {
      // Fall back to {} when metadatas is absent or shorter than files;
      // JSON.stringify(undefined) would otherwise append "undefined".
      const md = metadatas?.[index] ?? {};
      const form = new FormData();
      form.append("file", file.blob, file.fileName);
      form.append("doc-metadata", JSON.stringify(md));
      const response = await fetch(uploadUrl, {
        method: "POST",
        // No Content-Type header here: fetch derives the multipart boundary
        // from the FormData body.
        headers: {
          "x-api-key": this.apiKey,
          "X-Source": this.source,
        },
        body: form,
      });
      const { status } = response;
      if (status === 409) {
        throw new Error(`File at index ${index} already exists in Vectara`);
      }
      if (status !== 200) {
        throw new Error(`Vectara API returned status code ${status}`);
      }
      const result = await response.json();
      doc_ids.push(result.document.documentId);
    }
    if (this.verbose) {
      console.log(`Uploaded ${files.length} files to Vectara`);
    }
    return doc_ids;
  }

  /**
   * Performs a Vectara API call based on the arguments provided.
   * @param query The query string for the similarity search.
   * @param k The number of results to request. Ignored when MMR is enabled in the filter object, in which case `mmrConfig.mmrTopK` is used.
   * @param vectaraFilterObject Filter object providing `start`, `filter`, `lambda`, `contextConfig` and `mmrConfig`. Falls back to DEFAULT_FILTER when null or undefined.
   * @param summary Optional. Summary request options; only sent when `summary.enabled` is true.
   * @returns A Promise that resolves to an object with `documents` (Document[]), `scores` (parallel to `documents`) and `summary` (summary text, or "" when none was returned).
   * @throws Error if the API returns a non-200 status or no response set.
   */
  async vectaraQuery(query, k, vectaraFilterObject, summary = {
    enabled: false,
    maxSummarizedResults: 0,
    responseLang: "eng",
  }) {
    const jsonHeader = await this.getJsonHeader();
    const { start, filter, lambda, contextConfig, mmrConfig } =
      vectaraFilterObject ?? DEFAULT_FILTER;
    const mmrEnabled = Boolean(mmrConfig?.enabled);
    // The same filter and interpolation settings are applied to every
    // configured corpus.
    const corpusKey = this.corpusId.map((corpusId) => ({
      customerId: this.customerId,
      corpusId,
      metadataFilter: filter,
      lexicalInterpolationConfig: { lambda },
    }));
    const payload = {
      query: [
        {
          query,
          start,
          numResults: mmrEnabled ? mmrConfig.mmrTopK : k,
          contextConfig,
          ...(mmrEnabled
            ? {
                rerankingConfig: {
                  // Reranker id sent together with the MMR settings.
                  rerankerId: 272725718,
                  mmrConfig: { diversityBias: mmrConfig.diversityBias },
                },
              }
            : {}),
          corpusKey,
          ...(summary?.enabled ? { summary: [summary] } : {}),
        },
      ],
    };
    const response = await this._fetchWithTimeout(
      `https://${this.apiEndpoint}/v1/query`,
      {
        method: "POST",
        headers: jsonHeader?.headers,
        body: JSON.stringify(payload),
      }
    );
    if (response.status !== 200) {
      throw new Error(`Vectara API returned status code ${response.status}`);
    }
    const result = await response.json();
    const responseSet = result?.responseSet?.[0];
    if (!responseSet) {
      throw new Error("Vectara API returned no response set for the query");
    }
    const responses = responseSet.response ?? [];
    const sourceDocuments = responseSet.document ?? [];
    const documents = responses.map((item) => {
      const metadata = {};
      // Result-level metadata is applied first and document-level metadata
      // second, so document-level values win when names collide.
      for (const { name, value } of item.metadata ?? []) {
        metadata[name] = value;
      }
      for (const { name, value } of sourceDocuments[item.documentIndex]?.metadata ?? []) {
        metadata[name] = value;
      }
      return new Document({ pageContent: item.text, metadata });
    });
    return {
      documents,
      scores: responses.map((item) => item.score),
      // `summary` may be absent or empty when no summary was requested.
      summary: responseSet.summary?.[0]?.text ?? "",
    };
  }

  /**
   * Performs a similarity search and returns documents along with their
   * scores.
   * @param query The query string for the similarity search.
   * @param k Optional. The number of results to return. Default is 10.
   * @param filter Optional. A VectaraFilter object to refine the search results. Defaults to DEFAULT_FILTER.
   * @returns A Promise that resolves to an array of tuples, each containing a Document and its score.
   */
  async similaritySearchWithScore(query, k, filter) {
    const { documents, scores } = await this.vectaraQuery(
      query,
      k || 10,
      filter || DEFAULT_FILTER
    );
    return documents.map((document, index) => [document, scores[index]]);
  }

  /**
   * Performs a similarity search and returns documents.
   * @param query The query string for the similarity search.
   * @param k Optional. The number of results to return. Default is 10.
   * @param filter Optional. A VectaraFilter object to refine the search results. Defaults to DEFAULT_FILTER.
   * @returns A Promise that resolves to an array of Document objects.
   */
  async similaritySearch(query, k, filter) {
    const scored = await this.similaritySearchWithScore(
      query,
      k || 10,
      filter || DEFAULT_FILTER
    );
    return scored.map(([document]) => document);
  }

  /**
   * Throws an error, as this method is not implemented. Use
   * similaritySearch or similaritySearchWithScore instead.
   * @param _query Not used.
   * @param _k Not used.
   * @param _filter Not used.
   * @returns Does not return a value.
   */
  async similaritySearchVectorWithScore(_query, _k, _filter) {
    throw new Error("Method not implemented. Please call similaritySearch or similaritySearchWithScore instead.");
  }

  /**
   * Creates a VectaraStore instance from texts.
   * @param texts An array of text strings.
   * @param metadatas Metadata for the texts. Can be a single object (shared by every text) or an array of objects matched to `texts` by index.
   * @param _embeddings Not used.
   * @param args A VectaraLibArgs object for initializing the VectaraStore instance.
   * @returns A Promise that resolves to a VectaraStore instance.
   */
  static fromTexts(texts, metadatas, _embeddings, args) {
    const docs = [];
    for (const [index, pageContent] of texts.entries()) {
      const metadata = Array.isArray(metadatas) ? metadatas[index] : metadatas;
      docs.push(new Document({ pageContent, metadata }));
    }
    return VectaraStore.fromDocuments(docs, new FakeEmbeddings(), args);
  }

  /**
   * Creates a VectaraStore instance from documents and indexes them.
   * @param docs An array of Document objects.
   * @param _embeddings Not used.
   * @param args A VectaraLibArgs object for initializing the VectaraStore instance.
   * @returns A Promise that resolves to a VectaraStore instance.
   */
  static async fromDocuments(docs, _embeddings, args) {
    const instance = new this(args);
    await instance.addDocuments(docs);
    return instance;
  }
}
|