agsamantha/node_modules/@langchain/community/dist/vectorstores/milvus.js
2024-10-02 15:15:21 -05:00

693 lines
26 KiB
JavaScript

import * as uuid from "uuid";
import { MilvusClient, DataType, DataTypeMap, ErrorCode, } from "@zilliz/milvus2-sdk-node";
import { VectorStore } from "@langchain/core/vectorstores";
import { Document } from "@langchain/core/documents";
import { getEnvironmentVariable } from "@langchain/core/utils/env";
// Default schema field names, used by the Milvus constructor when the
// corresponding `primaryField` / `vectorField` / `textField` argument is
// not supplied.
const MILVUS_PRIMARY_FIELD_NAME = "langchain_primaryid";
const MILVUS_VECTOR_FIELD_NAME = "langchain_vector";
const MILVUS_TEXT_FIELD_NAME = "langchain_text";
// Prefix of the collection names produced by genCollectionName().
const MILVUS_COLLECTION_NAME_PREFIX = "langchain_col";
// Default `max_length` of the partition-key field when
// `partitionKeyMaxLength` is not supplied.
const MILVUS_PARTITION_KEY_MAX_LENGTH = 512;
/**
* Default parameters for index searching.
*
* Keyed by index type name. The Milvus constructor copies the `params` of
* the entry matching the configured `index_type` into `indexSearchParams`
* and then lets user-supplied `search_params` override individual keys.
* When no index options are given, the `HNSW` entry is used.
*/
const DEFAULT_INDEX_SEARCH_PARAMS = {
FLAT: { params: {} },
IVF_FLAT: { params: { nprobe: 10 } },
IVF_SQ8: { params: { nprobe: 10 } },
IVF_PQ: { params: { nprobe: 10 } },
HNSW: { params: { ef: 10 } },
RHNSW_FLAT: { params: { ef: 10 } },
RHNSW_SQ: { params: { ef: 10 } },
RHNSW_PQ: { params: { ef: 10 } },
IVF_HNSW: { params: { nprobe: 10, ef: 10 } },
ANNOY: { params: { search_k: 10 } },
};
/**
 * Class for interacting with a Milvus database. Extends the VectorStore
 * class.
 *
 * The list of schema field names (`this.fields`) is either derived from the
 * first batch of documents when the collection is created by this class, or
 * read back through `describeCollection` when the collection already exists.
 */
export class Milvus extends VectorStore {
    /**
     * Maps the secret connection arguments to upper-case identifiers.
     * NOTE(review): these look like environment variable names; how they
     * are consumed is decided by the base class, not by this file.
     */
    get lc_secrets() {
        return {
            ssl: "MILVUS_SSL",
            username: "MILVUS_USERNAME",
            password: "MILVUS_PASSWORD",
        };
    }
    /**
     * @returns The identifier of this vector store type, `"milvus"`.
     */
    _vectorstoreType() {
        return "milvus";
    }
    /**
     * Builds the store configuration and the underlying MilvusClient.
     * @param embeddings Embeddings instance used to embed documents.
     * @param args Store options. Read here: `collectionName`, `partitionName`,
     * `textField`, `autoId`, `primaryField`, `vectorField`,
     * `textFieldMaxLength`, `partitionKey`, `partitionKeyMaxLength`, `url`,
     * `username`, `password`, `ssl`, `clientConfig` and `indexCreateOptions`.
     * @throws Error when no address can be resolved from `args.url`, from
     * `getEnvironmentVariable("MILVUS_URL")` or from `args.clientConfig.address`.
     */
    constructor(embeddings, args) {
        super(embeddings, args);
        Object.defineProperty(this, "embeddings", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: embeddings,
        });
        // Declare every remaining instance field up front, in a fixed order,
        // so all instances expose the same enumerable own properties even
        // when an option is left undefined.
        const declaredFields = [
            "collectionName",
            "partitionName",
            "numDimensions",
            "autoId",
            "primaryField",
            "vectorField",
            "textField",
            "textFieldMaxLength",
            "partitionKey",
            "partitionKeyMaxLength",
            "fields",
            "client",
            "indexCreateParams",
            "indexSearchParams",
        ];
        for (const fieldName of declaredFields) {
            Object.defineProperty(this, fieldName, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: undefined,
            });
        }
        this.collectionName = args.collectionName ?? genCollectionName();
        this.partitionName = args.partitionName;
        this.textField = args.textField ?? MILVUS_TEXT_FIELD_NAME;
        this.autoId = args.autoId ?? true;
        this.primaryField = args.primaryField ?? MILVUS_PRIMARY_FIELD_NAME;
        this.vectorField = args.vectorField ?? MILVUS_VECTOR_FIELD_NAME;
        // 0 means "derive the text field length from the documents" (see
        // createCollection).
        this.textFieldMaxLength = args.textFieldMaxLength ?? 0;
        this.partitionKey = args.partitionKey;
        this.partitionKeyMaxLength =
            args.partitionKeyMaxLength ?? MILVUS_PARTITION_KEY_MAX_LENGTH;
        this.fields = [];
        const url = args.url ?? getEnvironmentVariable("MILVUS_URL");
        const { address = "", username = "", password = "", ssl, } = args.clientConfig || {};
        // Index creation parameters
        const { indexCreateOptions } = args;
        if (indexCreateOptions) {
            const { metric_type, index_type, params, search_params = {}, } = indexCreateOptions;
            this.indexCreateParams = {
                metric_type,
                index_type,
                params,
            };
            // Index types without an entry in the defaults table simply start
            // from an empty parameter set instead of raising a TypeError.
            this.indexSearchParams = {
                ...DEFAULT_INDEX_SEARCH_PARAMS[index_type]?.params,
                ...search_params,
            };
        }
        else {
            // Default index creation parameters.
            this.indexCreateParams = {
                index_type: "HNSW",
                metric_type: "L2",
                params: { M: 8, efConstruction: 64 },
            };
            // Default index search parameters.
            this.indexSearchParams = {
                ...DEFAULT_INDEX_SEARCH_PARAMS.HNSW.params,
            };
        }
        // Combine args, clientConfig and the resolved URL. Top-level args win
        // over the values nested in clientConfig.
        const clientConfig = {
            ...(args.clientConfig || {}),
            address: url || address,
            username: args.username || username,
            password: args.password || password,
            ssl: args.ssl || ssl,
        };
        if (!clientConfig.address) {
            throw new Error("Milvus URL address is not provided.");
        }
        this.client = new MilvusClient(clientConfig);
    }
    /**
     * Adds documents to the Milvus database.
     * @param documents Array of Document instances to be added to the database.
     * @param options Optional parameter that can include specific IDs for the documents.
     * @returns Promise resolving to void.
     */
    async addDocuments(documents, options) {
        const texts = documents.map(({ pageContent }) => pageContent);
        const vectors = await this.embeddings.embedDocuments(texts);
        await this.addVectors(vectors, documents, options);
    }
    /**
     * Adds vectors to the Milvus database.
     *
     * Creates the collection (and the partition, when `partitionName` is set)
     * if needed, builds one row per vector from the known schema fields, and
     * inserts the rows (`autoId` true) or upserts them (`autoId` false).
     * @param vectors Array of vectors to be added to the database.
     * @param documents Array of Document instances associated with the vectors.
     * @param options Optional parameter that can include specific IDs for the documents.
     * @returns Promise resolving to void.
     * @throws Error when a schema field is missing from a document's metadata,
     * when `autoId` is false and no primary key value is available, or when
     * Milvus reports a non-success status.
     */
    async addVectors(vectors, documents, options) {
        if (vectors.length === 0) {
            return;
        }
        await this.ensureCollection(vectors, documents);
        if (this.partitionName !== undefined) {
            await this.ensurePartition();
        }
        const documentIds = options?.ids ?? [];
        const insertDatas = [];
        for (const [index, vec] of vectors.entries()) {
            const doc = documents[index];
            const data = {
                [this.textField]: doc.pageContent,
                [this.vectorField]: vec,
            };
            for (const field of this.fields) {
                if (field === this.primaryField) {
                    // An explicit id wins; otherwise a non-auto primary key
                    // must come from the document metadata.
                    if (documentIds[index] !== undefined) {
                        data[field] = documentIds[index];
                    }
                    else if (!this.autoId) {
                        if (doc.metadata[this.primaryField] === undefined) {
                            throw new Error(`The Collection's primaryField is configured with autoId=false, thus its value must be provided through metadata.`);
                        }
                        data[field] = doc.metadata[this.primaryField];
                    }
                }
                else if (field === this.textField) {
                    data[field] = doc.pageContent;
                }
                else if (field === this.vectorField) {
                    data[field] = vec;
                }
                else {
                    // Metadata fields: objects (including arrays and null)
                    // are stored as JSON text, everything else as-is.
                    const value = doc.metadata[field];
                    if (value === undefined) {
                        throw new Error(`The field "${field}" is not provided in documents[${index}].metadata.`);
                    }
                    data[field] = typeof value === "object" ? JSON.stringify(value) : value;
                }
            }
            insertDatas.push(data);
        }
        const params = {
            collection_name: this.collectionName,
            fields_data: insertDatas,
        };
        if (this.partitionName !== undefined) {
            params.partition_name = this.partitionName;
        }
        const insertResp = this.autoId
            ? await this.client.insert(params)
            : await this.client.upsert(params);
        if (insertResp.status.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Error ${this.autoId ? "inserting" : "upserting"} data: ${JSON.stringify(insertResp)}`);
        }
        await this.client.flushSync({ collection_names: [this.collectionName] });
    }
    /**
     * Searches for vectors in the Milvus database that are similar to a given
     * vector.
     *
     * String values in the result that parse as JSON are returned parsed.
     * This also applies to plain strings that happen to be valid JSON, such
     * as `"42"`.
     * @param query Vector to compare with the vectors in the database.
     * @param k Number of similar vectors to return.
     * @param filter Optional filter to apply to the search.
     * @returns Promise resolving to an array of tuples, each containing a Document instance and a similarity score.
     * @throws Error when the collection does not exist or Milvus reports a
     * non-success status.
     */
    async similaritySearchVectorWithScore(query, k, filter) {
        const hasColResp = await this.client.hasCollection({
            collection_name: this.collectionName,
        });
        if (hasColResp.status.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Error checking collection: ${JSON.stringify(hasColResp)}`);
        }
        if (hasColResp.value === false) {
            throw new Error(`Collection not found: ${this.collectionName}, please create collection before search.`);
        }
        const filterStr = filter ?? "";
        await this.grabCollectionFields();
        const loadResp = await this.client.loadCollectionSync({
            collection_name: this.collectionName,
        });
        if (loadResp.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Error loading collection: ${JSON.stringify(loadResp)}`);
        }
        // Request every known field except the vector itself.
        const outputFields = this.fields.filter((field) => field !== this.vectorField);
        const searchResp = await this.client.search({
            collection_name: this.collectionName,
            search_params: {
                anns_field: this.vectorField,
                topk: k,
                metric_type: this.indexCreateParams.metric_type,
                params: JSON.stringify(this.indexSearchParams),
            },
            output_fields: outputFields,
            vector_type: DataType.FloatVector,
            vectors: [query],
            filter: filterStr,
        });
        if (searchResp.status.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Error searching data: ${JSON.stringify(searchResp)}`);
        }
        return searchResp.results.map((result) => {
            const fields = {
                pageContent: "",
                metadata: {},
            };
            for (const key of Object.keys(result)) {
                const value = result[key];
                if (key === this.textField) {
                    fields.pageContent = value;
                }
                else if (this.fields.includes(key) || key === this.primaryField) {
                    if (typeof value === "string") {
                        const { isJson, obj } = checkJsonString(value);
                        fields.metadata[key] = isJson ? obj : value;
                    }
                    else {
                        fields.metadata[key] = value;
                    }
                }
            }
            return [new Document(fields), result.score];
        });
    }
    /**
     * Ensures that a collection exists in the Milvus database.
     *
     * An existing collection only has its field list loaded; a missing one is
     * created from the given vectors and documents.
     * @param vectors Optional array of vectors to be used if a new collection needs to be created.
     * @param documents Optional array of Document instances to be used if a new collection needs to be created.
     * @returns Promise resolving to void.
     * @throws Error when the collection is missing and `vectors` or
     * `documents` is undefined, or when the existence check fails.
     */
    async ensureCollection(vectors, documents) {
        const hasColResp = await this.client.hasCollection({
            collection_name: this.collectionName,
        });
        if (hasColResp.status.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Error checking collection: ${JSON.stringify(hasColResp, null, 2)}`);
        }
        if (hasColResp.value !== false) {
            await this.grabCollectionFields();
            return;
        }
        if (vectors === undefined || documents === undefined) {
            throw new Error(`Collection not found: ${this.collectionName}, please provide vectors and documents to create collection.`);
        }
        await this.createCollection(vectors, documents);
    }
    /**
     * Ensures that a partition exists in the Milvus collection. Does nothing
     * when no `partitionName` is configured.
     * @returns Promise resolving to void.
     * @throws Error when the partition existence check fails.
     */
    async ensurePartition() {
        if (this.partitionName === undefined) {
            return;
        }
        const partitionRef = {
            collection_name: this.collectionName,
            partition_name: this.partitionName,
        };
        const hasPartResp = await this.client.hasPartition({ ...partitionRef });
        if (hasPartResp.status.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Error checking partition: ${JSON.stringify(hasPartResp, null, 2)}`);
        }
        if (hasPartResp.value === false) {
            await this.client.createPartition({ ...partitionRef });
        }
    }
    /**
     * Creates a collection in the Milvus database.
     *
     * The schema consists of the metadata fields inferred from `documents`,
     * the primary key (Int64 auto-ID, or VarChar when `autoId` is false), the
     * text field, the vector field and, if configured, the partition key. An
     * index is then created on the vector field.
     * @param vectors Array of vectors to be added to the new collection.
     * @param documents Array of Document instances to be added to the new collection.
     * @returns Promise resolving to void.
     * @throws Error when Milvus fails to create the collection.
     */
    async createCollection(vectors, documents) {
        const fieldList = [];
        fieldList.push(...createFieldTypeForMetadata(documents, this.primaryField, this.partitionKey));
        if (this.autoId) {
            fieldList.push({
                name: this.primaryField,
                description: "Primary key",
                data_type: DataType.Int64,
                is_primary_key: true,
                autoID: true,
            });
        }
        else {
            fieldList.push({
                name: this.primaryField,
                description: "Primary key",
                data_type: DataType.VarChar,
                is_primary_key: true,
                autoID: false,
                max_length: 65535,
            });
        }
        // An explicit textFieldMaxLength wins; otherwise size the text field
        // from the longest page content in this first batch.
        const textMaxLength = this.textFieldMaxLength > 0
            ? this.textFieldMaxLength.toString()
            : getTextFieldMaxLength(documents).toString();
        fieldList.push({
            name: this.textField,
            description: "Text field",
            data_type: DataType.VarChar,
            type_params: {
                max_length: textMaxLength,
            },
        }, {
            name: this.vectorField,
            description: "Vector field",
            data_type: DataType.FloatVector,
            type_params: {
                dim: getVectorFieldDim(vectors).toString(),
            },
        });
        if (this.partitionKey) {
            fieldList.push({
                name: this.partitionKey,
                description: "Partition key",
                data_type: DataType.VarChar,
                max_length: this.partitionKeyMaxLength,
                is_partition_key: true,
            });
        }
        const createRes = await this.client.createCollection({
            collection_name: this.collectionName,
            fields: fieldList,
        });
        if (createRes.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Failed to create collection: ${JSON.stringify(createRes)}`);
        }
        // Record the insertable field names only once the collection really
        // exists, so a failed creation does not leave stale names behind.
        // Auto-generated ids are never supplied by the caller, so skip them.
        for (const field of fieldList) {
            if (!field.autoID) {
                this.fields.push(field.name);
            }
        }
        const extraParams = {
            ...this.indexCreateParams,
            params: JSON.stringify(this.indexCreateParams.params),
        };
        await this.client.createIndex({
            collection_name: this.collectionName,
            field_name: this.vectorField,
            extra_params: extraParams,
        });
    }
    /**
     * Retrieves the fields of a collection in the Milvus database.
     *
     * Does nothing when the field list is already populated. Otherwise the
     * collection is described and `fields`, `primaryField`, `vectorField`
     * and, for a VarChar field carrying the default text field name,
     * `textField` are updated from its schema.
     * @returns Promise resolving to void.
     * @throws Error when no collection name is set.
     */
    async grabCollectionFields() {
        if (!this.collectionName) {
            throw new Error("Need collection name to grab collection fields");
        }
        if (this.primaryField &&
            this.vectorField &&
            this.textField &&
            this.fields.length > 0) {
            return;
        }
        const desc = await this.client.describeCollection({
            collection_name: this.collectionName,
        });
        for (const field of desc.schema.fields) {
            // Auto-generated ids are not part of the insertable field list.
            if (!field.autoID) {
                this.fields.push(field.name);
            }
            if (field.is_primary_key) {
                this.primaryField = field.name;
            }
            const dtype = DataTypeMap[field.data_type];
            if (dtype === DataType.FloatVector || dtype === DataType.BinaryVector) {
                this.vectorField = field.name;
            }
            if (dtype === DataType.VarChar && field.name === MILVUS_TEXT_FIELD_NAME) {
                this.textField = field.name;
            }
        }
    }
    /**
     * Creates a Milvus instance from a set of texts and their associated
     * metadata.
     * @param texts Array of texts to be added to the database.
     * @param metadatas Array of metadata objects associated with the texts, or a single metadata object shared by all texts.
     * @param embeddings Embeddings instance used to generate vector embeddings for the texts.
     * @param dbConfig Optional configuration for the Milvus database.
     * @returns Promise resolving to a new Milvus instance.
     */
    static async fromTexts(texts, metadatas, embeddings, dbConfig) {
        const docs = texts.map((pageContent, i) => new Document({
            pageContent,
            metadata: Array.isArray(metadatas) ? metadatas[i] : metadatas,
        }));
        return Milvus.fromDocuments(docs, embeddings, dbConfig);
    }
    /**
     * Creates a Milvus instance from a set of Document instances.
     * @param docs Array of Document instances to be added to the database.
     * @param embeddings Embeddings instance used to generate vector embeddings for the documents.
     * @param dbConfig Optional configuration for the Milvus database. A collection name is generated when none is given.
     * @returns Promise resolving to a new Milvus instance.
     */
    static async fromDocuments(docs, embeddings, dbConfig) {
        const args = {
            ...dbConfig,
            collectionName: dbConfig?.collectionName ?? genCollectionName(),
        };
        const instance = new this(embeddings, args);
        await instance.addDocuments(docs);
        return instance;
    }
    /**
     * Creates a Milvus instance from an existing collection in the Milvus
     * database.
     * @param embeddings Embeddings instance used to generate vector embeddings for the documents in the collection.
     * @param dbConfig Configuration for the Milvus database.
     * @returns Promise resolving to a new Milvus instance.
     * @throws Error when the collection does not exist.
     */
    static async fromExistingCollection(embeddings, dbConfig) {
        const instance = new this(embeddings, dbConfig);
        await instance.ensureCollection();
        return instance;
    }
    /**
     * Deletes data from the Milvus database.
     *
     * Exactly one of `filter` or a non-empty `ids` array must be given. When
     * both or neither are supplied, nothing is deleted.
     * @param params Object containing either a `filter` expression or an `ids` array to apply to the deletion.
     * @returns Promise resolving to void.
     * @throws Error when the collection does not exist or Milvus reports a
     * non-success status.
     */
    async delete(params) {
        const hasColResp = await this.client.hasCollection({
            collection_name: this.collectionName,
        });
        if (hasColResp.status.error_code !== ErrorCode.SUCCESS) {
            throw new Error(`Error checking collection: ${JSON.stringify(hasColResp)}`);
        }
        if (hasColResp.value === false) {
            throw new Error(`Collection not found: ${this.collectionName}, please create collection before delete.`);
        }
        const { filter, ids } = params;
        if (filter && !ids) {
            const deleteResp = await this.client.deleteEntities({
                collection_name: this.collectionName,
                expr: filter,
            });
            if (deleteResp.status.error_code !== ErrorCode.SUCCESS) {
                throw new Error(`Error deleting data: ${JSON.stringify(deleteResp)}`);
            }
        }
        else if (!filter && ids && ids.length > 0) {
            const deleteResp = await this.client.delete({
                collection_name: this.collectionName,
                ids,
            });
            if (deleteResp.status.error_code !== ErrorCode.SUCCESS) {
                throw new Error(`Error deleting data with ids: ${JSON.stringify(deleteResp)}`);
            }
        }
    }
}
/**
 * Derives the Milvus field definitions for the metadata of a document batch.
 *
 * The metadata of the first document is the template: one field is produced
 * per template key, except for the primary field, the partition key and keys
 * whose value is `null`. Strings become VarChar, numbers Float, booleans
 * Bool, and any other value a VarChar holding its JSON text.
 *
 * VarChar lengths are measured in UTF-8 bytes (matching how the text field
 * is sized) and are never reported as less than 1. All string fields share
 * the longest string length and all JSON fields share the longest JSON
 * length found in the batch.
 * @param documents Array of Document instances whose metadata is inspected.
 * @param primaryFieldName Name of the primary field, which is skipped here.
 * @param partitionKey Name of the partition key field, which is skipped here.
 * @returns Array of field definitions for the metadata keys.
 * @throws Error when a metadata value has a different type than the value
 * under the same key in the first document (which includes keys the first
 * document does not have).
 */
function createFieldTypeForMetadata(documents, primaryFieldName, partitionKey) {
    const sampleMetadata = documents[0].metadata;
    const textEncoder = new TextEncoder();
    const byteLength = (text) => textEncoder.encode(text).length;
    let textFieldMaxLength = 0;
    let jsonFieldMaxLength = 0;
    for (const { metadata } of documents) {
        for (const key of Object.keys(metadata)) {
            const value = metadata[key];
            // A key missing from the template has type "undefined" there,
            // so this comparison rejects unknown keys as well.
            if (typeof value !== typeof sampleMetadata[key]) {
                throw new Error("All documents must have same metadata keys and datatype");
            }
            if (typeof value === "string") {
                textFieldMaxLength = Math.max(textFieldMaxLength, byteLength(value));
            }
            else if (typeof value === "object") {
                jsonFieldMaxLength = Math.max(jsonFieldMaxLength, byteLength(JSON.stringify(value)));
            }
        }
    }
    // A VarChar max_length of 0 is not a usable schema value, so report at
    // least 1 even when every value is empty.
    const textMaxLength = Math.max(1, textFieldMaxLength).toString();
    const jsonMaxLength = Math.max(1, jsonFieldMaxLength).toString();
    const fields = [];
    for (const [key, value] of Object.entries(sampleMetadata)) {
        const type = typeof value;
        if (key === primaryFieldName || key === partitionKey) {
            // The primary field and the partition key are created by
            // createCollection itself.
            continue;
        }
        if (type === "string") {
            fields.push({
                name: key,
                description: `Metadata String field`,
                data_type: DataType.VarChar,
                type_params: {
                    max_length: textMaxLength,
                },
            });
        }
        else if (type === "number") {
            fields.push({
                name: key,
                description: `Metadata Number field`,
                data_type: DataType.Float,
            });
        }
        else if (type === "boolean") {
            fields.push({
                name: key,
                description: `Metadata Boolean field`,
                data_type: DataType.Bool,
            });
        }
        else if (value !== null) {
            // Everything else is stored as JSON text.
            fields.push({
                name: key,
                description: `Metadata JSON field`,
                data_type: DataType.VarChar,
                type_params: {
                    max_length: jsonMaxLength,
                },
            });
        }
    }
    return fields;
}
/**
 * Generates a collection name made of the collection name prefix, an
 * underscore and a v4 UUID with its dashes removed.
 * @returns The generated collection name.
 */
function genCollectionName() {
    const compactId = uuid.v4().split("-").join("");
    return `${MILVUS_COLLECTION_NAME_PREFIX}_${compactId}`;
}
/**
 * Computes the length to declare for the text field: the largest UTF-8 byte
 * length of any document's `pageContent`.
 *
 * The result is never less than 1, because it is used as a VarChar
 * `max_length` and 0 is not a usable value there. An empty batch or a batch
 * of empty texts therefore yields 1.
 * @param documents Array of Document instances to measure.
 * @returns The maximum page content length in bytes, at least 1.
 */
function getTextFieldMaxLength(documents) {
    const textEncoder = new TextEncoder();
    let textMaxLength = 1;
    for (const { pageContent } of documents) {
        const textLengthInBytes = textEncoder.encode(pageContent).length;
        if (textLengthInBytes > textMaxLength) {
            textMaxLength = textLengthInBytes;
        }
    }
    return textMaxLength;
}
/**
 * Returns the dimension to declare for the vector field, taken from the
 * length of the first vector.
 * @param vectors Array of vectors.
 * @returns The length of the first vector.
 * @throws Error when `vectors` is empty.
 */
function getVectorFieldDim(vectors) {
    const hasVectors = vectors.length !== 0;
    if (hasVectors) {
        return vectors[0].length;
    }
    throw new Error("No vectors found");
}
/**
 * Attempts to parse a value as JSON.
 * @param value The value to parse.
 * @returns `{ isJson: true, obj }` with the parsed result when parsing
 * succeeds, otherwise `{ isJson: false, obj: null }`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function checkJsonString(value) {
    let parsed;
    try {
        parsed = JSON.parse(value);
    }
    catch {
        return { isJson: false, obj: null };
    }
    return { isJson: true, obj: parsed };
}