260 lines
10 KiB
JavaScript
260 lines
10 KiB
JavaScript
|
import * as uuid from "uuid";
|
||
|
import { createClient } from "@clickhouse/client";
|
||
|
import { format } from "mysql2";
|
||
|
import { VectorStore } from "@langchain/core/vectorstores";
|
||
|
import { Document } from "@langchain/core/documents";
|
||
|
/**
|
||
|
* Class for interacting with the ClickHouse database. It extends the
|
||
|
* VectorStore class and provides methods for adding vectors and
|
||
|
* documents, searching for similar vectors, and creating instances from
|
||
|
* texts or documents.
|
||
|
*/
|
||
|
export class ClickHouseStore extends VectorStore {
|
||
|
_vectorstoreType() {
|
||
|
return "clickhouse";
|
||
|
}
|
||
|
constructor(embeddings, args) {
|
||
|
super(embeddings, args);
|
||
|
Object.defineProperty(this, "client", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: void 0
|
||
|
});
|
||
|
Object.defineProperty(this, "indexType", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: void 0
|
||
|
});
|
||
|
Object.defineProperty(this, "indexParam", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: void 0
|
||
|
});
|
||
|
Object.defineProperty(this, "indexQueryParams", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: void 0
|
||
|
});
|
||
|
Object.defineProperty(this, "columnMap", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: void 0
|
||
|
});
|
||
|
Object.defineProperty(this, "database", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: void 0
|
||
|
});
|
||
|
Object.defineProperty(this, "table", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: void 0
|
||
|
});
|
||
|
Object.defineProperty(this, "isInitialized", {
|
||
|
enumerable: true,
|
||
|
configurable: true,
|
||
|
writable: true,
|
||
|
value: false
|
||
|
});
|
||
|
this.indexType = args.indexType || "annoy";
|
||
|
this.indexParam = args.indexParam || { L2Distance: 100 };
|
||
|
this.indexQueryParams = args.indexQueryParams || {};
|
||
|
this.columnMap = args.columnMap || {
|
||
|
id: "id",
|
||
|
document: "document",
|
||
|
embedding: "embedding",
|
||
|
metadata: "metadata",
|
||
|
uuid: "uuid",
|
||
|
};
|
||
|
this.database = args.database || "default";
|
||
|
this.table = args.table || "vector_table";
|
||
|
this.client = createClient({
|
||
|
host: `${args.protocol ?? "https://"}${args.host}:${args.port}`,
|
||
|
username: args.username,
|
||
|
password: args.password,
|
||
|
session_id: uuid.v4(),
|
||
|
});
|
||
|
}
|
||
|
/**
|
||
|
* Method to add vectors to the ClickHouse database.
|
||
|
* @param vectors The vectors to add.
|
||
|
* @param documents The documents associated with the vectors.
|
||
|
* @returns Promise that resolves when the vectors have been added.
|
||
|
*/
|
||
|
async addVectors(vectors, documents) {
|
||
|
if (vectors.length === 0) {
|
||
|
return;
|
||
|
}
|
||
|
if (!this.isInitialized) {
|
||
|
await this.initialize(vectors[0].length);
|
||
|
}
|
||
|
const queryStr = this.buildInsertQuery(vectors, documents);
|
||
|
await this.client.exec({ query: queryStr });
|
||
|
}
|
||
|
/**
|
||
|
* Method to add documents to the ClickHouse database.
|
||
|
* @param documents The documents to add.
|
||
|
* @returns Promise that resolves when the documents have been added.
|
||
|
*/
|
||
|
async addDocuments(documents) {
|
||
|
return this.addVectors(await this.embeddings.embedDocuments(documents.map((d) => d.pageContent)), documents);
|
||
|
}
|
||
|
/**
|
||
|
* Method to search for vectors that are similar to a given query vector.
|
||
|
* @param query The query vector.
|
||
|
* @param k The number of similar vectors to return.
|
||
|
* @param filter Optional filter for the search results.
|
||
|
* @returns Promise that resolves with an array of tuples, each containing a Document and a score.
|
||
|
*/
|
||
|
async similaritySearchVectorWithScore(query, k, filter) {
|
||
|
if (!this.isInitialized) {
|
||
|
await this.initialize(query.length);
|
||
|
}
|
||
|
const queryStr = this.buildSearchQuery(query, k, filter);
|
||
|
const queryResultSet = await this.client.query({ query: queryStr });
|
||
|
const queryResult = await queryResultSet.json();
|
||
|
const result = queryResult.data.map((item) => [
|
||
|
new Document({ pageContent: item.document, metadata: item.metadata }),
|
||
|
item.dist,
|
||
|
]);
|
||
|
return result;
|
||
|
}
|
||
|
/**
|
||
|
* Static method to create an instance of ClickHouseStore from texts.
|
||
|
* @param texts The texts to use.
|
||
|
* @param metadatas The metadata associated with the texts.
|
||
|
* @param embeddings The embeddings to use.
|
||
|
* @param args The arguments for the ClickHouseStore.
|
||
|
* @returns Promise that resolves with a new instance of ClickHouseStore.
|
||
|
*/
|
||
|
static async fromTexts(texts, metadatas, embeddings, args) {
|
||
|
const docs = [];
|
||
|
for (let i = 0; i < texts.length; i += 1) {
|
||
|
const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
|
||
|
const newDoc = new Document({
|
||
|
pageContent: texts[i],
|
||
|
metadata,
|
||
|
});
|
||
|
docs.push(newDoc);
|
||
|
}
|
||
|
return ClickHouseStore.fromDocuments(docs, embeddings, args);
|
||
|
}
|
||
|
/**
|
||
|
* Static method to create an instance of ClickHouseStore from documents.
|
||
|
* @param docs The documents to use.
|
||
|
* @param embeddings The embeddings to use.
|
||
|
* @param args The arguments for the ClickHouseStore.
|
||
|
* @returns Promise that resolves with a new instance of ClickHouseStore.
|
||
|
*/
|
||
|
static async fromDocuments(docs, embeddings, args) {
|
||
|
const instance = new this(embeddings, args);
|
||
|
await instance.addDocuments(docs);
|
||
|
return instance;
|
||
|
}
|
||
|
/**
|
||
|
* Static method to create an instance of ClickHouseStore from an existing
|
||
|
* index.
|
||
|
* @param embeddings The embeddings to use.
|
||
|
* @param args The arguments for the ClickHouseStore.
|
||
|
* @returns Promise that resolves with a new instance of ClickHouseStore.
|
||
|
*/
|
||
|
static async fromExistingIndex(embeddings, args) {
|
||
|
const instance = new this(embeddings, args);
|
||
|
await instance.initialize();
|
||
|
return instance;
|
||
|
}
|
||
|
/**
|
||
|
* Method to initialize the ClickHouse database.
|
||
|
* @param dimension Optional dimension of the vectors.
|
||
|
* @returns Promise that resolves when the database has been initialized.
|
||
|
*/
|
||
|
async initialize(dimension) {
|
||
|
const dim = dimension ?? (await this.embeddings.embedQuery("test")).length;
|
||
|
const indexParamStr = this.indexParam
|
||
|
? Object.entries(this.indexParam)
|
||
|
.map(([key, value]) => `'${key}', ${value}`)
|
||
|
.join(", ")
|
||
|
: "";
|
||
|
const query = `
|
||
|
CREATE TABLE IF NOT EXISTS ${this.database}.${this.table}(
|
||
|
${this.columnMap.id} Nullable(String),
|
||
|
${this.columnMap.document} Nullable(String),
|
||
|
${this.columnMap.embedding} Array(Float32),
|
||
|
${this.columnMap.metadata} JSON,
|
||
|
${this.columnMap.uuid} UUID DEFAULT generateUUIDv4(),
|
||
|
CONSTRAINT cons_vec_len CHECK length(${this.columnMap.embedding}) = ${dim},
|
||
|
INDEX vec_idx ${this.columnMap.embedding} TYPE ${this.indexType}(${indexParamStr}) GRANULARITY 1000
|
||
|
) ENGINE = MergeTree ORDER BY ${this.columnMap.uuid} SETTINGS index_granularity = 8192;`;
|
||
|
await this.client.exec({
|
||
|
query,
|
||
|
clickhouse_settings: {
|
||
|
allow_experimental_object_type: 1,
|
||
|
allow_experimental_annoy_index: 1,
|
||
|
},
|
||
|
});
|
||
|
this.isInitialized = true;
|
||
|
}
|
||
|
/**
|
||
|
* Method to build an SQL query for inserting vectors and documents into
|
||
|
* the ClickHouse database.
|
||
|
* @param vectors The vectors to insert.
|
||
|
* @param documents The documents to insert.
|
||
|
* @returns The SQL query string.
|
||
|
*/
|
||
|
buildInsertQuery(vectors, documents) {
|
||
|
const columnsStr = Object.values(Object.fromEntries(Object.entries(this.columnMap).filter(([key]) => key !== this.columnMap.uuid))).join(", ");
|
||
|
const placeholders = vectors.map(() => "(?, ?, ?, ?)").join(", ");
|
||
|
const values = [];
|
||
|
for (let i = 0; i < vectors.length; i += 1) {
|
||
|
const vector = vectors[i];
|
||
|
const document = documents[i];
|
||
|
values.push(uuid.v4(), this.escapeString(document.pageContent), JSON.stringify(vector), JSON.stringify(document.metadata));
|
||
|
}
|
||
|
const insertQueryStr = `
|
||
|
INSERT INTO TABLE ${this.database}.${this.table}(${columnsStr})
|
||
|
VALUES ${placeholders}
|
||
|
`;
|
||
|
const insertQuery = format(insertQueryStr, values);
|
||
|
return insertQuery;
|
||
|
}
|
||
|
escapeString(str) {
|
||
|
return str.replace(/\\/g, "\\\\").replace(/'/g, "\\'");
|
||
|
}
|
||
|
/**
|
||
|
* Method to build an SQL query for searching for similar vectors in the
|
||
|
* ClickHouse database.
|
||
|
* @param query The query vector.
|
||
|
* @param k The number of similar vectors to return.
|
||
|
* @param filter Optional filter for the search results.
|
||
|
* @returns The SQL query string.
|
||
|
*/
|
||
|
buildSearchQuery(query, k, filter) {
|
||
|
const order = "ASC";
|
||
|
const whereStr = filter ? `PREWHERE ${filter.whereStr}` : "";
|
||
|
const placeholders = query.map(() => "?").join(", ");
|
||
|
const settingStrings = [];
|
||
|
if (this.indexQueryParams) {
|
||
|
for (const [key, value] of Object.entries(this.indexQueryParams)) {
|
||
|
settingStrings.push(`SETTING ${key}=${value}`);
|
||
|
}
|
||
|
}
|
||
|
const searchQueryStr = `
|
||
|
SELECT ${this.columnMap.document} AS document, ${this.columnMap.metadata} AS metadata, dist
|
||
|
FROM ${this.database}.${this.table}
|
||
|
${whereStr}
|
||
|
ORDER BY L2Distance(${this.columnMap.embedding}, [${placeholders}]) AS dist ${order}
|
||
|
LIMIT ${k} ${settingStrings.join(" ")}
|
||
|
`;
|
||
|
// Format the query with actual values
|
||
|
const searchQuery = format(searchQueryStr, query);
|
||
|
return searchQuery;
|
||
|
}
|
||
|
}
|