271 lines
12 KiB
JavaScript
271 lines
12 KiB
JavaScript
import { SaveableVectorStore } from "@langchain/core/vectorstores";
|
|
import { Document } from "@langchain/core/documents";
|
|
import { SynchronousInMemoryDocstore } from "../stores/doc/in_memory.js";
|
|
/**
 * Class that implements a vector store using Hierarchical Navigable Small
 * World (HNSW) graphs. It extends the SaveableVectorStore class and
 * provides methods for adding documents and vectors, performing
 * similarity searches, and saving and loading the vector store.
 *
 * The native index comes from the optional `hnswlib-node` package, which is
 * imported lazily (see `imports`). Documents are kept in a docstore keyed by
 * the numeric label of the corresponding point in the index.
 */
export class HNSWLib extends SaveableVectorStore {
    /**
     * @returns The identifier string of this vector store type ("hnswlib").
     */
    _vectorstoreType() {
        return "hnswlib";
    }

    /**
     * @param embeddings The embeddings used to vectorise documents.
     * @param args Store arguments. Read by this class: `index` (an existing
     * index, optional), `docstore` (optional, defaults to a new
     * SynchronousInMemoryDocstore), `space` and `numDimensions` (used when
     * the index is created lazily).
     */
    constructor(embeddings, args) {
        super(embeddings, args);
        // Own, enumerable, writable data properties, created in a fixed order
        // before any value is assigned.
        for (const field of ["_index", "docstore", "args"]) {
            Object.defineProperty(this, field, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: undefined,
            });
        }
        this._index = args.index;
        this.args = args;
        this.embeddings = embeddings;
        this.docstore = args?.docstore ?? new SynchronousInMemoryDocstore();
    }

    /**
     * Method to add documents to the vector store. It first converts the
     * documents to vectors using the embeddings, then adds the vectors to the
     * vector store.
     * @param documents The documents to be added to the vector store.
     * @returns A Promise that resolves when the documents have been added.
     */
    async addDocuments(documents) {
        const texts = documents.map(({ pageContent }) => pageContent);
        return this.addVectors(await this.embeddings.embedDocuments(texts), documents);
    }

    /**
     * Static method that constructs a new `HierarchicalNSW` index object from
     * `hnswlib-node`.
     * @param args Object providing `space` and `numDimensions`.
     * @returns A Promise that resolves to the new index object.
     * @throws If `space` is falsy or `numDimensions` is undefined, or if
     * `hnswlib-node` cannot be imported.
     */
    static async getHierarchicalNSW(args) {
        const { HierarchicalNSW } = await HNSWLib.imports();
        if (!args.space) {
            throw new Error("hnswlib-node requires a space argument");
        }
        if (args.numDimensions === undefined) {
            throw new Error("hnswlib-node requires a numDimensions argument");
        }
        return new HierarchicalNSW(args.space, args.numDimensions);
    }

    /**
     * Ensures an index object exists and has been initialised.
     *
     * When no index is set, one is created from `this.args`; if
     * `numDimensions` was not configured it is taken from the length of the
     * first vector. When the index reports no stored points, the underlying
     * `initIndex` is called with `vectors.length` (presumably the initial
     * capacity; see the hnswlib-node documentation).
     * @param vectors The vectors about to be added.
     * @returns A Promise that resolves once the index is ready.
     */
    async initIndex(vectors) {
        if (!this._index) {
            if (this.args.numDimensions === undefined) {
                this.args.numDimensions = vectors[0].length;
            }
            this.index = await HNSWLib.getHierarchicalNSW(this.args);
        }
        if (!this.index.getCurrentCount()) {
            this.index.initIndex(vectors.length);
        }
    }

    /**
     * The underlying index object.
     * @throws If no index has been set yet.
     */
    get index() {
        if (!this._index) {
            throw new Error("Vector store not initialised yet. Try calling `addTexts` first.");
        }
        return this._index;
    }

    set index(index) {
        this._index = index;
    }

    /**
     * Method to add vectors to the vector store. It first initializes the
     * index if it hasn't been initialized yet, then adds the vectors to the
     * index and the documents to the document store.
     * @param vectors The vectors to be added to the vector store.
     * @param documents The documents corresponding to the vectors.
     * @returns A Promise that resolves when the vectors and documents have been added.
     * @throws If `vectors` and `documents` differ in length, or if any vector
     * does not have `numDimensions` entries.
     */
    async addVectors(vectors, documents) {
        if (vectors.length === 0) {
            return;
        }
        // Reject mismatched input before the index is created or touched.
        if (vectors.length !== documents.length) {
            throw new Error(`Vectors and metadatas must have the same length`);
        }
        await this.initIndex(vectors);
        // TODO here we could optionally normalise the vectors to unit length
        // so that dot product is equivalent to cosine similarity, like this
        // https://github.com/nmslib/hnswlib/issues/384#issuecomment-1155737730
        // While we only support OpenAI embeddings this isn't necessary
        const { numDimensions } = this.args;
        // Check every vector up front so a malformed one cannot abort the
        // insert loop after earlier points were already added.
        if (vectors.some((vector) => vector.length !== numDimensions)) {
            throw new Error(`Vectors must have the same length as the number of dimensions (${numDimensions})`);
        }
        const capacity = this.index.getMaxElements();
        const firstLabel = this.index.getCurrentCount();
        const needed = firstLabel + vectors.length;
        if (needed > capacity) {
            this.index.resizeIndex(needed);
        }
        // Labels continue from the current point count; the same number is
        // used as the docstore key so search results can be mapped back.
        const toSave = {};
        vectors.forEach((vector, offset) => {
            const label = firstLabel + offset;
            this.index.addPoint(vector, label);
            toSave[label] = documents[offset];
        });
        this.docstore.add(toSave);
    }

    /**
     * Method to perform a similarity search in the vector store using a query
     * vector. It returns the k most similar documents along with the
     * distances reported by the index. An optional filter function can be
     * provided to filter the documents.
     * @param query The query vector.
     * @param k The number of most similar documents to return. Clamped (with
     * a console warning) to the number of elements in the index.
     * @param filter An optional filter function to filter the documents.
     * @returns A Promise that resolves to an array of tuples, where each tuple contains a document and its distance.
     * @throws If the query length differs from `numDimensions`, or if the
     * index has not been initialised.
     */
    async similaritySearchVectorWithScore(query, k, filter) {
        // With known dimensions the index can be created on demand so that
        // searching an empty store does not throw from the `index` getter.
        if (this.args.numDimensions && !this._index) {
            await this.initIndex([[]]);
        }
        if (query.length !== this.args.numDimensions) {
            throw new Error(`Query vector must have the same length as the number of dimensions (${this.args.numDimensions})`);
        }
        const total = this.index.getCurrentCount();
        let neighborCount = k;
        if (k > total) {
            console.warn(`k (${k}) is greater than the number of elements in the index (${total}), setting k to ${total}`);
            neighborCount = total;
        }
        // Adapts the document-level `filter` to the label-level callback
        // passed to `searchKnn`.
        const filterFunction = (label) => {
            if (!filter) {
                return true;
            }
            const document = this.docstore.search(String(label));
            // A string result from the docstore is treated as "not a document".
            if (typeof document !== "string") {
                return filter(document);
            }
            return false;
        };
        const result = this.index.searchKnn(query, neighborCount, filter ? filterFunction : undefined);
        return result.neighbors.map((docIndex, resultIndex) => [
            this.docstore.search(String(docIndex)),
            result.distances[resultIndex],
        ]);
    }

    /**
     * Method to delete the vector store from a directory. It deletes the
     * hnswlib.index file, the docstore.json file, and the args.json file from
     * the directory.
     * @param params An object with a directory property that specifies the directory from which to delete the vector store.
     * @returns A Promise that resolves when the vector store has been deleted.
     * @throws If the directory does not contain an accessible hnswlib.index file.
     */
    async delete(params) {
        const fs = await import("node:fs/promises");
        const path = await import("node:path");
        const { directory } = params;
        try {
            await fs.access(path.join(directory, "hnswlib.index"));
        }
        catch (err) {
            throw new Error(`Directory ${directory} does not contain a hnswlib.index file.`);
        }
        // Pass the promises themselves (not awaited values) so the three
        // removals actually run concurrently.
        await Promise.all(["hnswlib.index", "docstore.json", "args.json"].map((fileName) => fs.rm(path.join(directory, fileName), { force: true })));
    }

    /**
     * Method to save the vector store to a directory. It saves the HNSW
     * index, the arguments, and the document store to the directory.
     * @param directory The directory to which to save the vector store.
     * @returns A Promise that resolves when the vector store has been saved.
     */
    async save(directory) {
        const fs = await import("node:fs/promises");
        const path = await import("node:path");
        await fs.mkdir(directory, { recursive: true });
        // `index` and `docstore` are runtime objects that `load` restores from
        // their own files, so they are left out of args.json.
        const { index: _omittedIndex, docstore: _omittedDocstore, ...serializableArgs } = this.args;
        const argsJson = JSON.stringify(serializableArgs);
        const docstoreJson = JSON.stringify(Array.from(this.docstore._docs.entries()));
        // The three writes are independent; start them all before awaiting.
        await Promise.all([
            this.index.writeIndex(path.join(directory, "hnswlib.index")),
            fs.writeFile(path.join(directory, "args.json"), argsJson),
            fs.writeFile(path.join(directory, "docstore.json"), docstoreJson),
        ]);
    }

    /**
     * Static method to load a vector store from a directory. It reads the
     * HNSW index, the arguments, and the document store from the directory,
     * then creates a new HNSWLib instance with these values.
     * @param directory The directory from which to load the vector store.
     * @param embeddings The embeddings to be used by the HNSWLib instance.
     * @returns A Promise that resolves to a new HNSWLib instance.
     */
    static async load(directory, embeddings) {
        const fs = await import("node:fs/promises");
        const path = await import("node:path");
        const args = JSON.parse(await fs.readFile(path.join(directory, "args.json"), "utf8"));
        const index = await HNSWLib.getHierarchicalNSW(args);
        const [docstoreFiles] = await Promise.all([
            fs
                .readFile(path.join(directory, "docstore.json"), "utf8")
                .then(JSON.parse),
            index.readIndex(path.join(directory, "hnswlib.index")),
        ]);
        // The saved docstore is an array of [key, document] entries.
        args.docstore = new SynchronousInMemoryDocstore(new Map(docstoreFiles));
        args.index = index;
        return new HNSWLib(embeddings, args);
    }

    /**
     * Static method to create a new HNSWLib instance from texts and metadata.
     * It creates a new Document instance for each text and metadata, then
     * calls the fromDocuments method to create the HNSWLib instance.
     * @param texts The texts to be used to create the documents.
     * @param metadatas The metadata to be used to create the documents: either an array indexed like `texts`, or a single value shared by every document.
     * @param embeddings The embeddings to be used by the HNSWLib instance.
     * @param dbConfig An optional configuration object for the document store.
     * @returns A Promise that resolves to a new HNSWLib instance.
     */
    static async fromTexts(texts, metadatas, embeddings, dbConfig) {
        const docs = [];
        for (let i = 0; i < texts.length; i += 1) {
            const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
            const newDoc = new Document({
                pageContent: texts[i],
                metadata,
            });
            docs.push(newDoc);
        }
        return HNSWLib.fromDocuments(docs, embeddings, dbConfig);
    }

    /**
     * Static method to create a new HNSWLib instance from documents. It
     * creates a new instance using the "cosine" space, adds the documents to
     * it, then returns the instance.
     * @param docs The documents to be added to the HNSWLib instance.
     * @param embeddings The embeddings to be used by the HNSWLib instance.
     * @param dbConfig An optional configuration object for the document store.
     * @returns A Promise that resolves to a new HNSWLib instance.
     */
    static async fromDocuments(docs, embeddings, dbConfig) {
        const args = {
            docstore: dbConfig?.docstore,
            space: "cosine",
        };
        const instance = new this(embeddings, args);
        await instance.addDocuments(docs);
        return instance;
    }

    /**
     * Static method that lazily imports `hnswlib-node`.
     * @returns A Promise that resolves to an object holding the `HierarchicalNSW` constructor.
     * @throws An Error with installation instructions if the import fails.
     */
    static async imports() {
        try {
            const { default: { HierarchicalNSW }, } = await import("hnswlib-node");
            return { HierarchicalNSW };
            // eslint-disable-next-line @typescript-eslint/no-explicit-any
        }
        catch (err) {
            throw new Error(`Could not import hnswlib-node. Please install hnswlib-node as a dependency with, e.g. \`npm install -S hnswlib-node\`.\n\nError: ${err?.message}`);
        }
    }
}
|