agsamantha/node_modules/@langchain/community/dist/vectorstores/hnswlib.js

272 lines
12 KiB
JavaScript
Raw Normal View History

2024-10-02 20:15:21 +00:00
import { SaveableVectorStore } from "@langchain/core/vectorstores";
import { Document } from "@langchain/core/documents";
import { SynchronousInMemoryDocstore } from "../stores/doc/in_memory.js";
/**
 * Vector store backed by a Hierarchical Navigable Small World (HNSW) index
 * from `hnswlib-node`. It extends SaveableVectorStore and provides methods
 * for adding documents and vectors, running nearest-neighbour searches, and
 * saving/loading/deleting the store on disk.
 *
 * Each vector is stored in the index under a numeric label; the matching
 * document is stored in `docstore` under the same label.
 */
export class HNSWLib extends SaveableVectorStore {
    _vectorstoreType() {
        return "hnswlib";
    }
    /**
     * @param embeddings The embeddings used to turn texts into vectors.
     * @param args Store arguments. Read here: `index` (an existing index,
     * optional) and `docstore` (optional, defaults to a new
     * SynchronousInMemoryDocstore). `space` and `numDimensions` are read
     * later, when the index is created.
     */
    constructor(embeddings, args) {
        super(embeddings, args);
        Object.defineProperty(this, "_index", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "docstore", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "args", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this._index = args.index;
        this.args = args;
        this.embeddings = embeddings;
        this.docstore = args?.docstore ?? new SynchronousInMemoryDocstore();
    }
    /**
     * Method to add documents to the vector store. It embeds each document's
     * `pageContent`, then delegates to `addVectors`.
     * @param documents The documents to be added to the vector store.
     * @returns A Promise that resolves when the documents have been added.
     */
    async addDocuments(documents) {
        const texts = documents.map(({ pageContent }) => pageContent);
        return this.addVectors(await this.embeddings.embedDocuments(texts), documents);
    }
    /**
     * Creates a new, empty `HierarchicalNSW` index object.
     * @param args Must carry `space` and `numDimensions`.
     * @returns A Promise that resolves to the new index.
     * @throws Error if `hnswlib-node` cannot be imported, or if `space` or
     * `numDimensions` is missing.
     */
    static async getHierarchicalNSW(args) {
        const { HierarchicalNSW } = await HNSWLib.imports();
        if (!args.space) {
            throw new Error("hnswlib-node requires a space argument");
        }
        if (args.numDimensions === undefined) {
            throw new Error("hnswlib-node requires a numDimensions argument");
        }
        return new HierarchicalNSW(args.space, args.numDimensions);
    }
    /**
     * Ensures an index object exists and has been initialised.
     * If `args.numDimensions` is unset it is taken from the first vector.
     * An index that currently holds no elements is (re)initialised with a
     * capacity of `vectors.length`.
     * @param vectors The vectors about to be added.
     */
    async initIndex(vectors) {
        if (!this._index) {
            if (this.args.numDimensions === undefined) {
                this.args.numDimensions = vectors[0].length;
            }
            this.index = await HNSWLib.getHierarchicalNSW(this.args);
        }
        if (!this.index.getCurrentCount()) {
            this.index.initIndex(vectors.length);
        }
    }
    /**
     * The underlying index.
     * @throws Error if no index has been created or supplied yet.
     */
    get index() {
        if (!this._index) {
            throw new Error("Vector store not initialised yet. Try calling `addTexts` first.");
        }
        return this._index;
    }
    set index(index) {
        this._index = index;
    }
    /**
     * Method to add vectors to the vector store. It initializes the index if
     * needed, grows it when the new vectors would exceed its capacity, then
     * adds the vectors to the index and the documents to the document store.
     * @param vectors The vectors to be added to the vector store.
     * @param documents The documents corresponding to the vectors.
     * @returns A Promise that resolves when the vectors and documents have been added.
     * @throws Error if `vectors` and `documents` differ in length, or if any
     * vector's length differs from `args.numDimensions`.
     */
    async addVectors(vectors, documents) {
        if (vectors.length === 0) {
            return;
        }
        // Check the pairing before touching the index so that a bad call
        // does not leave a freshly created index behind.
        if (vectors.length !== documents.length) {
            throw new Error(`Vectors and metadatas must have the same length`);
        }
        await this.initIndex(vectors);
        // TODO here we could optionally normalise the vectors to unit length
        // so that dot product is equivalent to cosine similarity, like this
        // https://github.com/nmslib/hnswlib/issues/384#issuecomment-1155737730
        // While we only support OpenAI embeddings this isn't necessary
        const { numDimensions } = this.args;
        // Validate every vector up front: a failure part-way through the
        // insertion loop would leave points in the index that have no
        // matching entry in the docstore.
        if (vectors.some((vector) => vector.length !== numDimensions)) {
            throw new Error(`Vectors must have the same length as the number of dimensions (${numDimensions})`);
        }
        const capacity = this.index.getMaxElements();
        const needed = this.index.getCurrentCount() + vectors.length;
        if (needed > capacity) {
            this.index.resizeIndex(needed);
        }
        // New labels continue from the current element count, so the index
        // label and the docstore key of a document are always the same number.
        const docstoreSize = this.index.getCurrentCount();
        const toSave = {};
        for (let i = 0; i < vectors.length; i += 1) {
            this.index.addPoint(vectors[i], docstoreSize + i);
            toSave[docstoreSize + i] = documents[i];
        }
        this.docstore.add(toSave);
    }
    /**
     * Method to perform a nearest-neighbour search in the vector store using
     * a query vector. If `k` exceeds the number of stored elements it is
     * reduced to that number and a warning is logged.
     * @param query The query vector.
     * @param k The number of nearest documents to return.
     * @param filter An optional predicate; only documents for which it returns a truthy value are considered.
     * @returns A Promise that resolves to an array of tuples, where each tuple contains a document and the distance the index reported for it.
     * @throws Error if the query length differs from `args.numDimensions`, or if the index has not been initialised.
     */
    async similaritySearchVectorWithScore(query, k, filter) {
        // With known dimensions but no index yet, create an empty index so
        // the search below runs against zero elements.
        if (this.args.numDimensions && !this._index) {
            await this.initIndex([[]]);
        }
        if (query.length !== this.args.numDimensions) {
            throw new Error(`Query vector must have the same length as the number of dimensions (${this.args.numDimensions})`);
        }
        const total = this.index.getCurrentCount();
        if (k > total) {
            console.warn(`k (${k}) is greater than the number of elements in the index (${total}), setting k to ${total}`);
            // eslint-disable-next-line no-param-reassign
            k = total;
        }
        // Adapts the document-level `filter` to the label-level predicate
        // that `searchKnn` accepts.
        const filterFunction = (label) => {
            if (!filter) {
                return true;
            }
            const document = this.docstore.search(String(label));
            if (typeof document !== "string") {
                return filter(document);
            }
            return false;
        };
        const result = this.index.searchKnn(query, k, filter ? filterFunction : undefined);
        return result.neighbors.map((docIndex, resultIndex) => [
            this.docstore.search(String(docIndex)),
            result.distances[resultIndex],
        ]);
    }
    /**
     * Method to delete the vector store from a directory. It deletes the
     * hnswlib.index file, the docstore.json file, and the args.json file from
     * the directory.
     * @param params An object with a directory property that specifies the directory from which to delete the vector store.
     * @returns A Promise that resolves when the vector store has been deleted.
     * @throws Error if the hnswlib.index file in the directory cannot be accessed.
     */
    async delete(params) {
        const fs = await import("node:fs/promises");
        const path = await import("node:path");
        try {
            await fs.access(path.join(params.directory, "hnswlib.index"));
        }
        catch (err) {
            throw new Error(`Directory ${params.directory} does not contain a hnswlib.index file.`, { cause: err });
        }
        // The promises are handed to Promise.all un-awaited so the three
        // removals actually run concurrently.
        await Promise.all([
            fs.rm(path.join(params.directory, "hnswlib.index"), {
                force: true,
            }),
            fs.rm(path.join(params.directory, "docstore.json"), {
                force: true,
            }),
            fs.rm(path.join(params.directory, "args.json"), { force: true }),
        ]);
    }
    /**
     * Method to save the vector store to a directory. It saves the HNSW
     * index, the arguments, and the document store to the directory,
     * creating the directory first if necessary.
     * @param directory The directory to which to save the vector store.
     * @returns A Promise that resolves when the vector store has been saved.
     */
    async save(directory) {
        const fs = await import("node:fs/promises");
        const path = await import("node:path");
        await fs.mkdir(directory, { recursive: true });
        // All three promises must be owned by Promise.all: awaiting one of
        // them inline would leave the already-started index write unhandled
        // if that inline await rejected.
        await Promise.all([
            this.index.writeIndex(path.join(directory, "hnswlib.index")),
            fs.writeFile(path.join(directory, "args.json"), JSON.stringify(this.args)),
            fs.writeFile(path.join(directory, "docstore.json"), JSON.stringify(Array.from(this.docstore._docs.entries()))),
        ]);
    }
    /**
     * Static method to load a vector store from a directory. It reads the
     * HNSW index, the arguments, and the document store from the directory,
     * then creates a new HNSWLib instance with these values.
     * @param directory The directory from which to load the vector store.
     * @param embeddings The embeddings to be used by the HNSWLib instance.
     * @returns A Promise that resolves to a new HNSWLib instance.
     */
    static async load(directory, embeddings) {
        const fs = await import("node:fs/promises");
        const path = await import("node:path");
        const args = JSON.parse(await fs.readFile(path.join(directory, "args.json"), "utf8"));
        const index = await HNSWLib.getHierarchicalNSW(args);
        const [docstoreFiles] = await Promise.all([
            fs
                .readFile(path.join(directory, "docstore.json"), "utf8")
                .then(JSON.parse),
            index.readIndex(path.join(directory, "hnswlib.index")),
        ]);
        // docstore.json holds the [key, document] entries written by `save`.
        args.docstore = new SynchronousInMemoryDocstore(new Map(docstoreFiles));
        args.index = index;
        return new HNSWLib(embeddings, args);
    }
    /**
     * Static method to create a new HNSWLib instance from texts and metadata.
     * It creates a new Document instance for each text and metadata, then
     * calls the fromDocuments method to create the HNSWLib instance.
     * @param texts The texts to be used to create the documents.
     * @param metadatas The metadata to be used to create the documents: either an array (one entry per text) or a single value shared by every text.
     * @param embeddings The embeddings to be used by the HNSWLib instance.
     * @param dbConfig An optional configuration object for the document store.
     * @returns A Promise that resolves to a new HNSWLib instance.
     */
    static async fromTexts(texts, metadatas, embeddings, dbConfig) {
        const docs = [];
        for (let i = 0; i < texts.length; i += 1) {
            const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
            const newDoc = new Document({
                pageContent: texts[i],
                metadata,
            });
            docs.push(newDoc);
        }
        return HNSWLib.fromDocuments(docs, embeddings, dbConfig);
    }
    /**
     * Static method to create a new HNSWLib instance from documents. It
     * creates a new instance using the "cosine" space, adds the documents to
     * it, then returns the instance.
     * @param docs The documents to be added to the HNSWLib instance.
     * @param embeddings The embeddings to be used by the HNSWLib instance.
     * @param dbConfig An optional configuration object; only its `docstore` property is used.
     * @returns A Promise that resolves to a new HNSWLib instance.
     */
    static async fromDocuments(docs, embeddings, dbConfig) {
        const args = {
            docstore: dbConfig?.docstore,
            space: "cosine",
        };
        const instance = new this(embeddings, args);
        await instance.addDocuments(docs);
        return instance;
    }
    /**
     * Dynamically imports `hnswlib-node`.
     * @returns A Promise that resolves to an object holding the `HierarchicalNSW` constructor.
     * @throws Error with installation instructions if the import fails; the
     * original error is attached as `cause`.
     */
    static async imports() {
        try {
            const { default: { HierarchicalNSW }, } = await import("hnswlib-node");
            return { HierarchicalNSW };
        }
        catch (err) {
            throw new Error(`Could not import hnswlib-node. Please install hnswlib-node as a dependency with, e.g. \`npm install -S hnswlib-node\`.\n\nError: ${err?.message}`, { cause: err });
        }
    }
}