import { insecureHash } from "@langchain/core/utils/hash"; import { Embeddings, } from "@langchain/core/embeddings"; import { EncoderBackedStore } from "../storage/encoder_backed.js"; /** * Interface for caching results from embedding models. * * The interface allows works with any store that implements * the abstract store interface accepting keys of type str and values of list of * floats. * * If need be, the interface can be extended to accept other implementations * of the value serializer and deserializer, as well as the key encoder. * @example * ```typescript * const underlyingEmbeddings = new OpenAIEmbeddings(); * * const cacheBackedEmbeddings = CacheBackedEmbeddings.fromBytesStore( * underlyingEmbeddings, * new ConvexKVStore({ ctx }), * { * namespace: underlyingEmbeddings.modelName, * }, * ); * * const loader = new TextLoader("./state_of_the_union.txt"); * const rawDocuments = await loader.load(); * const splitter = new RecursiveCharacterTextSplitter({ * chunkSize: 1000, * chunkOverlap: 0, * }); * const documents = await splitter.splitDocuments(rawDocuments); * * let time = Date.now(); * const vectorstore = await ConvexVectorStore.fromDocuments( * documents, * cacheBackedEmbeddings, * { ctx }, * ); * console.log(`Initial creation time: ${Date.now() - time}ms`); * * time = Date.now(); * const vectorstore2 = await ConvexVectorStore.fromDocuments( * documents, * cacheBackedEmbeddings, * { ctx }, * ); * console.log(`Cached creation time: ${Date.now() - time}ms`); * * ``` */ export class CacheBackedEmbeddings extends Embeddings { constructor(fields) { super(fields); Object.defineProperty(this, "underlyingEmbeddings", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "documentEmbeddingStore", { enumerable: true, configurable: true, writable: true, value: void 0 }); this.underlyingEmbeddings = fields.underlyingEmbeddings; this.documentEmbeddingStore = fields.documentEmbeddingStore; } /** * Embed query text. * * This method does not support caching at the moment. * * Support for caching queries is easy to implement, but might make * sense to hold off to see the most common patterns. * * If the cache has an eviction policy, we may need to be a bit more careful * about sharing the cache between documents and queries. Generally, * one is OK evicting query caches, but document caches should be kept. * * @param document The text to embed. * @returns The embedding for the given text. */ async embedQuery(document) { return this.underlyingEmbeddings.embedQuery(document); } /** * Embed a list of texts. * * The method first checks the cache for the embeddings. * If the embeddings are not found, the method uses the underlying embedder * to embed the documents and stores the results in the cache. * * @param documents * @returns A list of embeddings for the given texts. */ async embedDocuments(documents) { const vectors = await this.documentEmbeddingStore.mget(documents); const missingIndicies = []; const missingDocuments = []; for (let i = 0; i < vectors.length; i += 1) { if (vectors[i] === undefined) { missingIndicies.push(i); missingDocuments.push(documents[i]); } } if (missingDocuments.length) { const missingVectors = await this.underlyingEmbeddings.embedDocuments(missingDocuments); const keyValuePairs = missingDocuments.map((document, i) => [document, missingVectors[i]]); await this.documentEmbeddingStore.mset(keyValuePairs); for (let i = 0; i < missingIndicies.length; i += 1) { vectors[missingIndicies[i]] = missingVectors[i]; } } return vectors; } /** * Create a new CacheBackedEmbeddings instance from another embeddings instance * and a storage instance. * @param underlyingEmbeddings Embeddings used to populate the cache for new documents. * @param documentEmbeddingStore Stores raw document embedding values. Keys are hashes of the document content. * @param options.namespace Optional namespace for store keys. * @returns A new CacheBackedEmbeddings instance. */ static fromBytesStore(underlyingEmbeddings, documentEmbeddingStore, options) { const encoder = new TextEncoder(); const decoder = new TextDecoder(); const encoderBackedStore = new EncoderBackedStore({ store: documentEmbeddingStore, keyEncoder: (key) => (options?.namespace ?? "") + insecureHash(key), valueSerializer: (value) => encoder.encode(JSON.stringify(value)), valueDeserializer: (serializedValue) => JSON.parse(decoder.decode(serializedValue)), }); return new this({ underlyingEmbeddings, documentEmbeddingStore: encoderBackedStore, }); } }