122 lines
4.1 KiB
JavaScript
122 lines
4.1 KiB
JavaScript
import { existsSync, readFileSync } from "fs";
|
|
import { parse } from "url";
|
|
import { Embeddings } from "@langchain/core/embeddings";
|
|
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
/**
|
|
* The default Jina API URL for embedding requests.
|
|
*/
|
|
// The only endpoint this module talks to: _embed() POSTs every embedding request here.
const JINA_API_URL = "https://api.jina.ai/v1/embeddings";
|
|
/**
 * Check whether a URI refers to a file that exists on the local filesystem.
 *
 * Inputs without a URL scheme, and Windows drive-letter paths such as
 * `C:\images\cat.png` (which the URL parser reports as scheme `c:`), are
 * treated as plain filesystem paths. `file:` URLs are resolved through the
 * WHATWG `URL` class. Every other scheme is considered non-local.
 *
 * @param url - The URL or filesystem path to check.
 * @returns True if the input points at an existing local file, false otherwise.
 * @throws {TypeError} If `url` is not a string (raised by `url.parse`).
 */
function isLocal(url) {
  const { protocol } = parse(url);

  // Check the raw string rather than the parsed `pathname`: url.parse()
  // percent-escapes spaces and splits off `?`/`#`, so the parsed pathname is
  // not necessarily the path that exists on disk.
  // `protocol` is already lower-cased by url.parse(), hence the [a-z] class.
  if (protocol === null || /^[a-z]:$/.test(protocol)) {
    return existsSync(url);
  }

  if (protocol === "file:") {
    try {
      // fs functions accept file: URL objects and translate them into a
      // native path, including percent-decoding.
      return existsSync(new URL(url));
    } catch {
      // Not a well-formed file: URL, so it cannot name a local file.
      return false;
    }
  }

  return false;
}
|
|
/**
 * Read a local file and return its contents as a base64 string.
 *
 * @param filePath - Filesystem path, or `file:` URL string, of the file to read.
 * @returns The file contents, base64-encoded.
 * @throws If the file cannot be read (error raised by `readFileSync`) or the
 *   `file:` URL string is malformed.
 */
function getBytesStr(filePath) {
  // readFileSync treats a "file:..." *string* as a literal relative path, so
  // such inputs must be handed over as URL objects to be resolved correctly.
  const isFileUrlString =
    typeof filePath === "string" && /^file:/i.test(filePath);
  const target = isFileUrlString ? new URL(filePath) : filePath;

  // Without an encoding argument readFileSync already returns a Buffer.
  return readFileSync(target).toString("base64");
}
|
|
/**
 * A class for generating embeddings using the Jina API.
 *
 * The API key is read from `fields.apiKey`, falling back to the
 * `JINA_API_KEY` and then the `JINA_AUTH_TOKEN` environment variable.
 *
 * @example
 * ```typescript
 * // Embed a query using the JinaEmbeddings class
 * const model = new JinaEmbeddings();
 * const res = await model.embedQuery(
 *   "What would be a good name for a semantic search engine ?",
 * );
 * console.log({ res });
 * ```
 */
export class JinaEmbeddings extends Embeddings {
  /**
   * Constructor for the JinaEmbeddings class.
   * @param fields - An optional object with properties to configure the
   *   instance (`apiKey`, `model`); it is also forwarded to the base class.
   * @throws {Error} If no API key is supplied and none is found in the environment.
   */
  constructor(fields) {
    // Resolve the model before spreading: an explicit `model: undefined` in
    // `fields` would otherwise overwrite the default and leave it unset.
    const model = fields?.model ?? "jina-embeddings-v2-base-en";
    const fieldsWithDefaults = { ...fields, model };
    super(fieldsWithDefaults);

    // Declared as own, enumerable, writable properties (class-field semantics).
    Object.defineProperty(this, "apiKey", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0,
    });
    Object.defineProperty(this, "model", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0,
    });

    // `||` is intentional: an empty-string key falls through to the environment.
    const apiKey =
      fieldsWithDefaults.apiKey ||
      getEnvironmentVariable("JINA_API_KEY") ||
      getEnvironmentVariable("JINA_AUTH_TOKEN");
    if (!apiKey) {
      throw new Error("Jina API key not found");
    }

    this.model = model;
    this.apiKey = apiKey;
  }

  /**
   * Generates embeddings for an array of inputs.
   * @param input - An array of strings or objects to generate embeddings for.
   * @returns A Promise that resolves to an array of embeddings, ordered by the
   *   `index` field of each item in the response.
   * @throws {Error} If the response is not JSON or carries no `data` array.
   */
  async _embed(input) {
    // Nothing to embed: skip the network round-trip.
    if (Array.isArray(input) && input.length === 0) {
      return [];
    }

    const response = await fetch(JINA_API_URL, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ input, model: this.model }),
    });

    let json;
    try {
      json = await response.json();
    } catch (err) {
      // A non-JSON body would otherwise surface as a bare SyntaxError with no
      // hint about which request failed.
      throw new Error(
        `Jina API returned a non-JSON response (HTTP ${response.status})`,
        { cause: err }
      );
    }

    if (!Array.isArray(json?.data)) {
      const detail = json?.detail;
      let message = "";
      if (typeof detail === "string") {
        message = detail;
      } else if (detail != null) {
        // A structured `detail` would stringify to "[object Object]".
        message = JSON.stringify(detail);
      }
      throw new Error(message || "Unknown error from Jina API");
    }

    // Copy before sorting so the parsed response is not mutated.
    return [...json.data]
      .sort((a, b) => a.index - b.index)
      .map((item) => item.embedding);
  }

  /**
   * Generates embeddings for an array of texts.
   * @param texts - An array of strings to generate embeddings for.
   * @returns A Promise that resolves to an array of embeddings.
   */
  async embedDocuments(texts) {
    return this._embed(texts);
  }

  /**
   * Generates an embedding for a single text.
   * @param text - A string to generate an embedding for.
   * @returns A Promise that resolves to an array of numbers representing the embedding.
   */
  async embedQuery(text) {
    const [embedding] = await this._embed([text]);
    return embedding;
  }

  /**
   * Generates embeddings for an array of image URIs.
   *
   * URIs that `isLocal` reports as existing local files are replaced by the
   * file's base64-encoded contents; all other URIs are sent unchanged.
   *
   * @param uris - An array of image URIs to generate embeddings for.
   * @returns A Promise that resolves to an array of embeddings.
   */
  async embedImages(uris) {
    // NOTE(review): entries are sent as bare strings; confirm against the Jina
    // API reference whether image inputs must be wrapped in objects.
    const input = uris.map((uri) => (isLocal(uri) ? getBytesStr(uri) : uri));
    return this._embed(input);
  }
}
|