agsamantha/node_modules/js-tiktoken/dist/chunk-XX6PTLQF.js
2024-10-02 15:15:21 -05:00

239 lines
7.2 KiB
JavaScript

import base64 from 'base64-js';
// Bundler-emitted helpers used to install public (static) class fields.
var __defProp = Object.defineProperty;

/**
 * Stores `value` under `key` on `obj`.
 *
 * When `key` is already reachable on `obj` (own or inherited), the property is
 * (re)defined as an enumerable, configurable, writable data property;
 * otherwise a plain assignment is performed.
 */
var __defNormalProp = (obj, key, value) => {
  if (key in obj) {
    return __defProp(obj, key, {
      enumerable: true,
      configurable: true,
      writable: true,
      value
    });
  }
  return obj[key] = value;
};

/**
 * Sets a public field on `obj` and hands back the value that was stored.
 * Non-symbol keys are coerced to strings before being used.
 */
var __publicField = (obj, key, value) => {
  const fieldKey = typeof key === "symbol" ? key : key + "";
  __defNormalProp(obj, fieldKey, value);
  return value;
};
// src/utils.ts
/**
 * No-op: accepts a single argument, ignores it, and returns `undefined`.
 *
 * NOTE(review): presumably the compiled form of a TypeScript exhaustiveness
 * helper — confirm against the original source.
 *
 * @param {*} _ - Ignored.
 * @returns {undefined}
 */
function never(_) {
  return undefined;
}
/**
 * Greedily merges adjacent byte spans of `piece` by ascending rank.
 *
 * Starts with one span per element. On every pass, the adjacent pair whose
 * combined bytes have the lowest rank in `ranks` is fused into a single span
 * (the leftmost pair wins ties). Stops when only one span is left or when no
 * adjacent pair has a rank.
 *
 * @param {ArrayLike<number>} piece - Byte values; must support `slice(...).join(",")`.
 * @param {Map<string, number>} ranks - Rank lookup keyed by comma-joined byte values.
 * @returns {{start: number, end: number}[]} Half-open spans covering `piece` in order.
 */
function bytePairMerge(piece, ranks) {
  const spans = Array.from({ length: piece.length }, (_unused, pos) => {
    return { start: pos, end: pos + 1 };
  });

  for (;;) {
    if (spans.length <= 1) {
      return spans;
    }

    // Scan every adjacent pair and remember the first one with the lowest rank.
    let bestRank = null;
    let bestIndex = -1;
    for (let left = 0; left + 1 < spans.length; left++) {
      const key = piece.slice(spans[left].start, spans[left + 1].end).join(",");
      const candidate = ranks.get(key);
      if (candidate != null && (bestRank == null || candidate < bestRank)) {
        bestRank = candidate;
        bestIndex = left;
      }
    }

    // No mergeable pair remains.
    if (bestRank == null) {
      return spans;
    }

    // Replace the winning pair with one span covering both.
    const fused = { start: spans[bestIndex].start, end: spans[bestIndex + 1].end };
    spans.splice(bestIndex, 2, fused);
  }
}
/**
 * Converts a byte sequence into token ranks.
 *
 * A one-element `piece` is looked up directly and its result is returned as-is
 * (so the array may hold `undefined` when the byte has no rank). Longer pieces
 * are split with `bytePairMerge`, each span is looked up, and spans without a
 * rank are dropped.
 *
 * @param {ArrayLike<number>} piece - Byte values; must support `slice` and `join`.
 * @param {Map<string, number>} ranks - Rank lookup keyed by comma-joined byte values.
 * @returns {Array<number|undefined>} Ranks in span order.
 */
function bytePairEncode(piece, ranks) {
  const rankOf = (bytes) => ranks.get(bytes.join(","));

  if (piece.length === 1) {
    return [rankOf(piece)];
  }

  const tokens = [];
  for (const span of bytePairMerge(piece, ranks)) {
    const rank = rankOf(piece.slice(span.start, span.end));
    if (rank != null) {
      tokens.push(rank);
    }
  }
  return tokens;
}
/**
 * Backslash-escapes regular-expression metacharacters in `str` so the result
 * can be embedded in a `RegExp` source and match the text literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} `str` with each of `\ ^ $ * + ? . ( ) | [ ] { }` prefixed by a backslash.
 */
function escapeRegex(str) {
  const metaCharacters = /[\\^$*+?.()|[\]{}]/g;
  // "$&" re-inserts the matched character after the added backslash.
  const escapedForm = "\\$&";
  return str.replace(metaCharacters, escapedForm);
}
/**
 * Byte-pair tokenizer built from a rank table, a splitting pattern and a set
 * of special tokens.
 *
 * The static `specialTokenRegex` helper used by `encode` is attached to the
 * class after its definition.
 */
var _Tiktoken = class {
  /** @internal Special-token text -> rank (base table merged with the extensions). */
  specialTokens;
  /** @internal Special-token rank -> UTF-8 bytes of its text. */
  inverseSpecialTokens;
  /** @internal Source of the regular expression used to split text into pieces. */
  patStr;
  /** @internal */
  textEncoder = new TextEncoder();
  /** @internal */
  textDecoder = new TextDecoder("utf-8");
  /** @internal Comma-joined byte values -> rank. */
  rankMap = /* @__PURE__ */ new Map();
  /** @internal Rank -> token bytes. */
  textMap = /* @__PURE__ */ new Map();

  /**
   * @param {{pat_str: string, bpe_ranks: string, special_tokens: Object<string, number>}} ranks
   *   Encoding definition. Each non-empty line of `bpe_ranks` is space-separated:
   *   the first field is ignored, the second is a base-10 starting rank, and the
   *   remaining fields are base64 tokens that receive consecutive ranks.
   * @param {Object<string, number>} [extendedSpecialTokens]
   *   Extra special tokens; entries override `ranks.special_tokens` on key clash.
   */
  constructor(ranks, extendedSpecialTokens) {
    this.patStr = ranks.pat_str;

    // Expand the compact rank listing into a base64-token -> rank table.
    const rankByEncodedToken = {};
    for (const line of ranks.bpe_ranks.split("\n")) {
      if (!line) {
        continue;
      }
      const [, firstRankText, ...encodedTokens] = line.split(" ");
      const firstRank = Number.parseInt(firstRankText, 10);
      encodedTokens.forEach((encoded, position) => {
        rankByEncodedToken[encoded] = firstRank + position;
      });
    }

    // Index every token both ways: bytes -> rank and rank -> bytes.
    for (const [encoded, rank] of Object.entries(rankByEncodedToken)) {
      const tokenBytes = base64.toByteArray(encoded);
      this.rankMap.set(tokenBytes.join(","), rank);
      this.textMap.set(rank, tokenBytes);
    }

    this.specialTokens = { ...ranks.special_tokens, ...extendedSpecialTokens };

    const bytesBySpecialRank = {};
    for (const [specialText, rank] of Object.entries(this.specialTokens)) {
      bytesBySpecialRank[rank] = this.textEncoder.encode(specialText);
    }
    this.inverseSpecialTokens = bytesBySpecialRank;
  }

  /**
   * Tokenizes `text` into an array of ranks.
   *
   * @param {string} text - Input to tokenize.
   * @param {string[]|"all"} [allowedSpecial=[]]
   *   Special tokens that are emitted as their own rank when found in `text`;
   *   `"all"` allows every known special token.
   * @param {string[]|"all"} [disallowedSpecial="all"]
   *   Special tokens whose presence in `text` raises an error; `"all"` means
   *   every known special token that is not allowed.
   * @returns {number[]} Token ranks in text order.
   * @throws {Error} When `text` contains a disallowed special token.
   */
  encode(text, allowedSpecial = [], disallowedSpecial = "all") {
    const pieceRegex = new RegExp(this.patStr, "ug");
    const knownSpecials = Object.keys(this.specialTokens);
    const specialRegex = _Tiktoken.specialTokenRegex(knownSpecials);
    const output = [];

    const allowed = new Set(allowedSpecial === "all" ? knownSpecials : allowedSpecial);
    let forbiddenSource = disallowedSpecial;
    if (disallowedSpecial === "all") {
      forbiddenSource = knownSpecials.filter((name) => !allowed.has(name));
    }
    const forbidden = new Set(forbiddenSource);

    if (forbidden.size > 0) {
      const forbiddenRegex = _Tiktoken.specialTokenRegex([...forbidden]);
      const offending = text.match(forbiddenRegex);
      if (offending != null) {
        throw new Error(
          `The text contains a special token that is not allowed: ${offending[0]}`
        );
      }
    }

    let cursor = 0;
    for (;;) {
      // Locate the next *allowed* special token at or after `cursor`. Matches
      // that are not allowed are stepped over one character at a time so they
      // end up tokenized as ordinary text.
      let searchFrom = cursor;
      let hit;
      do {
        specialRegex.lastIndex = searchFrom;
        hit = specialRegex.exec(text);
        if (hit != null) {
          searchFrom = hit.index + 1;
        }
      } while (hit != null && !allowed.has(hit[0]));

      // Tokenize the ordinary text that precedes the special token (or the rest).
      const stop = hit == null ? text.length : hit.index;
      const ordinaryText = text.substring(cursor, stop);
      for (const found of ordinaryText.matchAll(pieceRegex)) {
        const pieceBytes = this.textEncoder.encode(found[0]);
        const wholePieceRank = this.rankMap.get(pieceBytes.join(","));
        if (wholePieceRank == null) {
          output.push(...bytePairEncode(pieceBytes, this.rankMap));
        } else {
          output.push(wholePieceRank);
        }
      }

      if (hit == null) {
        return output;
      }

      output.push(this.specialTokens[hit[0]]);
      cursor = hit.index + hit[0].length;
    }
  }

  /**
   * Turns token ranks back into text.
   *
   * Ranks found in neither the regular nor the special-token table are
   * skipped. The bytes of all resolved tokens are concatenated and decoded as
   * UTF-8 in a single pass.
   *
   * @param {ArrayLike<number>} tokens - Ranks to decode.
   * @returns {string} Decoded text.
   */
  decode(tokens) {
    const chunks = [];
    let totalBytes = 0;
    for (let position = 0; position < tokens.length; ++position) {
      const rank = tokens[position];
      const chunk = this.textMap.get(rank) ?? this.inverseSpecialTokens[rank];
      if (chunk == null) {
        continue;
      }
      chunks.push(chunk);
      totalBytes += chunk.length;
    }

    const joined = new Uint8Array(totalBytes);
    let writeAt = 0;
    chunks.forEach((chunk) => {
      joined.set(chunk, writeAt);
      writeAt += chunk.length;
    });
    return this.textDecoder.decode(joined);
  }
};
var Tiktoken = _Tiktoken;

// Static helper: builds a global RegExp that matches any of the given token
// strings literally (each one is escaped, then all are joined as alternatives).
__publicField(Tiktoken, "specialTokenRegex", (tokens) => {
  const alternatives = tokens.map((token) => escapeRegex(token));
  return new RegExp(alternatives.join("|"), "g");
});
/**
 * Maps a model identifier to the name of the encoding it uses.
 *
 * The comparison is exact (no prefix or case-insensitive matching).
 *
 * @param {string} model - Model identifier, e.g. "gpt-4" or "text-davinci-003".
 * @returns {"gpt2"|"p50k_base"|"p50k_edit"|"r50k_base"|"cl100k_base"|"o200k_base"}
 *   Encoding name for the model.
 * @throws {Error} "Unknown model" when the identifier is not listed.
 */
function getEncodingNameForModel(model) {
  // Each entry pairs an encoding name with the models that use it.
  const modelsByEncoding = [
    ["gpt2", ["gpt2"]],
    [
      "p50k_base",
      [
        "code-cushman-001",
        "code-cushman-002",
        "code-davinci-001",
        "code-davinci-002",
        "cushman-codex",
        "davinci-codex",
        "davinci-002",
        "text-davinci-002",
        "text-davinci-003"
      ]
    ],
    ["p50k_edit", ["code-davinci-edit-001", "text-davinci-edit-001"]],
    [
      "r50k_base",
      [
        "ada",
        "babbage",
        "babbage-002",
        "code-search-ada-code-001",
        "code-search-babbage-code-001",
        "curie",
        "davinci",
        "text-ada-001",
        "text-babbage-001",
        "text-curie-001",
        "text-davinci-001",
        "text-search-ada-doc-001",
        "text-search-babbage-doc-001",
        "text-search-curie-doc-001",
        "text-search-davinci-doc-001",
        "text-similarity-ada-001",
        "text-similarity-babbage-001",
        "text-similarity-curie-001",
        "text-similarity-davinci-001"
      ]
    ],
    [
      "cl100k_base",
      [
        "gpt-3.5-turbo-instruct-0914",
        "gpt-3.5-turbo-instruct",
        "gpt-3.5-turbo-16k-0613",
        "gpt-3.5-turbo-16k",
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-0301",
        "gpt-3.5-turbo",
        "gpt-4-32k-0613",
        "gpt-4-32k-0314",
        "gpt-4-32k",
        "gpt-4-0613",
        "gpt-4-0314",
        "gpt-4",
        "gpt-3.5-turbo-1106",
        "gpt-35-turbo",
        "gpt-4-1106-preview",
        "gpt-4-vision-preview",
        "gpt-3.5-turbo-0125",
        "gpt-4-turbo",
        "gpt-4-turbo-2024-04-09",
        "gpt-4-turbo-preview",
        "gpt-4-0125-preview",
        "text-embedding-ada-002",
        "text-embedding-3-small",
        "text-embedding-3-large"
      ]
    ],
    [
      "o200k_base",
      [
        "gpt-4o",
        "gpt-4o-2024-05-13",
        "gpt-4o-2024-08-06",
        "gpt-4o-mini-2024-07-18",
        "gpt-4o-mini"
      ]
    ]
  ];

  for (const [encodingName, models] of modelsByEncoding) {
    if (models.includes(model)) {
      return encodingName;
    }
  }
  throw new Error("Unknown model");
}
export { Tiktoken, getEncodingNameForModel, never };