239 lines
7.2 KiB
JavaScript
239 lines
7.2 KiB
JavaScript
import base64 from 'base64-js';
|
|
|
|
// Bundler-style helpers used to attach public/static fields to objects.
var __defProp = Object.defineProperty;

/**
 * Writes `value` under `key` on `obj`.
 *
 * When `key` is already reachable on `obj` (own or inherited), the property is
 * (re)defined as an enumerable, configurable, writable data property;
 * otherwise a plain assignment is used.
 *
 * @param {object} obj - Target object.
 * @param {string|symbol} key - Property key.
 * @param {*} value - Value to store.
 * @returns {*} `obj` when the property was defined, otherwise `value`.
 */
function __defNormalProp(obj, key, value) {
  if (key in obj) {
    return __defProp(obj, key, {
      enumerable: true,
      configurable: true,
      writable: true,
      value,
    });
  }
  return (obj[key] = value);
}

/**
 * Stores `value` on `obj` under `key` and hands `value` back.
 *
 * Non-symbol keys are coerced to strings before being used.
 *
 * @param {object} obj - Target object.
 * @param {*} key - Property key; symbols are used as-is.
 * @param {*} value - Value to store.
 * @returns {*} The stored `value`.
 */
function __publicField(obj, key, value) {
  const propertyKey = typeof key === "symbol" ? key : key + "";
  __defNormalProp(obj, propertyKey, value);
  return value;
}
|
|
|
|
// src/utils.ts
|
|
/**
 * Accepts a single argument, ignores it, and returns `undefined`.
 *
 * @param {*} _ - Ignored.
 * @returns {undefined}
 */
function never(_) {
  return undefined;
}
|
|
/**
 * Greedily merges adjacent byte ranges of `piece`, lowest rank first.
 *
 * Starts with one range per element. On every pass the bytes covered by each
 * pair of neighbouring ranges are looked up in `ranks` (keyed by the
 * comma-joined byte values); the pair with the smallest rank is fused, the
 * earliest pair winning ties. Stops when no neighbouring pair has a rank or
 * when at most one range is left.
 *
 * @param {ArrayLike<number>} piece - Bytes to split; must support `slice` and `join`.
 * @param {Map<string, number>} ranks - Comma-joined bytes → rank.
 * @returns {{start: number, end: number}[]} Half-open ranges into `piece`.
 */
function bytePairMerge(piece, ranks) {
  const parts = Array.from({ length: piece.length }, (_, offset) => ({
    start: offset,
    end: offset + 1,
  }));

  while (parts.length > 1) {
    let bestIndex = -1;
    let bestRank;

    for (let idx = 0; idx + 1 < parts.length; idx += 1) {
      const key = piece.slice(parts[idx].start, parts[idx + 1].end).join(",");
      const rank = ranks.get(key);
      const isCandidate = rank != null && (bestIndex === -1 || rank < bestRank);
      if (isCandidate) {
        bestRank = rank;
        bestIndex = idx;
      }
    }

    if (bestIndex === -1) {
      // No neighbouring pair is known to `ranks`; nothing more to merge.
      break;
    }

    // Replace the winning pair with a single range spanning both.
    const fused = {
      start: parts[bestIndex].start,
      end: parts[bestIndex + 1].end,
    };
    parts.splice(bestIndex, 2, fused);
  }

  return parts;
}
|
|
/**
 * Encodes `piece` into ranks by byte-pair merging.
 *
 * The piece is split with `bytePairMerge` and each resulting range is looked
 * up in `ranks` (keyed by the comma-joined byte values). Ranges without a
 * rank are omitted from the result.
 *
 * A single-byte piece skips the merge step. Its lookup is filtered exactly
 * like the multi-byte path, so an unknown byte yields `[]` rather than
 * `[undefined]` and no `undefined` entry can leak into a caller's token list.
 *
 * @param {ArrayLike<number>} piece - Bytes to encode; must support `slice` and `join`.
 * @param {Map<string, number>} ranks - Comma-joined bytes → rank.
 * @returns {number[]} Ranks for the merged ranges, in order.
 */
function bytePairEncode(piece, ranks) {
  const rankOf = (bytes) => ranks.get(bytes.join(","));

  if (piece.length === 1) {
    const rank = rankOf(piece);
    return rank == null ? [] : [rank];
  }

  return bytePairMerge(piece, ranks)
    .map(({ start, end }) => rankOf(piece.slice(start, end)))
    .filter((rank) => rank != null);
}
|
|
/**
 * Backslash-escapes the regex metacharacters in `str` so the result can be
 * embedded in a `RegExp` source and match the text literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} `str` with each of `\ ^ $ * + ? . ( ) | [ ] { }` prefixed by a backslash.
 */
function escapeRegex(str) {
  const metaCharacters = /[\\^$*+?.()|[\]{}]/g;
  const escaped = str.replace(metaCharacters, "\\$&");
  return escaped;
}
|
|
/**
 * Byte-pair-encoding tokenizer driven by a rank table, a splitting pattern
 * and a set of special tokens.
 */
var _Tiktoken = class {
  /** @internal Special-token text → token id. */
  specialTokens;

  /** @internal Token id → encoded bytes of the special-token text. */
  inverseSpecialTokens;

  /** @internal Source of the splitting regex; compiled with the "ug" flags in `encode`. */
  patStr;

  /** @internal */
  textEncoder = new TextEncoder();

  /** @internal */
  textDecoder = new TextDecoder("utf-8");

  /** @internal Comma-joined byte values → rank. */
  rankMap = /* @__PURE__ */ new Map();

  /** @internal Rank → byte array. */
  textMap = /* @__PURE__ */ new Map();

  /**
   * @param {{pat_str: string, bpe_ranks: string, special_tokens: Object<string, number>}} ranks
   *   Encoding definition. `bpe_ranks` is newline-separated; each non-empty
   *   line is split on single spaces: the first field is ignored, the second
   *   is a base-10 offset, and every following field is a base64 token whose
   *   rank is `offset + position`.
   * @param {Object<string, number>} [extendedSpecialTokens]
   *   Extra special tokens; entries override `ranks.special_tokens`.
   */
  constructor(ranks, extendedSpecialTokens) {
    this.patStr = ranks.pat_str;

    // Expand the compressed rank listing into base64-token → rank.
    const rankByToken = {};
    for (const line of ranks.bpe_ranks.split("\n")) {
      if (!line) {
        continue;
      }
      const [, offsetStr, ...encodedTokens] = line.split(" ");
      const offset = Number.parseInt(offsetStr, 10);
      encodedTokens.forEach((encoded, position) => {
        rankByToken[encoded] = offset + position;
      });
    }

    // Index every token both ways: bytes → rank and rank → bytes.
    for (const [encoded, rank] of Object.entries(rankByToken)) {
      const bytes = base64.toByteArray(encoded);
      this.rankMap.set(bytes.join(","), rank);
      this.textMap.set(rank, bytes);
    }

    this.specialTokens = { ...ranks.special_tokens, ...extendedSpecialTokens };

    const bytesBySpecialId = {};
    for (const [text, rank] of Object.entries(this.specialTokens)) {
      bytesBySpecialId[rank] = this.textEncoder.encode(text);
    }
    this.inverseSpecialTokens = bytesBySpecialId;
  }

  /**
   * Converts `text` into a list of token ids.
   *
   * Allowed special tokens found in `text` are emitted as their own ids; the
   * text between them is split with the pattern and each piece is either
   * looked up directly in the rank map or byte-pair encoded. A special token
   * that is neither allowed nor disallowed is encoded as ordinary text.
   *
   * @param {string} text - Text to encode.
   * @param {"all"|Iterable<string>} [allowedSpecial=[]]
   *   Special tokens to emit as special ids; `"all"` allows every known one.
   * @param {"all"|Iterable<string>} [disallowedSpecial="all"]
   *   Special tokens that must not occur in `text`; `"all"` means every known
   *   special token that is not allowed.
   * @returns {number[]} Token ids.
   * @throws {Error} If `text` contains a disallowed special token.
   */
  encode(text, allowedSpecial = [], disallowedSpecial = "all") {
    const pieceRegex = new RegExp(this.patStr, "ug");
    const specialRegex = _Tiktoken.specialTokenRegex(
      Object.keys(this.specialTokens),
    );
    const output = [];

    const allowed = new Set(
      allowedSpecial === "all" ? Object.keys(this.specialTokens) : allowedSpecial,
    );
    const disallowed = new Set(
      disallowedSpecial === "all"
        ? Object.keys(this.specialTokens).filter((name) => !allowed.has(name))
        : disallowedSpecial,
    );

    if (disallowed.size > 0) {
      const disallowedRegex = _Tiktoken.specialTokenRegex([...disallowed]);
      const offending = text.match(disallowedRegex);
      if (offending != null) {
        throw new Error(
          `The text contains a special token that is not allowed: ${offending[0]}`
        );
      }
    }

    let cursor = 0;
    for (;;) {
      // Find the next allowed special token at or after `cursor`, stepping
      // one character past every hit that is not allowed.
      let nextSpecial = null;
      let searchFrom = cursor;
      for (;;) {
        specialRegex.lastIndex = searchFrom;
        nextSpecial = specialRegex.exec(text);
        if (nextSpecial == null || allowed.has(nextSpecial[0])) {
          break;
        }
        searchFrom = nextSpecial.index + 1;
      }

      // Ordinary text runs up to that special token (or to the end of input).
      const segmentEnd = nextSpecial == null ? text.length : nextSpecial.index;
      const segment = text.substring(cursor, segmentEnd);
      for (const found of segment.matchAll(pieceRegex)) {
        const piece = this.textEncoder.encode(found[0]);
        const directRank = this.rankMap.get(piece.join(","));
        if (directRank != null) {
          output.push(directRank);
        } else {
          output.push(...bytePairEncode(piece, this.rankMap));
        }
      }

      if (nextSpecial == null) {
        break;
      }

      output.push(this.specialTokens[nextSpecial[0]]);
      cursor = nextSpecial.index + nextSpecial[0].length;
    }

    return output;
  }

  /**
   * Converts token ids back into text.
   *
   * Each id is resolved through the rank table first and the special tokens
   * second; ids found in neither are skipped. The collected bytes are
   * concatenated and decoded as UTF-8.
   *
   * @param {ArrayLike<number>} tokens - Token ids.
   * @returns {string} Decoded text.
   */
  decode(tokens) {
    const chunks = [];
    let totalBytes = 0;

    for (let idx = 0; idx < tokens.length; idx += 1) {
      const id = tokens[idx];
      const bytes = this.textMap.get(id) ?? this.inverseSpecialTokens[id];
      if (bytes == null) {
        continue;
      }
      chunks.push(bytes);
      totalBytes += bytes.length;
    }

    const merged = new Uint8Array(totalBytes);
    chunks.reduce((offset, bytes) => {
      merged.set(bytes, offset);
      return offset + bytes.length;
    }, 0);

    return this.textDecoder.decode(merged);
  }
};
|
|
var Tiktoken = _Tiktoken;

/**
 * Static helper: builds a global regex that matches any of the given special
 * token strings literally (each one is escaped, then joined with `|`).
 *
 * @param {string[]} tokens - Special token texts.
 * @returns {RegExp} Alternation of the escaped tokens with the "g" flag.
 */
__publicField(Tiktoken, "specialTokenRegex", (tokens) => {
  const alternatives = tokens.map((token) => escapeRegex(token));
  const source = alternatives.join("|");
  return new RegExp(source, "g");
});
|
|
// Model name → encoding name, grouped by encoding.
// NOTE(review): "davinci-002" and "babbage-002" resolve to "p50k_base" and
// "r50k_base" here; the upstream Python tiktoken table appears to map both to
// "cl100k_base" — confirm before relying on token counts for those models.
const ENCODING_BY_MODEL = new Map(
  [
    ["gpt2", ["gpt2"]],
    [
      "p50k_base",
      [
        "code-cushman-001",
        "code-cushman-002",
        "code-davinci-001",
        "code-davinci-002",
        "cushman-codex",
        "davinci-codex",
        "davinci-002",
        "text-davinci-002",
        "text-davinci-003",
      ],
    ],
    ["p50k_edit", ["code-davinci-edit-001", "text-davinci-edit-001"]],
    [
      "r50k_base",
      [
        "ada",
        "babbage",
        "babbage-002",
        "code-search-ada-code-001",
        "code-search-babbage-code-001",
        "curie",
        "davinci",
        "text-ada-001",
        "text-babbage-001",
        "text-curie-001",
        "text-davinci-001",
        "text-search-ada-doc-001",
        "text-search-babbage-doc-001",
        "text-search-curie-doc-001",
        "text-search-davinci-doc-001",
        "text-similarity-ada-001",
        "text-similarity-babbage-001",
        "text-similarity-curie-001",
        "text-similarity-davinci-001",
      ],
    ],
    [
      "cl100k_base",
      [
        "gpt-3.5-turbo-instruct-0914",
        "gpt-3.5-turbo-instruct",
        "gpt-3.5-turbo-16k-0613",
        "gpt-3.5-turbo-16k",
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-0301",
        "gpt-3.5-turbo",
        "gpt-4-32k-0613",
        "gpt-4-32k-0314",
        "gpt-4-32k",
        "gpt-4-0613",
        "gpt-4-0314",
        "gpt-4",
        "gpt-3.5-turbo-1106",
        "gpt-35-turbo",
        "gpt-4-1106-preview",
        "gpt-4-vision-preview",
        "gpt-3.5-turbo-0125",
        "gpt-4-turbo",
        "gpt-4-turbo-2024-04-09",
        "gpt-4-turbo-preview",
        "gpt-4-0125-preview",
        "text-embedding-ada-002",
        "text-embedding-3-small",
        "text-embedding-3-large",
      ],
    ],
    [
      "o200k_base",
      [
        "gpt-4o",
        "gpt-4o-2024-05-13",
        "gpt-4o-2024-08-06",
        "gpt-4o-mini-2024-07-18",
        "gpt-4o-mini",
      ],
    ],
  ].flatMap(([encoding, models]) => models.map((model) => [model, encoding])),
);

/**
 * Returns the name of the encoding used by `model`.
 *
 * @param {string} model - Exact model name.
 * @returns {string} One of "gpt2", "p50k_base", "p50k_edit", "r50k_base",
 *   "cl100k_base" or "o200k_base".
 * @throws {Error} "Unknown model" when `model` is not in the table.
 */
function getEncodingNameForModel(model) {
  const encoding = ENCODING_BY_MODEL.get(model);
  if (encoding === undefined) {
    throw new Error("Unknown model");
  }
  return encoding;
}
|
|
|
|
export { Tiktoken, getEncodingNameForModel, never };
|