import base64 from 'base64-js';

// src/utils.ts

/**
 * No-op that accepts one argument and ignores it.
 * NOTE(review): presumably an exhaustiveness-check helper from the TypeScript
 * source; nothing in this file calls it — confirm before removing.
 *
 * @param {*} _ - ignored
 * @returns {undefined}
 */
function never(_) {
}

/**
 * Greedily merges adjacent byte ranges of `piece`, always picking the adjacent
 * pair whose concatenated bytes have the lowest rank in `ranks` (leftmost pair
 * wins on ties), until no adjacent pair is present in `ranks`.
 *
 * @param {Uint8Array|number[]} piece - bytes to segment
 * @param {Map<string, number>} ranks - rank lookup keyed by `bytes.join(",")`
 * @returns {{start: number, end: number}[]} half-open index ranges into `piece`
 */
function bytePairMerge(piece, ranks) {
  // Start with one single-byte range per input byte.
  const parts = Array.from(
    { length: piece.length },
    (_, i) => ({ start: i, end: i + 1 })
  );
  while (parts.length > 1) {
    // [rank, index of the left part] of the best mergeable pair seen so far.
    let minRank = null;
    for (let i = 0; i < parts.length - 1; i++) {
      const slice = piece.slice(parts[i].start, parts[i + 1].end);
      const rank = ranks.get(slice.join(","));
      // `== null` on purpose: rank 0 is a valid rank.
      if (rank == null) continue;
      if (minRank == null || rank < minRank[0]) {
        minRank = [rank, i];
      }
    }
    if (minRank == null) break;
    const i = minRank[1];
    parts[i] = { start: parts[i].start, end: parts[i + 1].end };
    parts.splice(i + 1, 1);
  }
  return parts;
}

/**
 * Converts `piece` into ranks by running {@link bytePairMerge} and looking up
 * each resulting range in `ranks`.
 *
 * Ranges that have no entry in `ranks` are dropped. This now also applies to a
 * single-byte `piece`: previously that path returned `[undefined]` for an
 * unknown byte, while longer pieces already filtered unknown ranges out.
 *
 * @param {Uint8Array|number[]} piece - bytes to encode
 * @param {Map<string, number>} ranks - rank lookup keyed by `bytes.join(",")`
 * @returns {number[]} ranks for the merged ranges, in order
 */
function bytePairEncode(piece, ranks) {
  return bytePairMerge(piece, ranks)
    .map((p) => ranks.get(piece.slice(p.start, p.end).join(",")))
    .filter((x) => x != null);
}

/**
 * Backslash-escapes the regular-expression metacharacters in `str` so the
 * result can be embedded in a pattern as a literal.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeRegex(str) {
  return str.replace(/[\\^$*+?.()|[\]{}]/g, "\\$&");
}

/**
 * Byte-pair tokenizer built from a rank table, a split pattern and a set of
 * special tokens.
 */
class Tiktoken {
  /** @internal special-token text -> rank */
  specialTokens;

  /** @internal rank -> UTF-8 bytes of the special-token text */
  inverseSpecialTokens;

  /** @internal source of the regex used to split text into pieces */
  patStr;

  /** @internal */
  textEncoder = new TextEncoder();

  /** @internal */
  textDecoder = new TextDecoder("utf-8");

  /** @internal `bytes.join(",")` -> rank */
  rankMap = /* @__PURE__ */ new Map();

  /** @internal rank -> bytes */
  textMap = /* @__PURE__ */ new Map();

  /**
   * Builds a global regex that matches any of the given token strings
   * literally.
   *
   * For an empty list a never-matching regex is returned. An empty alternation
   * would otherwise match the empty string at every index, which made
   * `encode` perform one pointless `exec` per character when no special
   * tokens are configured.
   *
   * @param {string[]} tokens
   * @returns {RegExp}
   */
  static specialTokenRegex = (tokens) => {
    if (tokens.length === 0) return /(?!)/g;
    return new RegExp(tokens.map((i) => escapeRegex(i)).join("|"), "g");
  };

  /**
   * @param {{pat_str: string, bpe_ranks: string, special_tokens: Object<string, number>}} ranks
   *   `bpe_ranks` is newline-separated; each non-empty line is split on single
   *   spaces into an ignored first field, a base-10 starting rank, and one or
   *   more base64-encoded tokens that receive consecutive ranks from there.
   * @param {Object<string, number>} [extendedSpecialTokens] - merged over
   *   `ranks.special_tokens`; entries here win on key collisions.
   */
  constructor(ranks, extendedSpecialTokens) {
    this.patStr = ranks.pat_str;

    const uncompressed = ranks.bpe_ranks
      .split("\n")
      .filter(Boolean)
      .reduce((memo, x) => {
        const [_, offsetStr, ...tokens] = x.split(" ");
        const offset = Number.parseInt(offsetStr, 10);
        tokens.forEach((token, i) => memo[token] = offset + i);
        return memo;
      }, {});

    for (const [token, rank] of Object.entries(uncompressed)) {
      const bytes = base64.toByteArray(token);
      this.rankMap.set(bytes.join(","), rank);
      this.textMap.set(rank, bytes);
    }

    this.specialTokens = { ...ranks.special_tokens, ...extendedSpecialTokens };
    this.inverseSpecialTokens = Object.entries(this.specialTokens).reduce(
      (memo, [text, rank]) => {
        memo[rank] = this.textEncoder.encode(text);
        return memo;
      },
      {}
    );
  }

  /**
   * Encodes `text` into a list of ranks.
   *
   * @param {string} text
   * @param {string[]|"all"} [allowedSpecial=[]] - special-token strings that
   *   are emitted as their special rank; `"all"` allows every known special
   *   token. Special-token text that is not allowed is encoded as ordinary
   *   text.
   * @param {string[]|"all"} [disallowedSpecial="all"] - special-token strings
   *   whose presence in `text` is an error; `"all"` means every known special
   *   token that is not allowed.
   * @returns {number[]}
   * @throws {Error} if `text` contains a disallowed special token
   */
  encode(text, allowedSpecial = [], disallowedSpecial = "all") {
    const pieceRegex = new RegExp(this.patStr, "ug");
    const specialRegex = Tiktoken.specialTokenRegex(
      Object.keys(this.specialTokens)
    );
    const ret = [];

    const allowedSpecialSet = new Set(
      allowedSpecial === "all" ? Object.keys(this.specialTokens) : allowedSpecial
    );
    const disallowedSpecialSet = new Set(
      disallowedSpecial === "all"
        ? Object.keys(this.specialTokens).filter(
            (x) => !allowedSpecialSet.has(x)
          )
        : disallowedSpecial
    );

    if (disallowedSpecialSet.size > 0) {
      const disallowedSpecialRegex = Tiktoken.specialTokenRegex([
        ...disallowedSpecialSet
      ]);
      const specialMatch = text.match(disallowedSpecialRegex);
      if (specialMatch != null) {
        throw new Error(
          `The text contains a special token that is not allowed: ${specialMatch[0]}`
        );
      }
    }

    let start = 0;
    while (true) {
      // Find the next *allowed* special token at or after `start`. Matches
      // that are not allowed are stepped over one character at a time so an
      // allowed token starting inside them can still be found.
      let nextSpecial = null;
      let startFind = start;
      while (true) {
        specialRegex.lastIndex = startFind;
        nextSpecial = specialRegex.exec(text);
        if (nextSpecial == null || allowedSpecialSet.has(nextSpecial[0])) break;
        startFind = nextSpecial.index + 1;
      }
      const end = nextSpecial?.index ?? text.length;

      // Encode the ordinary text that precedes the special token (or the rest
      // of the input when there is none).
      for (const match of text.substring(start, end).matchAll(pieceRegex)) {
        const piece = this.textEncoder.encode(match[0]);
        const wholeRank = this.rankMap.get(piece.join(","));
        if (wholeRank != null) {
          // The whole piece is itself a known token.
          ret.push(wholeRank);
          continue;
        }
        ret.push(...bytePairEncode(piece, this.rankMap));
      }

      if (nextSpecial == null) break;
      ret.push(this.specialTokens[nextSpecial[0]]);
      start = nextSpecial.index + nextSpecial[0].length;
    }
    return ret;
  }

  /**
   * Decodes a list of ranks back into a string by concatenating the bytes of
   * each rank and decoding the result as UTF-8. Ranks found in neither the
   * rank table nor the special tokens are skipped.
   *
   * @param {number[]} tokens
   * @returns {string}
   */
  decode(tokens) {
    const chunks = [];
    let length = 0;
    for (const token of tokens) {
      const bytes = this.textMap.get(token) ?? this.inverseSpecialTokens[token];
      if (bytes != null) {
        chunks.push(bytes);
        length += bytes.length;
      }
    }

    // Concatenate before decoding so multi-byte characters that span several
    // tokens are decoded as a unit.
    const mergedArray = new Uint8Array(length);
    let offset = 0;
    for (const bytes of chunks) {
      mergedArray.set(bytes, offset);
      offset += bytes.length;
    }
    return this.textDecoder.decode(mergedArray);
  }
}

/**
 * Maps a model name to the name of the encoding it uses.
 *
 * NOTE(review): "davinci-002" and "babbage-002" are mapped to "p50k_base" and
 * "r50k_base" here; verify against the reference model table, which may map
 * them differently. Left unchanged in this revision.
 *
 * @param {string} model
 * @returns {"gpt2"|"p50k_base"|"p50k_edit"|"r50k_base"|"cl100k_base"|"o200k_base"}
 * @throws {Error} "Unknown model" if `model` is not in the table
 */
function getEncodingNameForModel(model) {
  switch (model) {
    case "gpt2": {
      return "gpt2";
    }
    case "code-cushman-001":
    case "code-cushman-002":
    case "code-davinci-001":
    case "code-davinci-002":
    case "cushman-codex":
    case "davinci-codex":
    case "davinci-002":
    case "text-davinci-002":
    case "text-davinci-003": {
      return "p50k_base";
    }
    case "code-davinci-edit-001":
    case "text-davinci-edit-001": {
      return "p50k_edit";
    }
    case "ada":
    case "babbage":
    case "babbage-002":
    case "code-search-ada-code-001":
    case "code-search-babbage-code-001":
    case "curie":
    case "davinci":
    case "text-ada-001":
    case "text-babbage-001":
    case "text-curie-001":
    case "text-davinci-001":
    case "text-search-ada-doc-001":
    case "text-search-babbage-doc-001":
    case "text-search-curie-doc-001":
    case "text-search-davinci-doc-001":
    case "text-similarity-ada-001":
    case "text-similarity-babbage-001":
    case "text-similarity-curie-001":
    case "text-similarity-davinci-001": {
      return "r50k_base";
    }
    case "gpt-3.5-turbo-instruct-0914":
    case "gpt-3.5-turbo-instruct":
    case "gpt-3.5-turbo-16k-0613":
    case "gpt-3.5-turbo-16k":
    case "gpt-3.5-turbo-0613":
    case "gpt-3.5-turbo-0301":
    case "gpt-3.5-turbo":
    case "gpt-4-32k-0613":
    case "gpt-4-32k-0314":
    case "gpt-4-32k":
    case "gpt-4-0613":
    case "gpt-4-0314":
    case "gpt-4":
    case "gpt-3.5-turbo-1106":
    case "gpt-35-turbo":
    case "gpt-4-1106-preview":
    case "gpt-4-vision-preview":
    case "gpt-3.5-turbo-0125":
    case "gpt-4-turbo":
    case "gpt-4-turbo-2024-04-09":
    case "gpt-4-turbo-preview":
    case "gpt-4-0125-preview":
    case "text-embedding-ada-002":
    case "text-embedding-3-small":
    case "text-embedding-3-large": {
      return "cl100k_base";
    }
    case "gpt-4o":
    case "gpt-4o-2024-05-13":
    case "gpt-4o-2024-08-06":
    case "gpt-4o-mini-2024-07-18":
    case "gpt-4o-mini": {
      return "o200k_base";
    }
    default:
      throw new Error("Unknown model");
  }
}

export {
  Tiktoken,
  getEncodingNameForModel,
  never
};