/* * Copyright (C) 2016-2023 Apple, Inc. All rights reserved. * Some portions covered by other copyrights, listed below. *--- * Copyright (C) 2016 and later: Unicode, Inc. and others. * License & terms of use: http://www.unicode.org/copyright.html *--- * Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * * add APPLE_OSREFERENCE_LICENSE_HEADER stuff... */ #include #include #include #include "vfs_unicode_data.h" #define STATIC_UNLESS_TEST static enum { /* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */ kMaxUTF8BytesPerChar = 4 }; /* local prototypes used by exported functions (and themselves exported for testing) */ STATIC_UNLESS_TEST int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit); STATIC_UNLESS_TEST int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], uint8_t combClass[kNFCSingleCharDecompMax]); /* local prototypes used by exported functions (not exported for separate testing) */ static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes, int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP, int32_t* buf, uint8_t* bufcc, int32_t* buflenP, bool* needReorderP, bool* startP); void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen); int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]); /* * utf8_normalizeOptCaseFoldGetUVersion * * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6 * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3 * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0 * version[3] = Code revision level; for any given Unicode version, this value starts * at 0 and is incremented for each significant revision to the * normalizeOptCaseFold functions. 
*/ void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]) { version[0] = 15; version[1] = 1; version[2] = 0; version[3] = 0; return; } /* * utf8_normalizeOptCaseFoldAndHash * * str: The input UTF-8 string (need not be 0 terminated) * str_len: The byte length of the input string (excluding any 0 terminator) * case_sens: False for case-insensitive behavior; generates canonical caseless form. * True for case-sensitive behavior; generates standard NFD. * hash_func: A pointer to a hashing function to compute the hash of the * normalized/case-folded result. buf contains buf_len bytes * of data to be added to the hash using the caller-supplied * context (ctx). * hash_ctx: The context for the hash function. * * Returns: 0 on success, or * EILSEQ: The input string contains illegal ASCII-range characters * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or * contains codepoints that are non-characters or unassigned in * the version of Unicode currently supported (Unicode 9.0). */ int utf8_normalizeOptCaseFoldAndHash(const char *str, size_t str_len, bool case_sens, void (*hash_func)(void *buf, size_t buf_len, void *ctx), void *hash_ctx) { const char *strLimit = str + str_len; /* Data for the next pending single-char norm from input; * This will always begin with a base char (combining class 0) * or the first character in the string, which may no be a base */ int32_t unorm[kNFCSingleCharDecompMax]; uint8_t unormcc[kNFCSingleCharDecompMax]; int32_t unormlen = 0; int32_t unormstart = 0; bool start = true; /* main loop: * Each input character may be normalized to a sequence of one or more characters, * some of which may have non-zero combining class. Any sequence of characters * with non-zero combining class resulting from one or more input characters needs * to be accumulated in the main buffer so we can reorder as necessary before * calling the hash function. * * At the beginning of the main loop: The normalization buffer and main buffer are * both empty. 
* * Each time through the main loop we do the following: * 1. If there are characters available in the normalization result buffer (from the * result of normalizing a previous input character), copy the first character and * any following characters that have non-zero combining class to the main buffer. * 2. If there is nothing left in the normalization buffer, then loop processing * input characters as follows: * a) Get the next input character from UTF8, get its normalized and case-folded * result in the normalization buffer. * b) If the first character in the normalization buffer has combining class 0, * break; we will handle this normalization buffer next time through the main * loop. * c) Else copy the current normalization buffer (which has only combining marks) * to the main buffer, and continue with the loop processing input characters. * 3. At this point the first character in the main buffer may or may not have * combining class 0, but any subsequent characters (up to the the limit for * stream safe text) will be combining characters with nonzero combining class. * Reorder the combining marks if necessary into canonical order. * 4. Call the hash function for each character in the main buffer. * */ do { /* Data for the buffers being built up from input */ int32_t buf[kNCFStreamSafeBufMax]; uint8_t bufcc[kNCFStreamSafeBufMax]; int32_t buflen = 0; bool needReorder = false; int err; err = nextBaseAndAnyMarks(&str, strLimit, case_sens, false /* allow_slashes */, unorm, unormcc, &unormlen, &unormstart, buf, bufcc, &buflen, &needReorder, &start); if (err != 0) { return err; } if (buflen > 0) { /* Now buffer should have all of the combining marks up to the next base char. * Normally it will also start with the last base char encountered (unless the * UTF8 string began with a combining mark). */ /* Now reorder combining marks if necessary. 
*/ if (needReorder) { doReorder(buf, bufcc, buflen); } /* Now write to hash func */ hash_func(buf, buflen * sizeof(buf[0]), hash_ctx); } /* OK so far, top of loop clears buffers to start refilling again */ } while (str < strLimit || unormlen > 0); return 0; } /* * utf8_normalizeOptCaseFoldAndCompare * * strA: A UTF-8 string to be compared (need not be 0 terminated) * strA_len: The byte length of strA (excluding any 0 terminator) * strB: The second UTF-8 string to be compared (need not be 0 terminated) * strB_len: The byte length of strB (excluding any 0 terminator) * case_sens: False for case-insensitive behavior; compares canonical caseless forms. * True for case-sensitive behavior; compares standard NFD forms. * are_equal: On success, set to true if the strings are equal, or set to false * if they are not. * * Returns: 0 on success, or * EILSEQ: One or both of the input strings contains illegal ASCII-range * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8, * or contains codepoints that are non-characters or unassigned in * the version of Unicode currently supported (Unicode 9.0). * Note: The comparison may terminate early when a difference is * detected, and may return 0 and set *are_equal=false even * if one or both strings are invalid. 
*/ enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + 4 }; /* room for 03B9 pushback(s) */ int utf8_normalizeOptCaseFoldAndCompare(const char *strA, size_t strA_len, const char *strB, size_t strB_len, bool case_sens, bool *are_equal) { const char *strALimit = strA + strA_len; const char *strBLimit = strB + strB_len; /* Data for the next pending single-char norms from each input; * These will always begin with a base char (combining class 0) * or the first character in the string, which may not be a base */ int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback]; uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback]; int32_t unormAlen = 0, unormBlen = 0; int32_t unormAstart = 0, unormBstart = 0; bool startA = true, startB = true; /* main loop: * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash, * described above. The differences are: * - We keep a normalization buffer and main buffer for each string. * - In the main loop, we do steps 1-3 for each string. * - In step 4, instead of calling the hash function, we compare the two main * buffers; if they are unequal, we return a non-equal result. * - After the end of the main loop, if we still have data for one string but * not the other, return a non-equal result, else return an equal result. 
*/ do { /* Data for the buffers being built up from each input */ int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax]; uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax]; int32_t bufAlen = 0, bufBlen = 0; bool needReorderA = false, needReorderB = false; int err; err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, false /* allow_slashes */, unormA, unormAcc, &unormAlen, &unormAstart, bufA, bufAcc, &bufAlen, &needReorderA, &startA); if (err != 0) { return err; } err = nextBaseAndAnyMarks(&strB, strBLimit, case_sens, false /* allow_slashes */, unormB, unormBcc, &unormBlen, &unormBstart, bufB, bufBcc, &bufBlen, &needReorderB, &startB); if (err != 0) { return err; } if (bufAlen > 0 || bufBlen > 0) { /* Now each buffer should have all of the combining marks up to the next base char. * Normally it will also start with the last base char encountered (unless the * UTF8 string began with a combining mark). */ /* Now reorder combining marks if necessary. */ if (needReorderA) { doReorder(bufA, bufAcc, bufAlen); } if (needReorderB) { doReorder(bufB, bufBcc, bufBlen); } /* handle 03B9 pushback */ int32_t idx; if (!case_sens) { if (bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9 && unormAstart == 0) { int32_t tailCount = 0; while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9) { tailCount++; bufAlen--; } for (idx = unormAlen; idx > 0; idx--) { unormA[idx - 1 + tailCount] = unormA[idx - 1]; unormAcc[idx - 1 + tailCount] = unormAcc[idx - 1]; } for (idx = 0; idx < tailCount; idx++) { unormA[idx] = 0x03B9; unormAcc[idx] = 0; } unormAlen += tailCount; } if (bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9 && unormBstart == 0) { int32_t tailCount = 0; while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9) { tailCount++; bufBlen--; } for (idx = unormBlen; idx > 0; idx--) { unormB[idx - 1 + tailCount] = unormB[idx - 1]; unormBcc[idx - 1 + tailCount] = 
unormBcc[idx - 1]; } for (idx = 0; idx < tailCount; idx++) { unormB[idx] = 0x03B9; unormBcc[idx] = 0; } unormBlen += tailCount; } } /* Now compare the buffers. */ if (bufAlen != bufBlen || memcmp(bufA, bufB, bufAlen * sizeof(bufA[0])) != 0) { *are_equal = false; return 0; } } /* OK so far, top of loop clears buffers to start refilling again */ } while ((strA < strALimit || unormAlen > 0) && (strB < strBLimit || unormBlen > 0)); *are_equal = (strA == strALimit && unormAlen == 0 && strB == strBLimit && unormBlen == 0); return 0; } /* * utf8_normalizeOptCaseFold * * str: The input UTF-8 string (need not be 0 terminated) * str_len: The byte length of the input string (excluding any 0 terminator) * case_sens: False for case-insensitive behavior; generates canonical caseless form. * True for case-sensitive behavior; generates standard NFD. * ustr: A pointer to a buffer for the resulting UTF-32 string. * ustr_size: The capacity of ustr, in UTF-32 units. * ustr_len: Pointer to a value that will be filled in with the actual length * in UTF-32 units of the string copied to ustr. * * Returns: 0 on success, or * EILSEQ: The input string contains illegal ASCII-range characters * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or * contains codepoints that are non-characters or unassigned in * the version of Unicode currently supported. * ENOMEM: ustr_size is insufficient for the resulting string. In this * case the value returned in *ustr_len is invalid. 
*/ int utf8_normalizeOptCaseFold(const char *str, size_t str_len, bool case_sens, int32_t *ustr, int32_t ustr_size, int32_t *ustr_len) { const char *strLimit = str + str_len; int32_t *ustrCur = ustr; const int32_t *ustrLimit = ustr + ustr_size; /* Data for the next pending single-char norm from input; * This will always begin with a base char (combining class 0) */ int32_t unorm[kNFCSingleCharDecompMax]; uint8_t unormcc[kNFCSingleCharDecompMax]; int32_t unormlen = 0; int32_t unormstart = 0; bool start = true; *ustr_len = 0; do { /* Data for the buffers being built up from input */ int32_t buf[kNCFStreamSafeBufMax]; uint8_t bufcc[kNCFStreamSafeBufMax]; int32_t buflen = 0; bool needReorder = false; int err; err = nextBaseAndAnyMarks(&str, strLimit, case_sens, false /* allow_slashes */, unorm, unormcc, &unormlen, &unormstart, buf, bufcc, &buflen, &needReorder, &start); if (err != 0) { return err; } if (buflen > 0) { if (needReorder) { doReorder(buf, bufcc, buflen); } /* Now copy to output buffer */ int32_t idx; if (ustrCur + buflen > ustrLimit) { return ENOMEM; } for (idx = 0; idx < buflen; idx++) { *ustrCur++ = buf[idx]; } } /* OK so far, top of loop clears buffers to start refilling again */ } while (str < strLimit || unormlen > 0); *ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop return 0; } static int utf8_normalizeOptCaseFoldToUTF8_internal(const char *str, size_t str_len, bool case_sens, bool allow_slashes, char *ustr, size_t ustr_size, size_t *ustr_len) { const char *strLimit = str + str_len; char *ustrCur = ustr; const char *ustrLimit = ustr + ustr_size; /* Data for the next pending single-char norm from input; * This will always begin with a base char (combining class 0) */ int32_t unorm[kNFCSingleCharDecompMax]; uint8_t unormcc[kNFCSingleCharDecompMax]; int32_t unormlen = 0; int32_t unormstart = 0; bool start = true; *ustr_len = 0; do { /* Data for the buffers being built up from input 
*/ int32_t buf[kNCFStreamSafeBufMax]; uint8_t bufcc[kNCFStreamSafeBufMax]; int32_t buflen = 0; bool needReorder = false; int err; err = nextBaseAndAnyMarks(&str, strLimit, case_sens, allow_slashes, unorm, unormcc, &unormlen, &unormstart, buf, bufcc, &buflen, &needReorder, &start); if (err != 0) { return err; } if (buflen > 0) { uint8_t utf8Bytes[kMaxUTF8BytesPerChar]; int32_t *bufPtr = buf; if (needReorder) { doReorder(buf, bufcc, buflen); } /* Now copy to output buffer */ while (buflen-- > 0) { int32_t idx, utf8Len = u32CharToUTF8Bytes((uint32_t)*bufPtr++, utf8Bytes); if (ustrCur + utf8Len > ustrLimit) { return ENOMEM; } for (idx = 0; idx < utf8Len; idx++) { *ustrCur++ = (char)utf8Bytes[idx]; } } } /* OK so far, top of loop clears buffers to start refilling again */ } while (str < strLimit || unormlen > 0); *ustr_len = ustrCur - ustr; return 0; } /* * utf8_normalizeOptCaseFoldToUTF8 * (This is similar to normalizeOptCaseFold except that this has a different output * buffer type, and adds conversion to UTF8 while copying to output buffer) * * str: The input UTF-8 string (need not be 0 terminated) * str_len: The byte length of the input string (excluding any 0 terminator) * case_sens: False for case-insensitive behavior; generates canonical caseless form. * True for case-sensitive behavior; generates standard NFD. * ustr: A pointer to a buffer for the resulting UTF-8 string. * ustr_size: The capacity of ustr, in bytes. * ustr_len: Pointer to a value that will be filled in with the actual length * in bytes of the string copied to ustr. * * Returns: 0 on success, or * EILSEQ: The input string contains illegal ASCII-range characters * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or * contains codepoints that are non-characters or unassigned in * the version of Unicode currently supported. * ENOMEM: ustr_size is insufficient for the resulting string. In this * case the value returned in *ustr_len is invalid. 
*/
int
utf8_normalizeOptCaseFoldToUTF8(const char *str,
    size_t      str_len,
    bool        case_sens,
    char       *ustr,
    size_t      ustr_size,
    size_t     *ustr_len)
{
    /* Plain-name variant: a '/' in the input is rejected */
    const bool allow_slashes = false;

    return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens,
               allow_slashes, ustr, ustr_size, ustr_len);
}

/*
 * utf8_normalizeOptCaseFoldToUTF8ForPath
 * (This is similar to normalizeOptCaseFoldToUTF8 except that this allows '/' character.)
 *
 *  str:       The input UTF-8 path string
 *  str_len:   The byte length of the input path string (excluding any 0 terminator)
 *  case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *             True for case-sensitive behavior; generates standard NFD.
 *  ustr:      A pointer to a buffer for the resulting UTF-8 string.
 *  ustr_size: The capacity of ustr, in bytes.
 *  ustr_len:  Pointer to a value that will be filled in with the actual length
 *             in bytes of the string copied to ustr.
 *
 *  Returns:   0 on success, or
 *             EILSEQ: The input string contains illegal ASCII-range characters
 *                     (0x00), or is not well-formed stream-safe UTF-8, or
 *                     contains codepoints that are non-characters or unassigned in
 *                     the version of Unicode currently supported.
 *             ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                     case the value returned in *ustr_len is invalid.
 */
int
utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
    size_t      str_len,
    bool        case_sens,
    char       *ustr,
    size_t      ustr_size,
    size_t     *ustr_len)
{
    /* Path variant: '/' separators pass through unchanged */
    const bool allow_slashes = true;

    return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens,
               allow_slashes, ustr, ustr_size, ustr_len);
}

/*
 * utf8_normalizeOptCaseFoldAndMatchSubstring
 *
 *  strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
 *             substring specified by ustrB.
* strA_len: The byte length of strA (excluding any 0 terminator) * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched * for in the UTF-32 string resulting from converting strA to the normalized * UTF-32 form specified by the case_sens parameter; ustrB must already be * in that form. * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator). * case_sens: False for case-insensitive matching; compares canonical caseless forms. * True for case-sensitive matching; compares standard NFD forms. * buf: Pointer to caller-supplied working memory for storing the portion of * strA which has been converted to normalized UTF-32. * buf_size: The size of buf. * has_match: On success, set to true if strA (when converter to UTF-32 and normalized * per case_sens) contains ustrB, set to false otherwise. * * Returns: 0 on success, or * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is * not well-formed stream-safe UTF-8, or contains codepoints that are * non-characters or unassigned in the version of Unicode currently * supported. * Note: The search may terminate early when a match is detected, and * may return 0 and set *has_match=true even if strA is invalid. * ENOMEM: buf_size is insufficient. */ int utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA, size_t strA_len, const int32_t *ustrB, int32_t ustrB_len, bool case_sens, void *buf, size_t buf_size, bool *has_match) { /* * ustrA represents the current position in the UTF-32 normalized version of strA * at which we want to test for a match; ustrANormEnd is the position beyond that * which is just after the end of what has already been converted from strA to * UTF-32 normalized form. * Each time through the main loop: * - The first task is to make sure we have enough of strA converted to UTF32 * normalized form to test for match with ustrB at the current match position. 
* If we don't, then convert more of strA to UTF-32 normalized form until we * have enough to compare with ustrB. To do this, run a loop which is like the * main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of * calling the hash function, we copy the normalized buffer to ustrANormEnd, * advancing the latter. We keep doing this until we have enough additional * converted to match with ustrB. * - Then we test for match of ustrB at the current ustrA position. If there is * a match we return; otherwise, if there is more strA to convert we advance * ustrA and repeat the main loop, otherwise we return without a match. */ if (ustrB_len == 0) { /* always matches */ *has_match = true; return 0; } *has_match = false; /* initialize return value */ if (ustrB_len > 2 * strA_len) { /* If ustrB is clearly too long to find in strA, don't bother normalizing strA. * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit. * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units. * The maximum expansion from unnormalized UTF-8 byte length to normalized * UTF-32 unit length is thus 2. 
*/ return 0; } const char *strALimit = strA + strA_len; int32_t *ustrA = (int32_t *)buf; const int32_t *ustrALimit = ustrA + (buf_size / sizeof(int32_t)); int32_t *ustrANormEnd = ustrA; /* how far we have already normalized in ustrA */ /* Data for the next pending single-char norms from each input; * These will always begin with a base char (combining class 0) * or the first character in the string, which may not be a base */ int32_t unormA[kNFCSingleCharDecompMax]; uint8_t unormAcc[kNFCSingleCharDecompMax]; int32_t unormAlen = 0; int32_t unormAstart = 0; bool startA = true; while (true) { /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */ if (ustrANormEnd - ustrA < ustrB_len) { do { /* Data for the buffers being built up from each input */ int32_t bufA[kNCFStreamSafeBufMax]; uint8_t bufAcc[kNCFStreamSafeBufMax]; int32_t bufAlen = 0; bool needReorderA = false; int err; err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, false /* allow_slashes */, unormA, unormAcc, &unormAlen, &unormAstart, bufA, bufAcc, &bufAlen, &needReorderA, &startA); if (err != 0) { return err; } if (bufAlen > 0) { /* Now each buffer should have all of the combining marks up to the next base char. * Normally it will also start with the last base char encountered (unless the * UTF8 string began with a combining mark). */ /* Now reorder combining marks if necessary. Should be rare, and sequences should * usually be short when does occur => simple bubblesort should be sufficient. 
*/ if (needReorderA) { doReorder(bufA, bufAcc, bufAlen); } /* Now copy to working buffer */ int32_t idx; if (ustrANormEnd + bufAlen > ustrALimit) { return ENOMEM; } for (idx = 0; idx < bufAlen; idx++) { *ustrANormEnd++ = bufA[idx]; } } /* OK so far, top of loop clears buffers to start refilling again */ } while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit || unormAlen > 0)); } if (ustrANormEnd - ustrA < ustrB_len) { return 0; /* not enough of strA left for match */ } /* check for match, return if so */ if (memcmp(ustrA, ustrB, ustrB_len * sizeof(ustrB[0])) == 0) { *has_match = true; return 0; } ustrA++; /* advance match position */ } } /* nextBaseAndAnyMarks: * Guts of code to get next bufferful of base character (or first char in string) * and all trailing combining marks. * This is called each time through the main loop of functions above, and does the * following: * 1. If there are characters available in the normalization result buffer (from the * result of normalizing a previous input character), copy the first character and * any following characters that have non-zero combining class to the main buffer. * 2. If there is nothing left in the normalization buffer, then loop processing * input characters as follows: * a) Get the next input character from UTF8, get its normalized and case-folded * result in the normalization buffer. * b) If the first character in the normalization buffer has combining class 0, * break; we will handle this normalization buffer next time through the main * loop. * c) Else copy the current normalization buffer (which has only combining marks) * to the main buffer, and continue with the loop processing input characters. 
*/
static int
nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes,
    int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
    int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
    bool* needReorderP, bool* startP)
{
    /* Step 1: drain the pending normalization buffer into the main buffer,
     * up to (not including) the next pending character of combining class 0. */
    if (*unormlenP > 0 && *unormstartP < *unormlenP) {
        /* unorm begins with a base char; buflen should be 0 */
        *needReorderP = false;
        *buflenP = 0;
        do {
            const uint8_t cc = unormcc[*unormstartP];

            /* A mark with a lower class than the entry before it is out of
             * canonical order */
            if (*buflenP > 0 && cc > 0 && cc < bufcc[*buflenP - 1]) {
                *needReorderP = true;
            }
            buf[*buflenP] = unorm[*unormstartP];
            bufcc[*buflenP] = cc;
            (*buflenP)++;
            (*unormstartP)++;
        } while (*unormstartP < *unormlenP && unormcc[*unormstartP] != 0);
    }

    /* Pending characters remain: they start the next bufferful */
    if (*unormstartP < *unormlenP) {
        return 0;
    }

    /* Step 2: pending buffer is empty; refill it from the input */
    *unormstartP = 0;
    *unormlenP = 0;
    while (*strP < strLimit) {
        int32_t idx;
        int32_t u32char;
        uint32_t bytevalue = (uint8_t)**strP;

        (*strP)++;

        /* '/' is not produced by NFD decomposition from another character so we can
         * check for it before normalization */
        if (bytevalue == 0) {
            return EILSEQ;
        }
        if (bytevalue == 0x2F /* '/' */ && !allow_slashes) {
            return EILSEQ;
        }

        if (bytevalue < 0x80) {
            /* ASCII: its own normalization, lowercased when case-insensitive;
             * always a base character, so stop here */
            if (!case_sens && bytevalue >= 'A' && bytevalue <= 'Z') {
                bytevalue += 0x20;
            }
            unorm[0] = bytevalue;
            unormcc[0] = 0;
            *unormlenP = 1;
            *startP = false;
            break;
        }

        /* Non-ASCII: decode the rest of the UTF-8 sequence, then normalize */
        u32char = utf8ToU32Code(bytevalue, strP, strLimit);
        if (u32char <= 0) {
            return EILSEQ;
        }
        *unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, unorm, unormcc);
        if (*unormlenP <= 0) {
            return EILSEQ;
        }
        if (unormcc[0] == 0 || *startP) {
            /* a base char, or the very first char of the string: leave it
             * pending for the next call */
            *startP = false;
            break;
        }

        /* the latest char decomposes to just combining sequence, add to buffer being built */
        if (*buflenP + *unormlenP > kNCFStreamSafeBufMax) {
            return EILSEQ;
        }
        for (idx = 0; idx < *unormlenP; idx++, (*buflenP)++) {
            if (*buflenP > 0 && unormcc[idx] > 0 && unormcc[idx] < bufcc[*buflenP - 1]) {
                *needReorderP = true;
            }
            buf[*buflenP] = unorm[idx];
            bufcc[*buflenP] = unormcc[idx];
        }
        *unormlenP = 0;
    }
    return 0;
}

/* local prototypes used only by internal functions */
static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]);
static int32_t adjustCase(bool case_sens, int32_t uSeqLen,
    int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
static uint8_t getCombClassU32Char(int32_t u32char);
static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);

/* Reorder combining marks if necessary. Should be rare, and sequences should
 * usually be short when does occur => simple bubblesort should be sufficient.
*/ void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen) { int32_t idx, jdx; for (idx = 0; idx < buflen - 1; idx++) { for (jdx = buflen - 1; jdx > idx; jdx--) { if (bufcc[jdx] < bufcc[jdx - 1]) { swapBufCharCCWithPrevious(jdx, buf, bufcc); } } } } /* swap function for bubblesort */ static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]) { int32_t bufchar = buf[jdx]; uint8_t bufccval = bufcc[jdx]; buf[jdx] = buf[jdx - 1]; bufcc[jdx] = bufcc[jdx - 1]; buf[jdx - 1] = bufchar; bufcc[jdx - 1] = bufccval; } /* * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes, * and returns the number of UTF8 bytes. * * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h). */ int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]) { int32_t idx = 0; if (u32char <= 0x7F) { utf8Bytes[idx++] = (uint8_t)u32char; } else { if (u32char <= 0x7FF) { utf8Bytes[idx++] = (uint8_t)((u32char >> 6) | 0xC0); } else { if (u32char <= 0xFFFF) { utf8Bytes[idx++] = (uint8_t)((u32char >> 12) | 0xE0); } else { utf8Bytes[idx++] = (uint8_t)((u32char >> 18) | 0xF0); utf8Bytes[idx++] = (uint8_t)(((u32char >> 12) & 0x3F) | 0x80); } utf8Bytes[idx++] = (uint8_t)(((u32char >> 6) & 0x3F) | 0x80); } utf8Bytes[idx++] = (uint8_t)((u32char & 0x3F) | 0x80); } return idx; } /* two macros adapted from ICU's utf8.h */ #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \ ((uint8_t)(leadByte)<0XF0 ? \ ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \ (uint8_t)(leadByte)<0XFE ? 
3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0) #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) /* array adapted from ICU's utf_impl.c */ static const int32_t utf8_minLegal[4] = { 0, 0X80, 0x800, 0x10000 }; /* * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point * * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c); * verified to produce the same results (adusted for the difference in API signature). * * assumes at entry that: * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character * has been read, and its value is in u32char * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly * points to the trail bytes for that UTF8 char. * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer) * * For a valid and complete UTF8 character, the function returns its value and advances * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1 * (and the value in *srcPtr is undefined). * Note that while it does not map to surrogate values (generates an error for malformed * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error. * * equivalences used in adapted ICU code: * UChar = uint16_t * UChar32 = int32_t * * This has been validated against ICU behavior. 
*/
/* Implementation note: every "enough input left?" test below compares byte
 * counts against (srcLimit - src) rather than comparing (src + n) with
 * srcLimit, because computing a pointer more than one past the end of the
 * input buffer is undefined behavior. This relies on the entry condition
 * *srcPtr <= srcLimit. */
STATIC_UNLESS_TEST
int32_t
utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit)
{
    const char* src = *srcPtr;
    uint8_t pt1, pt2;

    if (0xE0 < u32char && u32char <= 0xEC && srcLimit - src > 1 &&
        (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F &&
        (pt2 = (uint8_t)(src[1] - 0x80)) <= 0x3F) {
        /* handle U+1000..U+CFFF */
        /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */
        u32char = (uint16_t)((u32char << 12) | (pt1 << 6) | pt2);
        src += 2;
    } else if (u32char < 0xE0 && u32char >= 0xC2 && src < srcLimit &&
        (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F) {
        /* handle U+0080..U+07FF */
        u32char = ((u32char & 0x1F) << 6) | pt1;
        src++;
    } else {
        /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */
        uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char);

        if (count <= srcLimit - src) {
            uint8_t trail;

            U8_MASK_LEAD_BYTE_LOC(u32char, count);
            switch (count) {
            /* branches 3, 2 fall through to the next one */
            case 0: /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */
            case 5:
            case 4: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
                break;
            case 3:
                trail = (uint8_t)((uint8_t)*src++ - 0x80);
                u32char = (u32char << 6) | trail;
                /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */
                if (u32char >= 0x110 || trail > 0X3F) {
                    break;
                }
                /* fallthrough */
            case 2:
                trail = (uint8_t)((uint8_t)*src++ - 0x80);
                u32char = (u32char << 6) | trail;
                /*
                 * test for a surrogate D800..DFFF:
                 * before the last (u32char<<6), a surrogate is u32char=360..37F
                 */
                if (((u32char & 0xFFE0) == 0x360) || trail > 0X3F) {
                    break;
                }
                /* fallthrough */
            case 1:
                trail = (uint8_t)((uint8_t)*src++ - 0x80);
                u32char = (u32char << 6) | trail;
                if (trail > 0X3F) {
                    break;
                }
                /* correct sequence - all trail bytes have (b7..b6)==(10) */
                if (u32char >= utf8_minLegal[count]) {
                    /* not an overlong encoding: success */
                    *srcPtr = src;
                    return u32char;
                }
                break;
                /* no default branch to optimize switch() - all values are covered */
            }
        }
        /* truncated, malformed, overlong, surrogate or out-of-range sequence */
        u32char = -1;
    }
    *srcPtr = src;
    return u32char;
}

/*
 * normalizeOptCaseFoldU32Char, map
a single UTF32 code point to its normalized result * and the combining classes for each resulting char, or indicate it is invalid. * * The normalized and case-folded result might be up to 4 UTF32 characters (current * max, could change in the future). * * u32char - input UTF32 code point * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) * to receive the normalize result. * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) * to receive the combining classes for the characters in u32NormFoldBuf. If * the first entry has non-zero combining class, the remaining entries do too. * * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax * is insufficient (though it is assumed to be at least 3), else the length of the * normalized and case-folded result (currently in the range 1..4). * * This has been validated against ICU behavior. * * This function is highly dependent on the structure of the data trie; for details on * that structure, see comments in normalizeCaseFoldData.h */ STATIC_UNLESS_TEST int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], uint8_t combClass[kNFCSingleCharDecompMax]) { combClass[0] = 0; /* return hi-range PUA as self, except non-characters */ if (u32char >= kU32HiPUAStart) { if ((u32char & 0xFFFE) == 0xFFFE) { return -1; } u32NormFoldBuf[0] = u32char; return 1; } /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ /* 0 - 0x323FF; everything in between in currently invalid. 
*/ int32_t u32charLookup = u32char; if (u32charLookup >= kU32LowRangeLimit) { u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); if (u32charLookup < kU32LowRangeLimit || u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) { return -1; /* in the large range of currently-unassigned code points */ } } /* Now we have u32charLookup either in 0..0x323FF representing u32char itself, * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in * the trie that identifies unassigneds in this range, or maps others to * decomps or combining class or just self. */ uint16_t trieValue; /* TrieHi */ trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; if (trieValue == kInvalidCodeFlag) { return -1; } if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ u32NormFoldBuf[0] = u32char; combClass[0] = trieValue & kFlagValueMask; return 1; } if (trieValue == kHangulMask) { combClass[1] = combClass[2] = 0; return decomposeHangul(u32char, u32NormFoldBuf); } /* TrieMid */ trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; if (trieValue == kInvalidCodeFlag) { return -1; } if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { u32NormFoldBuf[0] = u32char; combClass[0] = trieValue & kFlagValueMask; return adjustCase(case_sens, 1, u32NormFoldBuf); } if ((trieValue & kFlagTestMask) == kInvMaskFlag) { uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask]; uint16_t testBit = (uint16_t)(1 << (u32charLookup & kNFTrieLoMask)); if (testBit & invalidMask) { /* invalid */ return -1; } else { /* treat like trieValue == 0 above */ u32NormFoldBuf[0] = u32char; return adjustCase(case_sens, 1, u32NormFoldBuf); } } if (trieValue == kHangulMask) { combClass[1] = combClass[2] = 0; return decomposeHangul(u32char, u32NormFoldBuf); } /* TrieLo */ trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask]; if (trieValue == kInvalidCodeFlag) { 
return -1; } if (trieValue == kHangulMask) { combClass[1] = combClass[2] = 0; return decomposeHangul(u32char, u32NormFoldBuf); } if (trieValue < kToU16Seq2Mask || trieValue > kSpecialsEnd) { if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { u32NormFoldBuf[0] = u32char; combClass[0] = trieValue & kFlagValueMask; } else { u32NormFoldBuf[0] = trieValue; } return adjustCase(case_sens, 1, u32NormFoldBuf); } const uint16_t* u16SeqPtr = NULL; const int32_t* u32SeqPtr = NULL; int32_t uSeqLen = 0; switch (trieValue & kSpecialsMask) { case kToU16Seq2Mask: if (case_sens && (trieValue & kToSeqCaseFoldMask)) { /* don't use the mapping, it is only for case folding */ u32NormFoldBuf[0] = u32char; /* already have combClass[0] = 0 */ return 1; } u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask]; uSeqLen = 2; break; case kToU16Seq3Mask: if (case_sens && (trieValue & kToSeqCaseFoldMask)) { /* don't use the mapping, it is only for case folding */ u32NormFoldBuf[0] = u32char; /* already have combClass[0] = 0 */ return 1; } u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask]; uSeqLen = 3; break; case kToU16SeqMiscMask: u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask]; uSeqLen = *u16SeqPtr & kToSeqMiscLenMask; combClass[0] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift); break; case kToU32CharMask: if (case_sens && (trieValue & kToSeqCaseFoldMask)) { /* don't use the mapping, it is only for case folding */ u32NormFoldBuf[0] = u32char; /* already have combClass[0] = 0 */ return 1; } u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask]; uSeqLen = 1; break; case kToU32SeqMiscMask: u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask]; uSeqLen = *u32SeqPtr & kToSeqMiscLenMask; combClass[0] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift); break; default: return -1; } if (kNFCSingleCharDecompMax < uSeqLen) { return 0; } int32_t idx; for (idx = 0; idx < uSeqLen; idx++) { u32NormFoldBuf[idx] = (u16SeqPtr)? 
*u16SeqPtr++: *u32SeqPtr++; if (idx > 0) { combClass[idx] = getCombClassU32Char(u32NormFoldBuf[idx]); } } return adjustCase(case_sens, uSeqLen, u32NormFoldBuf); } /* * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding * * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only * uSeqLen - length of the sequence specified in the u32NormFoldBuf * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3) * with normalized result. * * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax * is insufficient (though it is assumed to be at least 3), else the length of the * normalized and case-folded result (currently in the range 1..4). * * This function is a reduced version of normalizeOptCaseFoldU32Char above. */ static int32_t adjustCase(bool case_sens, int32_t uSeqLen, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) { if (!case_sens && uSeqLen > 0) { if (u32NormFoldBuf[0] < kSimpleCaseFoldLimit) { u32NormFoldBuf[0] = nfBasicCF[u32NormFoldBuf[0]]; /* There is one case in which this maps to a character with different combining * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the * first or only character in the sequence, we want to keep the original * combining class, so nothing special to do here. */ } /* The following is the only case where we have a casefolding after the first * character in the sequence. Don't worry about combining class here. that gets * set later for characters after the first. */ if (uSeqLen > 1 && u32NormFoldBuf[uSeqLen - 1] == 0x0345) { u32NormFoldBuf[uSeqLen - 1] = 0x03B9; } } return uSeqLen; } /* * getCombClassU32Char, map a single character (in UTF32 form) to its combining class. * * u32char - input UTF32 code point. This is assumed to be a valid character that does * not have a decomposition. * * returns combining class of the character. 
* * This is only called for characters after the first is a decomposition expansion. In * this situation, if we encounter U+03B9 (combining class 0), it is only there as the * case-folding of U+0345 (combining class 240). In this case it is the combining class * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all. * * This function is a reduced version of normalizeOptCaseFoldU32Char above. */ static uint8_t getCombClassU32Char(int32_t u32char) { if (u32char >= kU32HiPUAStart) { return 0; } if (u32char == 0x03B9) { return 240; } /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ /* 0 - 0x323FF; everything in between in currently invalid. */ int32_t u32charLookup = u32char; if (u32charLookup >= kU32LowRangeLimit) { u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); } /* Now we have u32charLookup either in 0..0x323FF representing u32char itself, * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in * the trie that identifies unassigneds in this range, or maps others to * decomps or combining class or just self. */ uint16_t trieValue; /* TrieHi */ trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { return trieValue & kFlagValueMask; } /* TrieMid */ trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ return trieValue & kFlagValueMask; } if ((trieValue & kFlagTestMask) == kInvMaskFlag) { return 0; } /* TrieLo */ trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask]; return ((trieValue & kFlagTestMask) == kCombClassFlag)? 
(trieValue & kFlagValueMask): 0; } /* * decomposeHangul, map a single UTF32 code point for a composed Hangul * in the range AC00-D7A3, using algorithmic decomp * * The normalized result will be 2 or 3 UTF32 characters. * * u32char - input UTF32 code point * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) * to receive the normalize result. * * returns the length of the normalized result (2..3). * * Adapted from ICU Hangul:decompose in normalizer2impl.h * */ enum { HANGUL_BASE=0xAC00, JAMO_L_BASE=0x1100, /* "lead" jamo */ JAMO_V_BASE=0x1161, /* "vowel" jamo */ JAMO_T_BASE=0x11A7, /* "trail" jamo */ JAMO_L_COUNT=19, JAMO_V_COUNT=21, JAMO_T_COUNT=28, }; static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) { u32char -= HANGUL_BASE; int32_t tIndex = u32char % JAMO_T_COUNT; u32char /= JAMO_T_COUNT; u32NormFoldBuf[0] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT); u32NormFoldBuf[1] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT); if (tIndex == 0) { return 2; } u32NormFoldBuf[2] = (uint16_t)(JAMO_T_BASE + tIndex); return 3; }