2020-09-06 16:53:08 +00:00
|
|
|
/*
|
|
|
|
* Unicode utilities
|
2024-02-10 15:18:11 +00:00
|
|
|
*
|
2020-09-06 16:53:08 +00:00
|
|
|
* Copyright (c) 2017-2018 Fabrice Bellard
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
#ifndef LIBUNICODE_H
|
|
|
|
#define LIBUNICODE_H
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
|
|
|
|
|
|
|
#define LRE_BOOL int /* for documentation purposes */
|
|
|
|
|
|
|
|
/* define it to include all the unicode tables (40KB larger) */
|
|
|
|
#define CONFIG_ALL_UNICODE
|
|
|
|
|
|
|
|
#define LRE_CC_RES_LEN_MAX 3
|
|
|
|
|
|
|
|
typedef enum {
|
|
|
|
UNICODE_NFC,
|
|
|
|
UNICODE_NFD,
|
|
|
|
UNICODE_NFKC,
|
|
|
|
UNICODE_NFKD,
|
|
|
|
} UnicodeNormalizationEnum;
|
|
|
|
|
|
|
|
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
|
2024-03-03 14:36:00 +00:00
|
|
|
int lre_canonicalize(uint32_t c, LRE_BOOL is_unicode);
|
2020-09-06 16:53:08 +00:00
|
|
|
LRE_BOOL lre_is_cased(uint32_t c);
|
|
|
|
LRE_BOOL lre_is_case_ignorable(uint32_t c);
|
|
|
|
|
|
|
|
/* char ranges */
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
int len; /* in points, always even */
|
|
|
|
int size;
|
|
|
|
uint32_t *points; /* points sorted by increasing value */
|
|
|
|
void *mem_opaque;
|
|
|
|
void *(*realloc_func)(void *opaque, void *ptr, size_t size);
|
|
|
|
} CharRange;
|
|
|
|
|
|
|
|
typedef enum {
|
|
|
|
CR_OP_UNION,
|
|
|
|
CR_OP_INTER,
|
|
|
|
CR_OP_XOR,
|
|
|
|
} CharRangeOpEnum;
|
|
|
|
|
|
|
|
void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
|
|
|
|
void cr_free(CharRange *cr);
|
|
|
|
int cr_realloc(CharRange *cr, int size);
|
|
|
|
int cr_copy(CharRange *cr, const CharRange *cr1);
|
|
|
|
|
|
|
|
static inline int cr_add_point(CharRange *cr, uint32_t v)
|
|
|
|
{
|
|
|
|
if (cr->len >= cr->size) {
|
|
|
|
if (cr_realloc(cr, cr->len + 1))
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
cr->points[cr->len++] = v;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
|
|
|
|
{
|
|
|
|
if ((cr->len + 2) > cr->size) {
|
|
|
|
if (cr_realloc(cr, cr->len + 2))
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
cr->points[cr->len++] = c1;
|
|
|
|
cr->points[cr->len++] = c2;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len);
|
|
|
|
|
|
|
|
static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
|
|
|
|
{
|
|
|
|
uint32_t b_pt[2];
|
|
|
|
b_pt[0] = c1;
|
|
|
|
b_pt[1] = c2 + 1;
|
|
|
|
return cr_union1(cr, b_pt, 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
|
|
|
|
const uint32_t *b_pt, int b_len, int op);
|
|
|
|
|
|
|
|
int cr_invert(CharRange *cr);
|
|
|
|
|
2024-03-03 14:36:00 +00:00
|
|
|
int cr_regexp_canonicalize(CharRange *cr, LRE_BOOL is_unicode);
|
2024-01-08 17:42:29 +00:00
|
|
|
|
2020-09-06 16:53:08 +00:00
|
|
|
#ifdef CONFIG_ALL_UNICODE
|
|
|
|
|
|
|
|
LRE_BOOL lre_is_id_start(uint32_t c);
|
|
|
|
LRE_BOOL lre_is_id_continue(uint32_t c);
|
|
|
|
|
|
|
|
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
|
|
|
|
UnicodeNormalizationEnum n_type,
|
|
|
|
void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
|
|
|
|
|
|
|
|
/* Unicode character range functions */
|
|
|
|
|
|
|
|
int unicode_script(CharRange *cr,
|
|
|
|
const char *script_name, LRE_BOOL is_ext);
|
|
|
|
int unicode_general_category(CharRange *cr, const char *gc_name);
|
|
|
|
int unicode_prop(CharRange *cr, const char *prop_name);
|
|
|
|
|
|
|
|
#endif /* CONFIG_ALL_UNICODE */
|
|
|
|
|
|
|
|
#undef LRE_BOOL
|
|
|
|
|
|
|
|
#endif /* LIBUNICODE_H */
|