/**
 * PANDA 3D SOFTWARE
 * Copyright (c) Carnegie Mellon University. All rights reserved.
 *
 * All use of this software is subject to the terms of the revised BSD
 * license. You should have received a copy of this license along
 * with this source code in a file named "LICENSE."
 *
 * @file textEncoder.I
 * @author drose
 * @date 2003-03-26
 */

/**
|
||
|
*
|
||
|
*/
|
||
|
INLINE TextEncoder::
|
||
|
TextEncoder() {
|
||
|
_encoding = _default_encoding;
|
||
|
|
||
|
// Initially, since the text string is empty, we know that both _text and
|
||
|
// _wtext accurately reflect the empty state; so we "got" both of them.
|
||
|
_flags = (F_got_text | F_got_wtext);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
INLINE TextEncoder::
|
||
|
TextEncoder(const TextEncoder ©) :
|
||
|
_flags(copy._flags),
|
||
|
_encoding(copy._encoding),
|
||
|
_text(copy._text),
|
||
|
_wtext(copy._wtext)
|
||
|
{
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Specifies how the string set via set_text() is to be interpreted. The
|
||
|
* default, E_iso8859, means a standard string with one-byte characters (i.e.
|
||
|
* ASCII). Other encodings are possible to take advantage of character sets
|
||
|
* with more than 256 characters.
|
||
|
*
|
||
|
* This affects only future calls to set_text(); it does not change text that
|
||
|
* was set previously.
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
set_encoding(TextEncoder::Encoding encoding) {
|
||
|
// Force the previously-set strings to be encoded or decoded now.
|
||
|
get_text();
|
||
|
get_wtext();
|
||
|
_encoding = encoding;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the encoding by which the string set via set_text() is to be
|
||
|
* interpreted. See set_encoding().
|
||
|
*/
|
||
|
INLINE TextEncoder::Encoding TextEncoder::
|
||
|
get_encoding() const {
|
||
|
return _encoding;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Specifies the default encoding to be used for all subsequently created
|
||
|
* TextEncoder objects. See set_encoding().
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
set_default_encoding(TextEncoder::Encoding encoding) {
|
||
|
_default_encoding = encoding;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Specifies the default encoding to be used for all subsequently created
|
||
|
* TextEncoder objects. See set_encoding().
|
||
|
*/
|
||
|
INLINE TextEncoder::Encoding TextEncoder::
|
||
|
get_default_encoding() {
|
||
|
return _default_encoding;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Changes the text that is stored in the encoder. The text should be encoded
|
||
|
* according to the method indicated by set_encoding(). Subsequent calls to
|
||
|
* get_text() will return this same string, while get_wtext() will return the
|
||
|
* decoded version of the string.
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
set_text(const std::string &text) {
|
||
|
if (!has_text() || _text != text) {
|
||
|
_text = text;
|
||
|
_flags = (_flags | F_got_text) & ~F_got_wtext;
|
||
|
text_changed();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* The two-parameter version of set_text() accepts an explicit encoding; the
|
||
|
* text is immediately decoded and stored as a wide-character string.
|
||
|
* Subsequent calls to get_text() will return the same text re-encoded using
|
||
|
* whichever encoding is specified by set_encoding().
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
set_text(const std::string &text, TextEncoder::Encoding encoding) {
|
||
|
if (encoding == _encoding) {
|
||
|
set_text(text);
|
||
|
} else {
|
||
|
set_wtext(decode_text(text, encoding));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Removes the text from the TextEncoder.
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
clear_text() {
|
||
|
_text = std::string();
|
||
|
_wtext = std::wstring();
|
||
|
_flags |= (F_got_text | F_got_wtext);
|
||
|
text_changed();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
*/
|
||
|
INLINE bool TextEncoder::
|
||
|
has_text() const {
|
||
|
if (_flags & F_got_wtext) {
|
||
|
return !_wtext.empty();
|
||
|
} else {
|
||
|
return !_text.empty();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the current text, as encoded via the current encoding system.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
get_text() const {
|
||
|
if ((_flags & F_got_text) == 0) {
|
||
|
((TextEncoder *)this)->_text = encode_wtext(_wtext);
|
||
|
((TextEncoder *)this)->_flags |= F_got_text;
|
||
|
}
|
||
|
return _text;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the current text, as encoded via the indicated encoding system.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
get_text(TextEncoder::Encoding encoding) const {
|
||
|
return encode_wtext(get_wtext(), encoding);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Appends the indicates string to the end of the stored text.
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
append_text(const std::string &text) {
|
||
|
if (!text.empty()) {
|
||
|
_text = get_text() + text;
|
||
|
_flags = (_flags | F_got_text) & ~F_got_wtext;
|
||
|
text_changed();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Appends a single character to the end of the stored text. This may be a
|
||
|
* wide character, up to 16 bits in Unicode.
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
append_unicode_char(char32_t character) {
|
||
|
#if WCHAR_MAX >= 0x10FFFF
|
||
|
// wchar_t might be UTF-32.
|
||
|
_wtext = get_wtext() + std::wstring(1, (wchar_t)character);
|
||
|
#else
|
||
|
if ((character & ~0xffff) == 0) {
|
||
|
_wtext = get_wtext() + std::wstring(1, (wchar_t)character);
|
||
|
} else {
|
||
|
// Encode as a surrogate pair.
|
||
|
uint32_t v = (uint32_t)character - 0x10000u;
|
||
|
wchar_t wstr[2] = {
|
||
|
(wchar_t)((v >> 10u) | 0xd800u),
|
||
|
(wchar_t)((v & 0x3ffu) | 0xdc00u),
|
||
|
};
|
||
|
_wtext = get_wtext() + std::wstring(wstr, 2);
|
||
|
}
|
||
|
#endif
|
||
|
_flags = (_flags | F_got_wtext) & ~F_got_text;
|
||
|
text_changed();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the number of characters in the stored text. This is a count of
|
||
|
* wide characters, after the string has been decoded according to
|
||
|
* set_encoding().
|
||
|
*/
|
||
|
INLINE size_t TextEncoder::
|
||
|
get_num_chars() const {
|
||
|
return get_wtext().length();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the Unicode value of the nth character in the stored text. This
|
||
|
* may be a wide character (greater than 255), after the string has been
|
||
|
* decoded according to set_encoding().
|
||
|
*/
|
||
|
INLINE int TextEncoder::
|
||
|
get_unicode_char(size_t index) const {
|
||
|
get_wtext();
|
||
|
if (index < _wtext.length()) {
|
||
|
return _wtext[index];
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Sets the Unicode value of the nth character in the stored text. This may
|
||
|
* be a wide character (greater than 255), after the string has been decoded
|
||
|
* according to set_encoding().
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
set_unicode_char(size_t index, char32_t character) {
|
||
|
get_wtext();
|
||
|
if (index < _wtext.length()) {
|
||
|
_wtext[index] = character;
|
||
|
_flags &= ~F_got_text;
|
||
|
text_changed();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the nth char of the stored text, as a one-, two-, or three-byte
|
||
|
* encoded string.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
get_encoded_char(size_t index) const {
|
||
|
return get_encoded_char(index, get_encoding());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the nth char of the stored text, as a one-, two-, or three-byte
|
||
|
* encoded string.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
get_encoded_char(size_t index, TextEncoder::Encoding encoding) const {
|
||
|
std::wstring wch(1, (wchar_t)get_unicode_char(index));
|
||
|
return encode_wtext(wch, encoding);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the text associated with the node, converted as nearly as possible
|
||
|
* to a fully-ASCII representation. This means replacing accented letters
|
||
|
* with their unaccented ASCII equivalents.
|
||
|
*
|
||
|
* It is possible that some characters in the string cannot be converted to
|
||
|
* ASCII. (The string may involve symbols like the copyright symbol, for
|
||
|
* instance, or it might involve letters in some other alphabet such as Greek
|
||
|
* or Cyrillic, or even Latin letters like thorn or eth that are not part of
|
||
|
* the ASCII character set.) In this case, as much of the string as possible
|
||
|
* will be converted to ASCII, and the nonconvertible characters will remain
|
||
|
* encoded in the encoding specified by set_encoding().
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
get_text_as_ascii() const {
|
||
|
return encode_wtext(get_wtext_as_ascii());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given the indicated text string, which is assumed to be encoded via the
|
||
|
* encoding "from", decodes it and then reencodes it into the encoding "to",
|
||
|
* and returns the newly encoded string. This does not change or affect any
|
||
|
* properties on the TextEncoder itself.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
reencode_text(const std::string &text, TextEncoder::Encoding from,
|
||
|
TextEncoder::Encoding to) {
|
||
|
return encode_wtext(decode_text(text, from), to);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns true if the indicated character is an alphabetic letter, false
|
||
|
* otherwise. This is akin to ctype's isalpha(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE bool TextEncoder::
|
||
|
unicode_isalpha(char32_t character) {
|
||
|
const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
|
||
|
if (entry == nullptr) {
|
||
|
return false;
|
||
|
}
|
||
|
return entry->_char_type == UnicodeLatinMap::CT_upper ||
|
||
|
entry->_char_type == UnicodeLatinMap::CT_lower;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns true if the indicated character is a numeric digit, false
|
||
|
* otherwise. This is akin to ctype's isdigit(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE bool TextEncoder::
|
||
|
unicode_isdigit(char32_t character) {
|
||
|
const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
|
||
|
if (entry == nullptr) {
|
||
|
// The digits aren't actually listed in the map.
|
||
|
return (character >= '0' && character <= '9');
|
||
|
}
|
||
|
// This silly test (!= 0) is necessary to prevent a VC++ warning.
|
||
|
return (isdigit(entry->_ascii_equiv) != 0);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns true if the indicated character is a punctuation mark, false
|
||
|
* otherwise. This is akin to ctype's ispunct(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE bool TextEncoder::
|
||
|
unicode_ispunct(char32_t character) {
|
||
|
const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
|
||
|
if (entry == nullptr) {
|
||
|
// Some punctuation marks aren't listed in the map.
|
||
|
return (character < 128 && ispunct(character));
|
||
|
}
|
||
|
return entry->_char_type == UnicodeLatinMap::CT_punct;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns true if the indicated character is an uppercase letter, false
|
||
|
* otherwise. This is akin to ctype's isupper(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE bool TextEncoder::
|
||
|
unicode_isupper(char32_t character) {
|
||
|
const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
|
||
|
if (entry == nullptr) {
|
||
|
return false;
|
||
|
}
|
||
|
return entry->_char_type == UnicodeLatinMap::CT_upper;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns true if the indicated character is a whitespace letter, false
|
||
|
* otherwise. This is akin to ctype's isspace(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE bool TextEncoder::
|
||
|
unicode_isspace(char32_t character) {
|
||
|
switch (character) {
|
||
|
case ' ':
|
||
|
case '\t':
|
||
|
case '\n':
|
||
|
return true;
|
||
|
|
||
|
default:
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns true if the indicated character is a lowercase letter, false
|
||
|
* otherwise. This is akin to ctype's islower(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE bool TextEncoder::
|
||
|
unicode_islower(char32_t character) {
|
||
|
const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
|
||
|
if (entry == nullptr) {
|
||
|
return false;
|
||
|
}
|
||
|
return entry->_char_type == UnicodeLatinMap::CT_lower;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the uppercase equivalent of the given Unicode character. This is
|
||
|
* akin to ctype's toupper(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE int TextEncoder::
|
||
|
unicode_toupper(char32_t character) {
|
||
|
const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
|
||
|
if (entry == nullptr) {
|
||
|
return character;
|
||
|
}
|
||
|
return entry->_toupper_character;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the uppercase equivalent of the given Unicode character. This is
|
||
|
* akin to ctype's tolower(), extended to Unicode.
|
||
|
*/
|
||
|
INLINE int TextEncoder::
|
||
|
unicode_tolower(char32_t character) {
|
||
|
const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
|
||
|
if (entry == nullptr) {
|
||
|
return character;
|
||
|
}
|
||
|
return entry->_tolower_character;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Converts the string to uppercase, assuming the string is encoded in the
|
||
|
* default encoding.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
upper(const std::string &source) {
|
||
|
return upper(source, get_default_encoding());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Converts the string to uppercase, assuming the string is encoded in the
|
||
|
* indicated encoding.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
upper(const std::string &source, TextEncoder::Encoding encoding) {
|
||
|
TextEncoder encoder;
|
||
|
encoder.set_encoding(encoding);
|
||
|
encoder.set_text(source);
|
||
|
encoder.make_upper();
|
||
|
return encoder.get_text();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Converts the string to lowercase, assuming the string is encoded in the
|
||
|
* default encoding.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
lower(const std::string &source) {
|
||
|
return lower(source, get_default_encoding());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Converts the string to lowercase, assuming the string is encoded in the
|
||
|
* indicated encoding.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
lower(const std::string &source, TextEncoder::Encoding encoding) {
|
||
|
TextEncoder encoder;
|
||
|
encoder.set_encoding(encoding);
|
||
|
encoder.set_text(source);
|
||
|
encoder.make_lower();
|
||
|
return encoder.get_text();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Changes the text that is stored in the encoder. Subsequent calls to
|
||
|
* get_wtext() will return this same string, while get_text() will return the
|
||
|
* encoded version of the string.
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
set_wtext(const std::wstring &wtext) {
|
||
|
if (!has_text() || _wtext != wtext) {
|
||
|
_wtext = wtext;
|
||
|
_flags = (_flags | F_got_wtext) & ~F_got_text;
|
||
|
text_changed();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the text associated with the TextEncoder, as a wide-character
|
||
|
* string.
|
||
|
*/
|
||
|
INLINE const std::wstring &TextEncoder::
|
||
|
get_wtext() const {
|
||
|
if ((_flags & F_got_wtext) == 0) {
|
||
|
((TextEncoder *)this)->_wtext = decode_text(_text);
|
||
|
((TextEncoder *)this)->_flags |= F_got_wtext;
|
||
|
}
|
||
|
return _wtext;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Appends the indicates string to the end of the stored wide-character text.
|
||
|
*/
|
||
|
INLINE void TextEncoder::
|
||
|
append_wtext(const std::wstring &wtext) {
|
||
|
if (!wtext.empty()) {
|
||
|
_wtext = get_wtext() + wtext;
|
||
|
_flags = (_flags | F_got_wtext) & ~F_got_text;
|
||
|
text_changed();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Encodes a wide-text string into a single-char string, according to the
|
||
|
* current encoding.
|
||
|
*/
|
||
|
INLINE std::string TextEncoder::
|
||
|
encode_wtext(const std::wstring &wtext) const {
|
||
|
return encode_wtext(wtext, _encoding);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the given wstring decoded to a single-byte string, via the current
|
||
|
* encoding system.
|
||
|
*/
|
||
|
INLINE std::wstring TextEncoder::
|
||
|
decode_text(const std::string &text) const {
|
||
|
return decode_text(text, _encoding);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Uses the current default encoding to output the wstring.
|
||
|
*/
|
||
|
INLINE std::ostream &
|
||
|
operator << (std::ostream &out, const std::wstring &str) {
|
||
|
TextEncoder encoder;
|
||
|
encoder.set_wtext(str);
|
||
|
out << encoder.get_text();
|
||
|
return out;
|
||
|
}
|