diff options
author | Adam Borowski <kilobyte@angband.pl> | 2010-09-16 16:07:06 +0200 |
---|---|---|
committer | Adam Borowski <kilobyte@angband.pl> | 2010-09-16 16:07:06 +0200 |
commit | 0fac9b0ec9f22978b94594373d65c5a70a4441fd (patch) | |
tree | 0771ba1208db794d38bb2fc7e305c46fec329e69 /crawl-ref/source/unicode.cc | |
parent | 5c062a1c50e2b2554ae401631f168ac3e64d77b8 (diff) | |
download | crawl-ref-0fac9b0ec9f22978b94594373d65c5a70a4441fd.tar.gz crawl-ref-0fac9b0ec9f22978b94594373d65c5a70a4441fd.zip |
Basic charset conversions: local<->utf8<->utf16.
Their intended use is:
* local: config files, morgues. File names (non-Windows).
* utf8: everything internal. Data files (des, database).
* utf16: (Windows only) file names and similar syscalls.
On Unix systems, the "local" charset is in a vast majority of cases UTF-8,
but we can't rely on that and have to convert anyway.
Diffstat (limited to 'crawl-ref/source/unicode.cc')
-rw-r--r-- | crawl-ref/source/unicode.cc | 217 |
1 files changed, 217 insertions, 0 deletions
diff --git a/crawl-ref/source/unicode.cc b/crawl-ref/source/unicode.cc new file mode 100644 index 0000000000..7c043fb00e --- /dev/null +++ b/crawl-ref/source/unicode.cc @@ -0,0 +1,217 @@ +/* + * File: unicode.cc + * Summary: Conversions between Unicode and local charsets, string + * manipulation functions that act on character types. + * Written by: Adam Borowski + */ + +#include "AppHdr.h" + +#include <locale.h> +#include <string> +#include <string.h> +#include <limits.h> + +#include "unicode.h" + +// there must be at least 4 bytes free, NOT CHECKED! +int wctoutf8(char *d, ucs_t s) +{ + if (s < 0x80) + { + d[0] = s; + return 1; + } + if (s < 0x800) + { + d[0] = ( s >> 6) | 0xc0; + d[1] = ( s & 0x3f) | 0x80; + return 2; + } + if (s < 0x10000) + { + d[0] = ( s >> 12) | 0xe0; + d[1] = ((s >> 6) & 0x3f) | 0x80; + d[2] = ( s & 0x3f) | 0x80; + return 3; + } + if (s < 0x110000) + { + d[0] = ( s >> 18) | 0xf0; + d[1] = ((s >> 12) & 0x3f) | 0x80; + d[2] = ((s >> 6) & 0x3f) | 0x80; + d[3] = ( s & 0x3f) | 0x80; + return 4; + } + // Invalid char marker (U+FFFD). + d[0] = 0xef; + d[1] = 0xbf; + d[2] = 0xbd; + return 3; +} + +int utf8towc(ucs_t *d, const char *s) +{ + if (*s == 0) + { + *d = 0; + return 0; + } + if (!(*s & 0x80)) + { + *d = *s; + return 1; + } + if ((*s & 0xc0) == 0x80) + { // bare tail, invalid + *d = 0xFFFD; + int bad = 0; + do bad++; while((s[bad] & 0xc0) == 0x80); + return bad; + } + + int cnt; + ucs_t c; + if ((*s & 0xe0) == 0xc0) + cnt=2, c = *s & 0x1f; + else if ((*s & 0xf0) == 0xe0) + cnt=3, c = *s & 0x0f; + else if ((*s & 0xf8) == 0xf0) + cnt=4, c =*s & 0x07; + /* valid UTF-8, invalid Unicode + else if ((*s & 0xfc) == 0xf8) + cnt=5, c = *s & 0x03; + else if ((*s & 0xfe) == 0xfc) + cnt=6, c = *s & 0x01; + */ + else + { // 0xfe or 0xff, invalid + *d = 0xFFFD; + return 1; + } + + for (int i = 1; i < cnt; i++) + { + if ((s[i] & 0xc0) != 0x80) + { // only tail characters are allowed here, invalid + *d = 0xFFFD; + return i; + } + c = (c << 6) | (s[i] & 0x3f); + } + + if (c < 0xA0 // illegal characters + || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogates + || (cnt == 3 && c < 0x800) // overlong characters + || (cnt == 4 && c < 0x10000) // overlong characters + || c > 0x10FFFF) // outside Unicode + { + c = 0xFFFD; + } + *d = c; + return cnt; +} + +#ifdef TARGET_OS_WINDOWS +std::wstring utf8_to_16(const char *s) +{ + std::wstring d; + ucs_t c; + + while(int l = utf8towc(&c, s)) + { + s += l; + if (c >= 0x10000) + { + c -= 0x10000; + d.push_back(0xD800 + (c >> 10)); + d.push_back(0xDC00 + (c & 0x3FF)); + } + else + d.push_back(c); + } + return d; +} + +std::string utf16_to_8(const wchar_t *s) +{ + std::string d; + ucs_t c; + + while(*s) + { + if (*s >= 0xD800 && *s <= 0xDBFF) + if (s[1] >= 0xDC00 && s[1] <= 0xDFFF) + { + c = (((ucs_t)s[0]) << 10) + s[1] - 0x35fdc00; + s++; + } + else + c = 0xFFFD; // leading surrogate without its tail + else if (*s >= 0xDC00 && *s <= 0xDFFF) + c = 0xFFFD; // unpaired trailing surrogate + else + c = *s; + s++; + + char buf[4]; + int l = wctoutf8(buf, c); + for (int i = 0; i < l; i++) + d.push_back(buf[i]); + } + + return d; +} +#endif + +std::string utf8_to_mb(const char *s) +{ + std::string d; + ucs_t c; + int l; + mbstate_t ps; + + memset(&ps, 0, sizeof(ps)); + while((l = utf8towc(&c, s))) + { + s += l; + + char buf[MB_LEN_MAX]; + int r = wcrtomb(buf, c, &ps); + if (r != -1) + { + for (int i = 0; i < l; i++) + d.push_back(buf[i]); + } + else + d.push_back('?'); // TODO: try to transliterate + } + return d; +} + +std::string mb_to_utf8(const char *s) +{ + std::string d; + wchar_t c; + int l; + mbstate_t ps; + + memset(&ps, 0, sizeof(ps)); + // the input is zero-terminated, so third argument doesn't matter + while((l = mbrtowc(&c, s, MB_LEN_MAX, &ps))) + { + if (l > 0) + s += l; + else + { // invalid input, mark it and try to recover + s++; + c = 0xFFFD; + } + + char buf[4]; + int r = wctoutf8(buf, c); + for (int i = 0; i < r; i++) + d.push_back(buf[i]); + } + return d; +} |