summaryrefslogtreecommitdiffstats
path: root/crawl-ref/source/unicode.cc
diff options
context:
space:
mode:
authorAdam Borowski <kilobyte@angband.pl>2010-09-16 16:07:06 +0200
committerAdam Borowski <kilobyte@angband.pl>2010-09-16 16:07:06 +0200
commit0fac9b0ec9f22978b94594373d65c5a70a4441fd (patch)
tree0771ba1208db794d38bb2fc7e305c46fec329e69 /crawl-ref/source/unicode.cc
parent5c062a1c50e2b2554ae401631f168ac3e64d77b8 (diff)
downloadcrawl-ref-0fac9b0ec9f22978b94594373d65c5a70a4441fd.tar.gz
crawl-ref-0fac9b0ec9f22978b94594373d65c5a70a4441fd.zip
Basic charset conversions: local<->utf8<->utf16.
Their intended use is: * local: config files, morgues. File names (non-Windows). * utf8: everything internal. Data files (des, database). * utf16: (Windows only) file names and similar syscalls. On Unix systems, the "local" charset is in a vast majority of cases UTF-8, but we can't rely on that and have to convert anyway.
Diffstat (limited to 'crawl-ref/source/unicode.cc')
-rw-r--r--crawl-ref/source/unicode.cc217
1 files changed, 217 insertions, 0 deletions
diff --git a/crawl-ref/source/unicode.cc b/crawl-ref/source/unicode.cc
new file mode 100644
index 0000000000..7c043fb00e
--- /dev/null
+++ b/crawl-ref/source/unicode.cc
@@ -0,0 +1,217 @@
+/*
+ * File: unicode.cc
+ * Summary: Conversions between Unicode and local charsets, string
+ * manipulation functions that act on character types.
+ * Written by: Adam Borowski
+ */
+
+#include "AppHdr.h"
+
+#include <locale.h>
+#include <string>
+#include <string.h>
+#include <limits.h>
+
+#include "unicode.h"
+
+// there must be at least 4 bytes free, NOT CHECKED!
+int wctoutf8(char *d, ucs_t s)
+{
+ if (s < 0x80)
+ {
+ d[0] = s;
+ return 1;
+ }
+ if (s < 0x800)
+ {
+ d[0] = ( s >> 6) | 0xc0;
+ d[1] = ( s & 0x3f) | 0x80;
+ return 2;
+ }
+ if (s < 0x10000)
+ {
+ d[0] = ( s >> 12) | 0xe0;
+ d[1] = ((s >> 6) & 0x3f) | 0x80;
+ d[2] = ( s & 0x3f) | 0x80;
+ return 3;
+ }
+ if (s < 0x110000)
+ {
+ d[0] = ( s >> 18) | 0xf0;
+ d[1] = ((s >> 12) & 0x3f) | 0x80;
+ d[2] = ((s >> 6) & 0x3f) | 0x80;
+ d[3] = ( s & 0x3f) | 0x80;
+ return 4;
+ }
+ // Invalid char marker (U+FFFD).
+ d[0] = 0xef;
+ d[1] = 0xbf;
+ d[2] = 0xbd;
+ return 3;
+}
+
+int utf8towc(ucs_t *d, const char *s)
+{
+ if (*s == 0)
+ {
+ *d = 0;
+ return 0;
+ }
+ if (!(*s & 0x80))
+ {
+ *d = *s;
+ return 1;
+ }
+ if ((*s & 0xc0) == 0x80)
+ { // bare tail, invalid
+ *d = 0xFFFD;
+ int bad = 0;
+ do bad++; while((s[bad] & 0xc0) == 0x80);
+ return bad;
+ }
+
+ int cnt;
+ ucs_t c;
+ if ((*s & 0xe0) == 0xc0)
+ cnt=2, c = *s & 0x1f;
+ else if ((*s & 0xf0) == 0xe0)
+ cnt=3, c = *s & 0x0f;
+ else if ((*s & 0xf8) == 0xf0)
+ cnt=4, c =*s & 0x07;
+ /* valid UTF-8, invalid Unicode
+ else if ((*s & 0xfc) == 0xf8)
+ cnt=5, c = *s & 0x03;
+ else if ((*s & 0xfe) == 0xfc)
+ cnt=6, c = *s & 0x01;
+ */
+ else
+ { // 0xfe or 0xff, invalid
+ *d = 0xFFFD;
+ return 1;
+ }
+
+ for (int i = 1; i < cnt; i++)
+ {
+ if ((s[i] & 0xc0) != 0x80)
+ { // only tail characters are allowed here, invalid
+ *d = 0xFFFD;
+ return i;
+ }
+ c = (c << 6) | (s[i] & 0x3f);
+ }
+
+ if (c < 0xA0 // illegal characters
+ || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogates
+ || (cnt == 3 && c < 0x800) // overlong characters
+ || (cnt == 4 && c < 0x10000) // overlong characters
+ || c > 0x10FFFF) // outside Unicode
+ {
+ c = 0xFFFD;
+ }
+ *d = c;
+ return cnt;
+}
+
+#ifdef TARGET_OS_WINDOWS
+std::wstring utf8_to_16(const char *s)
+{
+ std::wstring d;
+ ucs_t c;
+
+ while(int l = utf8towc(&c, s))
+ {
+ s += l;
+ if (c >= 0x10000)
+ {
+ c -= 0x10000;
+ d.push_back(0xD800 + (c >> 10));
+ d.push_back(0xDC00 + (c & 0x3FF));
+ }
+ else
+ d.push_back(c);
+ }
+ return d;
+}
+
+std::string utf16_to_8(const wchar_t *s)
+{
+ std::string d;
+ ucs_t c;
+
+ while(*s)
+ {
+ if (*s >= 0xD800 && *s <= 0xDBFF)
+ if (s[1] >= 0xDC00 && s[1] <= 0xDFFF)
+ {
+ c = (((ucs_t)s[0]) << 10) + s[1] - 0x35fdc00;
+ s++;
+ }
+ else
+ c = 0xFFFD; // leading surrogate without its tail
+ else if (*s >= 0xDC00 && *s <= 0xDFFF)
+ c = 0xFFFD; // unpaired trailing surrogate
+ else
+ c = *s;
+ s++;
+
+ char buf[4];
+ int l = wctoutf8(buf, c);
+ for (int i = 0; i < l; i++)
+ d.push_back(buf[i]);
+ }
+
+ return d;
+}
+#endif
+
+std::string utf8_to_mb(const char *s)
+{
+ std::string d;
+ ucs_t c;
+ int l;
+ mbstate_t ps;
+
+ memset(&ps, 0, sizeof(ps));
+ while((l = utf8towc(&c, s)))
+ {
+ s += l;
+
+ char buf[MB_LEN_MAX];
+ int r = wcrtomb(buf, c, &ps);
+ if (r != -1)
+ {
+ for (int i = 0; i < l; i++)
+ d.push_back(buf[i]);
+ }
+ else
+ d.push_back('?'); // TODO: try to transliterate
+ }
+ return d;
+}
+
+std::string mb_to_utf8(const char *s)
+{
+ std::string d;
+ wchar_t c;
+ int l;
+ mbstate_t ps;
+
+ memset(&ps, 0, sizeof(ps));
+ // the input is zero-terminated, so third argument doesn't matter
+ while((l = mbrtowc(&c, s, MB_LEN_MAX, &ps)))
+ {
+ if (l > 0)
+ s += l;
+ else
+ { // invalid input, mark it and try to recover
+ s++;
+ c = 0xFFFD;
+ }
+
+ char buf[4];
+ int r = wctoutf8(buf, c);
+ for (int i = 0; i < r; i++)
+ d.push_back(buf[i]);
+ }
+ return d;
+}