A reader for Windows-style BOMmed files.

author: Adam Borowski <kilobyte@angband.pl> 2010-12-16 15:09:40 +0100
committer: Adam Borowski <kilobyte@angband.pl> 2010-12-16 16:53:03 +0100
commit: 64ca3ade493233cf0d4c43dfc68cf22622d23b76 (patch)
tree: 0779de71d31ab8390c93db4ea9b9a05c842b2dad /crawl-ref/source/unicode.cc
parent: 851cdeb3d20406d126469933afefbec5e233a417 (diff)
download: crawl-ref-64ca3ade493233cf0d4c43dfc68cf22622d23b76.tar.gz
crawl-ref-64ca3ade493233cf0d4c43dfc68cf22622d23b76.zip
1 files changed, 201 insertions, 2 deletions
diff --git a/crawl-ref/source/unicode.cc b/crawl-ref/source/unicode.cc
index 7c043fb00e..87d43576e1 100644
--- a/crawl-ref/source/unicode.cc
+++ b/crawl-ref/source/unicode.cc
@@ -12,6 +12,7 @@
 #include <string.h>
 #include <limits.h>
 
+#include "syscalls.h"
 #include "unicode.h"
 
 // there must be at least 4 bytes free, NOT CHECKED!
@@ -112,7 +113,6 @@ int utf8towc(ucs_t *d, const char *s)
     return cnt;
 }
 
-#ifdef TARGET_OS_WINDOWS
 std::wstring utf8_to_16(const char *s)
 {
     std::wstring d;
@@ -162,7 +162,6 @@ std::string utf16_to_8(const wchar_t *s)
 
     return d;
 }
-#endif
 
 std::string utf8_to_mb(const char *s)
 {
@@ -215,3 +214,203 @@ std::string mb_to_utf8(const char *s)
     }
     return d;
 }
+
+static std::string utf8_validate(const char *s)
+{
+    std::string d;
+    ucs_t c;
+    int l;
+
+    while((l = utf8towc(&c, s)))
+    {
+        s += l;
+
+        char buf[4];
+        int r = wctoutf8(buf, c);
+        for (int i = 0; i < r; i++)
+            d.push_back(buf[i]);
+    }
+    return d;
+}
+
+static bool _check_trail(FILE *f, const char* bytes, int len)
+{
+    while (len--)
+    {
+        if (fgetc(f) != (unsigned char)*bytes++)
+        {
+            rewind(f);
+            return false;
+        }
+    }
+    return true;
+}
+
+TextFileReader::TextFileReader(const char *name)
+{
+    f = fopen_u(name, "r");
+    if (!f)
+    {
+        seen_eof = true;
+        return;
+    }
+    seen_eof = false;
+
+    bom = BOM_NORMAL;
+    int ch = fgetc(f);
+    switch(ch)
+    {
+    case 0xEF:
+        if (_check_trail(f, "\xBB\xBF", 2))
+            bom = BOM_UTF8;
+        break;
+    case 0xFE:
+        if (_check_trail(f, "\xFF", 1))
+            bom = BOM_UTF16BE;
+        break;
+    case 0xFF:
+        if (_check_trail(f, "\xFE\x00\x00", 3))
+            bom = BOM_UTF32LE;
+        else if (_check_trail(f, "\xFF\xFE", 2)) // rewound
+            bom = BOM_UTF16LE;
+        break;
+    case 0x00:
+        if (_check_trail(f, "\x00\xFE\xFF", 3))
+            bom = BOM_UTF32BE;
+        break;
+    default:
+        ungetc(ch, f);
+    }
+}
+
+TextFileReader::~TextFileReader()
+{
+    if (f)
+        fclose(f);
+}
+
+std::string TextFileReader::get_line()
+{
+    ASSERT(f);
+    std::wstring win; // actually, these are more of a lose
+    std::string out;
+    char buf[512];
+    ucs_t c;
+    int len;
+
+    switch(bom)
+    {
+    case BOM_NORMAL:
+        do
+        {
+            if (!fgets(buf, sizeof buf, f))
+            {
+                seen_eof = true;
+                break;
+            }
+            out += buf;
+            if (out[out.length() - 1] == '\n')
+            {
+                out.erase(out.length() - 1);
+                break;
+            }
+        } while (!seen_eof);
+        return mb_to_utf8(out.c_str());
+
+    case BOM_UTF8:
+        do
+        {
+            if (!fgets(buf, sizeof buf, f))
+            {
+                seen_eof = true;
+                break;
+            }
+            out += buf;
+            if (out[out.length() - 1] == '\n')
+            {
+                out.erase(out.length() - 1);
+                break;
+            }
+        } while (!seen_eof);
+        return utf8_validate(out.c_str());
+
+    case BOM_UTF16LE:
+        do
+        {
+            if (fread(buf, 2, 1, f) != 1)
+            {
+                seen_eof = true;
+                break;
+            }
+            c = ((uint32_t)((unsigned char)buf[0]))
+              | ((uint32_t)((unsigned char)buf[1])) << 8;
+            if (c == '\n')
+                break;
+            win.push_back(c);
+        }
+        while (!seen_eof);
+        return utf16_to_8(win.c_str());
+
+    case BOM_UTF16BE:
+        do
+        {
+            if (fread(buf, 2, 1, f) != 1)
+            {
+                seen_eof = true;
+                break;
+            }
+            c = ((uint32_t)((unsigned char)buf[1]))
+              | ((uint32_t)((unsigned char)buf[0])) << 8;
+            if (c == '\n')
+                break;
+            win.push_back(c);
+        }
+        while (!seen_eof);
+        return utf16_to_8(win.c_str());
+
+    case BOM_UTF32LE:
+        do
+        {
+            if (fread(buf, 4, 1, f) != 1)
+            {
+                seen_eof = true;
+                break;
+            }
+            c = ((uint32_t)((unsigned char)buf[0]))
+              | ((uint32_t)((unsigned char)buf[1])) << 8
+              | ((uint32_t)((unsigned char)buf[2])) << 16
+              | ((uint32_t)((unsigned char)buf[3])) << 24;
+            if (c == '\n')
+                break;
+            len = wctoutf8(buf, c);
+            for (int i = 0; i < len; i++)
+                out.push_back(buf[i]);
+        }
+        while (!seen_eof);
+        return out;
+
+    case BOM_UTF32BE:
+        do
+        {
+            if (fread(buf, 4, 1, f) != 1)
+            {
+                seen_eof = true;
+                break;
+            }
+            c = ((uint32_t)((unsigned char)buf[0])) << 24
+              | ((uint32_t)((unsigned char)buf[1])) << 16
+              | ((uint32_t)((unsigned char)buf[2])) << 8
+              | ((uint32_t)((unsigned char)buf[3]));
+            if (c == '\n')
+                break;
+            len = wctoutf8(buf, c);
+            for (int i = 0; i < len; i++)
+                out.push_back(buf[i]);
+        }
+        while (!seen_eof);
+        return out;
+    }
+
+    ASSERT(!"our memory got trampled!");
+    return "говно";
+}
author	Adam Borowski <kilobyte@angband.pl>	2010-12-16 15:09:40 +0100
committer	Adam Borowski <kilobyte@angband.pl>	2010-12-16 16:53:03 +0100
commit	64ca3ade493233cf0d4c43dfc68cf22622d23b76 (patch)
tree	0779de71d31ab8390c93db4ea9b9a05c842b2dad /crawl-ref/source/unicode.cc
parent	851cdeb3d20406d126469933afefbec5e233a417 (diff)
download	crawl-ref-64ca3ade493233cf0d4c43dfc68cf22622d23b76.tar.gz crawl-ref-64ca3ade493233cf0d4c43dfc68cf22622d23b76.zip