summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJesse Luehrs <doy@tozt.net>2016-05-03 04:22:09 -0400
committerJesse Luehrs <doy@tozt.net>2016-05-03 04:22:09 -0400
commit9c2a7522054b15ccd95c0c45e2a566376a037b72 (patch)
tree328275b370344a2c13d5e6f74f48f4fddd93ff2e
parentb09da58557d35552cb1ddefa206012fafcbaa95f (diff)
downloadlibvt100-9c2a7522054b15ccd95c0c45e2a566376a037b72.tar.gz
libvt100-9c2a7522054b15ccd95c0c45e2a566376a037b72.zip
recognize most emoji as wide
-rw-r--r--Makefile3
-rw-r--r--src/screen.c2
-rw-r--r--src/unicode-extra.c95
-rw-r--r--src/unicode-extra.h8
-rw-r--r--src/vt100.h1
5 files changed, 107 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index de9518b..b1a6484 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,8 @@ SOUT = libvt100.a
BUILD = build/
SRC = src/
OBJ = $(BUILD)parser.o \
- $(BUILD)screen.o
+ $(BUILD)screen.o \
+ $(BUILD)unicode-extra.o
LIBS = glib-2.0
CFLAGS ?= -g -Wall -Wextra -Werror
LDFLAGS ?= -g -Wall -Wextra -Werror
diff --git a/src/screen.c b/src/screen.c
index 3faa070..c3c6af7 100644
--- a/src/screen.c
+++ b/src/screen.c
@@ -210,7 +210,7 @@ void vt100_screen_show_string_utf8(VT100Screen *vt, char *buf, size_t len)
uc = g_utf8_get_char(c);
/* XXX handle zero width characters */
- is_wide = g_unichar_iswide(uc);
+ is_wide = vt100_is_wide_char(uc);
ctype = g_unichar_type(uc);
/* XXX should this also include spacing marks? */
is_combining = ctype == G_UNICODE_ENCLOSING_MARK
diff --git a/src/unicode-extra.c b/src/unicode-extra.c
new file mode 100644
index 0000000..3a59378
--- /dev/null
+++ b/src/unicode-extra.c
@@ -0,0 +1,95 @@
+#include <glib.h>
+
+#include "vt100.h"
+
+/*
+ * so, here's the story. unicode doesn't actually define monospace width for
+ * characters in a way that's useful. there's an "east asian width" property
+ * that mostly works, but leaves a bunch of things ambiguous (if you're
+ * displaying this as part of a bunch of east asian text, then it's wide, but
+ * if you're not, then it's narrow). we in general treat ambiguous characters
+ * as narrow for now (although this should perhaps be an option in the future).
+ * one place where this does not work out, though, is emoji. emoji do not have
+ * a useful width property (see
+ * http://www.unicode.org/L2/L2016/16027-emoji-terminals-eaw.pdf), and even the
+ * proposal in that link to make every character with Emoji_Presentation=true a
+ * wide character isn't really how things work in practice - for instance,
+ * U+231A (WATCH) is rendered by most monospace fonts (that I've seen, anyway)
+ * as a narrow character. as far as i can tell, it appears (although i'm not
+ * certain of this) that all BMP characters with Emoji_Presentation=true are
+ * narrow and all astral plane characters with Emoji_Presentation=true are
+ * wide, so that's what i'm going to go with here. character ranges and data in
+ * this file are taken from
+ * http://www.unicode.org/Public/emoji/2.0/emoji-data.txt.
+ */
+struct vt100_char_range { /* one contiguous run of code points; both bounds are inclusive */
+ gunichar start; /* lowest code point in the run */
+ gunichar end; /* highest code point in the run (equal to start for a single character) */
+};
+
+/*
+ * code point ranges that vt100_is_wide_emoji() reports as wide. the table is
+ * read-only, and it is searched with a binary search, so entries must stay
+ * sorted by ascending start and must not overlap - keep that invariant when
+ * adding ranges.
+ */
+static const struct vt100_char_range vt100_wide_emoji[] = {
+    { 0x1F004, 0x1F004 },
+    { 0x1F0CF, 0x1F0CF },
+    { 0x1F18E, 0x1F18E },
+    { 0x1F191, 0x1F19A },
+    { 0x1F1E6, 0x1F1FF },
+    { 0x1F201, 0x1F201 },
+    { 0x1F21A, 0x1F21A },
+    { 0x1F22F, 0x1F22F },
+    { 0x1F232, 0x1F236 },
+    { 0x1F238, 0x1F23A },
+    { 0x1F250, 0x1F251 },
+    { 0x1F300, 0x1F320 },
+    { 0x1F32D, 0x1F335 },
+    { 0x1F337, 0x1F37C },
+    { 0x1F37E, 0x1F393 },
+    { 0x1F3A0, 0x1F3CA },
+    { 0x1F3CF, 0x1F3D3 },
+    { 0x1F3E0, 0x1F3F0 },
+    { 0x1F3F4, 0x1F3F4 },
+    { 0x1F3F8, 0x1F43E },
+    { 0x1F440, 0x1F440 },
+    { 0x1F442, 0x1F4FC },
+    { 0x1F4FF, 0x1F53D },
+    { 0x1F54B, 0x1F54E },
+    { 0x1F550, 0x1F567 },
+    { 0x1F595, 0x1F596 },
+    { 0x1F5FB, 0x1F64F },
+    { 0x1F680, 0x1F6C5 },
+    { 0x1F6CC, 0x1F6CC },
+    { 0x1F6D0, 0x1F6D0 },
+    { 0x1F6EB, 0x1F6EC },
+    { 0x1F910, 0x1F918 },
+    { 0x1F980, 0x1F984 },
+    { 0x1F9C0, 0x1F9C0 },
+};
+
+/* forward declaration; the definition follows vt100_is_wide_char() */
+static int vt100_is_wide_emoji(gunichar codepoint);
+
+/*
+ * reports whether codepoint should be treated as a wide character. returns 1
+ * when either glib's g_unichar_iswide() or the local wide-emoji lookup accepts
+ * it, and 0 otherwise. the emoji lookup is only consulted when glib says the
+ * character is not wide.
+ */
+int vt100_is_wide_char(gunichar codepoint)
+{
+    if (g_unichar_iswide(codepoint)) {
+        return 1;
+    }
+
+    return vt100_is_wide_emoji(codepoint) ? 1 : 0;
+}
+
+/*
+ * looks codepoint up in vt100_wide_emoji with a binary search. returns 1 if it
+ * falls inside any range in the table (bounds inclusive), 0 otherwise. relies
+ * on the table being sorted by ascending start with no overlapping ranges.
+ */
+static int vt100_is_wide_emoji(gunichar codepoint)
+{
+    const size_t count = sizeof(vt100_wide_emoji) / sizeof(vt100_wide_emoji[0]);
+    /*
+     * the search window is the half-open interval [low, high). keeping it
+     * half-open lets both indices stay unsigned (no "high = -1" state) and
+     * makes an empty table fall straight through to "not found".
+     */
+    size_t low = 0, high = count;
+
+    while (low < high) {
+        /* written this way so the midpoint cannot overflow */
+        size_t mid = low + (high - low) / 2;
+        const struct vt100_char_range *range = &vt100_wide_emoji[mid];
+
+        if (codepoint < range->start) {
+            high = mid;
+        }
+        else if (codepoint > range->end) {
+            low = mid + 1;
+        }
+        else {
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/src/unicode-extra.h b/src/unicode-extra.h
new file mode 100644
index 0000000..eaa3dce
--- /dev/null
+++ b/src/unicode-extra.h
@@ -0,0 +1,8 @@
+#ifndef _VT100_UNICODE_EXTRA_H
+#define _VT100_UNICODE_EXTRA_H
+
+#include <glib.h> /* for gunichar */
+
+int vt100_is_wide_char(gunichar codepoint); /* nonzero if codepoint is wide according to g_unichar_iswide() or the wide-emoji table in unicode-extra.c, 0 otherwise */
+
+#endif /* _VT100_UNICODE_EXTRA_H */
diff --git a/src/vt100.h b/src/vt100.h
index a3429ef..28e02b8 100644
--- a/src/vt100.h
+++ b/src/vt100.h
@@ -6,5 +6,6 @@ struct vt100_screen;
typedef struct vt100_screen VT100Screen;
#include "screen.h"
+#include "unicode-extra.h"
#endif