From 9c2a7522054b15ccd95c0c45e2a566376a037b72 Mon Sep 17 00:00:00 2001
From: Jesse Luehrs
Date: Tue, 3 May 2016 04:22:09 -0400
Subject: recognize most emoji as wide

---
 Makefile            |   3 +-
 src/screen.c        |   2 +-
 src/unicode-extra.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/unicode-extra.h |   8 ++++
 src/vt100.h         |   1 +
 5 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 src/unicode-extra.c
 create mode 100644 src/unicode-extra.h

diff --git a/Makefile b/Makefile
index de9518b..b1a6484 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,8 @@ SOUT = libvt100.a
 BUILD = build/
 SRC = src/
 OBJ = $(BUILD)parser.o \
-      $(BUILD)screen.o
+      $(BUILD)screen.o \
+      $(BUILD)unicode-extra.o
 LIBS = glib-2.0
 CFLAGS ?= -g -Wall -Wextra -Werror
 LDFLAGS ?= -g -Wall -Wextra -Werror
diff --git a/src/screen.c b/src/screen.c
index 3faa070..c3c6af7 100644
--- a/src/screen.c
+++ b/src/screen.c
@@ -210,7 +210,7 @@ void vt100_screen_show_string_utf8(VT100Screen *vt, char *buf, size_t len)
 
         uc = g_utf8_get_char(c);
         /* XXX handle zero width characters */
-        is_wide = g_unichar_iswide(uc);
+        is_wide = vt100_is_wide_char(uc);
         ctype = g_unichar_type(uc);
         /* XXX should this also include spacing marks? */
         is_combining = ctype == G_UNICODE_ENCLOSING_MARK
diff --git a/src/unicode-extra.c b/src/unicode-extra.c
new file mode 100644
index 0000000..3a59378
--- /dev/null
+++ b/src/unicode-extra.c
@@ -0,0 +1,109 @@
+#include <stddef.h>
+#include <glib.h>
+
+#include "vt100.h"
+
+/*
+ * so, here's the story. unicode doesn't actually define monospace width for
+ * characters in a way that's useful. there's an "east asian width" property
+ * that mostly works, but leaves a bunch of things ambiguous (if you're
+ * displaying this as part of a bunch of east asian text, then it's wide, but
+ * if you're not, then it's narrow). we in general treat ambiguous characters
+ * as narrow for now (although this should perhaps be an option in the future).
+ * one place where this does not work out, though, is emoji. emoji do not have
+ * a useful width property (see
+ * http://www.unicode.org/L2/L2016/16027-emoji-terminals-eaw.pdf), and even the
+ * proposal in that link to make every character with Emoji_Presentation=true a
+ * wide character isn't really how things work in practice - for instance,
+ * U+231A (WATCH) is rendered by most monospace fonts (that I've seen, anyway)
+ * as a narrow character. as far as i can tell, it appears (although i'm not
+ * certain of this) that all BMP characters with Emoji_Presentation=true are
+ * narrow and all astral plane characters with Emoji_Presentation=true are
+ * wide, so that's what i'm going to go with here. character ranges and data in
+ * this file are taken from
+ * http://www.unicode.org/Public/emoji/2.0/emoji-data.txt.
+ */
+struct vt100_char_range {
+    gunichar start;
+    gunichar end;
+};
+
+/*
+ * inclusive code point ranges treated as wide emoji. must stay sorted by
+ * start and non-overlapping, since vt100_is_wide_emoji() binary searches it.
+ */
+static const struct vt100_char_range vt100_wide_emoji[] = {
+    { 0x1F004, 0x1F004 },
+    { 0x1F0CF, 0x1F0CF },
+    { 0x1F18E, 0x1F18E },
+    { 0x1F191, 0x1F19A },
+    { 0x1F1E6, 0x1F1FF },
+    { 0x1F201, 0x1F201 },
+    { 0x1F21A, 0x1F21A },
+    { 0x1F22F, 0x1F22F },
+    { 0x1F232, 0x1F236 },
+    { 0x1F238, 0x1F23A },
+    { 0x1F250, 0x1F251 },
+    { 0x1F300, 0x1F320 },
+    { 0x1F32D, 0x1F335 },
+    { 0x1F337, 0x1F37C },
+    { 0x1F37E, 0x1F393 },
+    { 0x1F3A0, 0x1F3CA },
+    { 0x1F3CF, 0x1F3D3 },
+    { 0x1F3E0, 0x1F3F0 },
+    { 0x1F3F4, 0x1F3F4 },
+    { 0x1F3F8, 0x1F43E },
+    { 0x1F440, 0x1F440 },
+    { 0x1F442, 0x1F4FC },
+    { 0x1F4FF, 0x1F53D },
+    { 0x1F54B, 0x1F54E },
+    { 0x1F550, 0x1F567 },
+    { 0x1F595, 0x1F596 },
+    { 0x1F5FB, 0x1F64F },
+    { 0x1F680, 0x1F6C5 },
+    { 0x1F6CC, 0x1F6CC },
+    { 0x1F6D0, 0x1F6D0 },
+    { 0x1F6EB, 0x1F6EC },
+    { 0x1F910, 0x1F918 },
+    { 0x1F980, 0x1F984 },
+    { 0x1F9C0, 0x1F9C0 },
+};
+
+static int vt100_is_wide_emoji(gunichar codepoint);
+
+/*
+ * returns nonzero if codepoint should occupy two terminal cells: either glib
+ * reports it as wide, or it falls in the wide emoji table above.
+ */
+int vt100_is_wide_char(gunichar codepoint)
+{
+    return g_unichar_iswide(codepoint) || vt100_is_wide_emoji(codepoint);
+}
+
+/*
+ * binary search of vt100_wide_emoji over the half-open index interval
+ * [low, high). returns 1 if codepoint lies inside one of the ranges, 0
+ * otherwise.
+ */
+static int vt100_is_wide_emoji(gunichar codepoint)
+{
+    size_t low = 0;
+    size_t high = sizeof(vt100_wide_emoji) / sizeof(vt100_wide_emoji[0]);
+
+    while (low < high) {
+        size_t cur = low + (high - low) / 2;
+        const struct vt100_char_range *range = &vt100_wide_emoji[cur];
+
+        if (codepoint < range->start) {
+            high = cur;
+        }
+        else if (codepoint > range->end) {
+            low = cur + 1;
+        }
+        else {
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/src/unicode-extra.h b/src/unicode-extra.h
new file mode 100644
index 0000000..eaa3dce
--- /dev/null
+++ b/src/unicode-extra.h
@@ -0,0 +1,8 @@
+#ifndef _VT100_UNICODE_EXTRA_H
+#define _VT100_UNICODE_EXTRA_H
+
+#include <glib.h>
+
+int vt100_is_wide_char(gunichar codepoint);
+
+#endif
diff --git a/src/vt100.h b/src/vt100.h
index a3429ef..28e02b8 100644
--- a/src/vt100.h
+++ b/src/vt100.h
@@ -6,5 +6,6 @@ struct vt100_screen;
 typedef struct vt100_screen VT100Screen;
 
 #include "screen.h"
+#include "unicode-extra.h"
 
 #endif
-- 
cgit v1.2.3