/*
 * strlen_utf8 - count the characters (code points) in a UTF-8 string.
 *
 * Provenance: added to dzen util.c in svn trunk r246
 * (commit 7f11c68a439e10e97bee7964efe844818cff9bd5, gotmor, 2009-02-12).
 *
 * _s must point to a NUL-terminated string and must not be NULL.
 *
 * Returns the number of bytes before the terminator that are not UTF-8
 * continuation bytes (bit pattern 10xxxxxx).  For well-formed UTF-8 this
 * is the number of encoded characters; for pure ASCII it equals strlen().
 * The input is not validated: continuation bytes are never counted, and
 * every other byte counts as exactly one character.
 *
 * The scan deliberately walks the string one byte at a time through an
 * unsigned char pointer.  Loading size_t-sized words from the char buffer
 * would read bytes past the terminator and access the data through an
 * incompatible type, both of which are undefined behaviour in ISO C.
 *
 * Background on the counting rule:
 * http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
 */
size_t
strlen_utf8(const char * _s)
{
	const unsigned char *p;
	size_t chars = 0;

	for (p = (const unsigned char *)_s; *p != '\0'; p++) {
		/* Every byte except a continuation byte starts a character. */
		if ((*p & 0xC0) != 0x80)
			chars++;
	}

	return chars;
}