aboutsummaryrefslogtreecommitdiffstats
path: root/util.c
diff options
context:
space:
mode:
author gotmor <gotmor@f2baff5b-bf2c-0410-a398-912abdc3d8b2> 2009-02-12 11:52:16 +0000
committer gotmor <gotmor@f2baff5b-bf2c-0410-a398-912abdc3d8b2> 2009-02-12 11:52:16 +0000
commit 7f11c68a439e10e97bee7964efe844818cff9bd5 (patch)
tree 4bae704f88518de7473f51b89e67a104f592e2a9 /util.c
parent 33fd99257cc46ce90313734bbd1c0821f7792f88 (diff)
downloaddzen-7f11c68a439e10e97bee7964efe844818cff9bd5.tar.gz
dzen-7f11c68a439e10e97bee7964efe844818cff9bd5.zip
replaced libc strlen() with optimized strlen_utf8() version
git-svn-id: http://dzen.googlecode.com/svn/trunk@246 f2baff5b-bf2c-0410-a398-912abdc3d8b2
Diffstat (limited to 'util.c')
-rw-r--r-- util.c 56
1 files changed, 56 insertions, 0 deletions
diff --git a/util.c b/util.c
index ced6368..b931b35 100644
--- a/util.c
+++ b/util.c
@@ -13,6 +13,8 @@
#include <sys/wait.h>
#include <unistd.h>
+#define ONEMASK ((size_t)(-1) / 0xFF) /* size_t with 0x01 in every byte (0x0101...01); ONEMASK * 0x80 has 0x80 in every byte */
+
void *
emalloc(unsigned int size) {
void *res = malloc(size);
@@ -62,3 +64,57 @@ spawn(const char *arg) {
wait(0);
}
+/* http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
+ *
+ * Count the characters in the NUL-terminated string _s, treating it as
+ * UTF-8: every byte counts as one character except bytes of the form
+ * 10xxxxxx (continuation bytes). The input is not validated.
+ *
+ * Works in three phases: byte-at-a-time until s is aligned to
+ * sizeof(size_t), then one size_t word at a time until a word contains a
+ * zero byte, then byte-at-a-time up to the terminator.
+ *
+ * Returns the number of bytes before the terminator minus the number of
+ * continuation bytes seen.
+ *
+ * NOTE(review): the word loop loads through a size_t pointer cast from
+ * const char * and can read bytes that lie after the terminator inside
+ * the final aligned word -- confirm this is acceptable for the compilers
+ * and tools in use (strict aliasing, sanitizers).
+ * NOTE(review): uintptr_t needs <stdint.h>; that include is not visible
+ * in this hunk -- confirm it is present in util.c.
+ */
+size_t
+strlen_utf8(const char * _s)
+{
+ const char * s;
+ size_t count = 0;
+ size_t u;
+ unsigned char b;
+
+ /* Handle any initial misaligned bytes.
+  * The loop runs while the low bits of the address are non-zero, i.e.
+  * until s is a multiple of sizeof(size_t). */
+ for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
+ b = *s;
+
+ /* Exit if we hit a zero byte.
+  * s is left pointing at the terminator, so the final subtraction at
+  * done: still sees the full byte length. */
+ if (b == '\0')
+ goto done;
+
+ /* Is this byte NOT the first byte of a character?
+  * Adds 1 when bit 7 is set and bit 6 is clear (10xxxxxx).
+  * NOTE(review): ~b is a negative int after promotion, so the result
+  * of shifting it right is implementation-defined -- confirm for the
+  * target compilers. */
+ count += (b >> 7) & ((~b) >> 6);
+ }
+
+ /* Handle complete blocks.
+  * s is aligned here; each iteration consumes sizeof(size_t) bytes. */
+ for (; ; s += sizeof(size_t)) {
+ /* Prefetch 256 bytes ahead.
+  * The (0, 0) arguments request a read prefetch with no temporal
+  * locality (see the GCC __builtin_prefetch documentation). */
+ __builtin_prefetch(&s[256], 0, 0);
+
+ /* Grab 4 or 8 bytes of UTF-8 data. */
+ u = *(size_t *)(s);
+
+ /* Exit the loop if there are any zero bytes.
+  * The word containing the terminator is not counted here; it is left
+  * for the byte loop below. */
+ if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
+ break;
+
+ /* Count bytes which are NOT the first byte of a character.
+  * First u is reduced to a 0/1 flag in the low bit of each byte (1 for
+  * 10xxxxxx bytes); multiplying by ONEMASK then accumulates the sum of
+  * all flags into the most significant byte, which is shifted down. */
+ u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
+ count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
+ }
+
+ /* Take care of any left-over bytes.
+  * Rescans the last word byte by byte up to the terminator. */
+ for (; ; s++) {
+ b = *s;
+
+ /* Exit if we hit a zero byte. */
+ if (b == '\0')
+ break;
+
+ /* Is this byte NOT the first byte of a character?
+  * Same 10xxxxxx test as in the alignment loop above. */
+ count += (b >> 7) & ((~b) >> 6);
+ }
+
+done:
+ return ((s - _s) - count);
+}
+