From 2fb5b204d83a3da79dc743fed25f9cad81367a13 Mon Sep 17 00:00:00 2001
From: Jesse Luehrs <doy@tozt.net>
Date: Sat, 13 Mar 2021 14:15:39 -0500
Subject: fix a bunch of utf8 parsing issues

---
 src/blocking/input.rs | 27 +++++++++++-------------
 src/input.rs          | 27 +++++++++++-------------
 src/private.rs        | 58 +++++++++++++++++++++++++++++++--------------------
 3 files changed, 59 insertions(+), 53 deletions(-)
diff --git a/src/blocking/input.rs b/src/blocking/input.rs
index b133e7f..7a35f25 100644
--- a/src/blocking/input.rs
+++ b/src/blocking/input.rs
@@ -180,30 +180,27 @@ impl Input {
     }
 
     fn fill_buf(&mut self) -> Result<bool> {
-        if !self.buf_is_empty() {
-            return Ok(true);
-        }
-
-        self.buf.resize(4096, 0);
-        self.pos = 0;
-        let bytes = read_stdin(&mut self.buf)?;
-        if bytes == 0 {
-            return Ok(false);
+        if self.buf_is_empty() {
+            self.buf.resize(4096, 0);
+            self.pos = 0;
+            let bytes = read_stdin(&mut self.buf)?;
+            if bytes == 0 {
+                return Ok(false);
+            }
+            self.buf.truncate(bytes);
         }
-        self.buf.truncate(bytes);
 
         if self.parse_utf8 {
-            let mut extra = self.find_truncated_utf8();
-            if extra > 0 {
+            let expected_bytes = self.expected_leading_utf8_bytes();
+            if self.buf.len() < self.pos + expected_bytes {
                 let mut cur = self.buf.len();
-                self.buf.resize(4096 + extra, 0);
-                while extra > 0 {
+                self.buf.resize(4096 + expected_bytes, 0);
+                while cur < self.pos + expected_bytes {
                     let bytes = read_stdin(&mut self.buf[cur..])?;
                     if bytes == 0 {
                         return Ok(false);
                     }
                     cur += bytes;
-                    extra = extra.saturating_sub(bytes);
                 }
                 self.buf.truncate(cur);
             }
diff --git a/src/input.rs b/src/input.rs
index e34187e..abcdd7f 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -192,24 +192,22 @@ impl Input {
     }
 
     async fn fill_buf(&mut self) -> Result<bool> {
-        if !self.buf_is_empty() {
-            return Ok(true);
-        }
-
-        self.buf.resize(4096, 0);
-        self.pos = 0;
-        let bytes = read_stdin(&mut self.stdin, &mut self.buf).await?;
-        if bytes == 0 {
-            return Ok(false);
+        if self.buf_is_empty() {
+            self.buf.resize(4096, 0);
+            self.pos = 0;
+            let bytes = read_stdin(&mut self.stdin, &mut self.buf).await?;
+            if bytes == 0 {
+                return Ok(false);
+            }
+            self.buf.truncate(bytes);
         }
-        self.buf.truncate(bytes);
 
         if self.parse_utf8 {
-            let mut extra = self.find_truncated_utf8();
-            if extra > 0 {
+            let expected_bytes = self.expected_leading_utf8_bytes();
+            if self.buf.len() < self.pos + expected_bytes {
                 let mut cur = self.buf.len();
-                self.buf.resize(4096 + extra, 0);
-                while extra > 0 {
+                self.buf.resize(4096 + expected_bytes, 0);
+                while cur < self.pos + expected_bytes {
                     let bytes =
                         read_stdin(&mut self.stdin, &mut self.buf[cur..])
                             .await?;
@@ -217,7 +215,6 @@ impl Input {
                         return Ok(false);
                     }
                     cur += bytes;
-                    extra = extra.saturating_sub(bytes);
                 }
                 self.buf.truncate(cur);
             }
diff --git a/src/private.rs b/src/private.rs
index 0b5d2f2..07dbb4c 100644
--- a/src/private.rs
+++ b/src/private.rs
@@ -39,13 +39,33 @@ pub trait Input {
             .buf()
             .iter()
             .copied()
-            .take_while(|&c| matches!(c, 32..=126 | 128..=255))
+            .take_while(|&c| matches!(c, 32..=126 | 128..=247))
             .collect();
         if !prefix.is_empty() {
-            self.consume(prefix.len());
-            match std::string::String::from_utf8(prefix) {
-                Ok(s) => return Ok(Some(crate::Key::String(s))),
-                Err(e) => return Ok(Some(crate::Key::Bytes(e.into_bytes()))),
+            match std::string::String::from_utf8_lossy(&prefix) {
+                std::borrow::Cow::Borrowed(s) => {
+                    self.consume(s.len());
+                    return Ok(Some(crate::Key::String(s.to_string())));
+                }
+                std::borrow::Cow::Owned(mut s) => {
+                    for (i, window) in s.as_bytes().windows(3).enumerate() {
+                        if window == [0xef, 0xbf, 0xbd] {
+                            if i > 0 {
+                                self.consume(i);
+                                s.truncate(i);
+                                return Ok(Some(crate::Key::String(s)));
+                            } else {
+                                // not quite correct, but figuring out how to
+                                // take only the invalid utf8 seems hard (and
+                                // this should come up very rarely)
+                                self.consume(prefix.len());
+                                return Ok(Some(crate::Key::Bytes(prefix)));
+                            }
+                        }
+                    }
+                    self.consume(s.len());
+                    return Ok(Some(crate::Key::String(s)));
+                }
             }
         }
 
@@ -67,7 +87,8 @@ pub trait Input {
                 28..=31 => true,
                 32..=126 => !self.should_parse_utf8(),
                 127 => !self.should_parse_special_keys(),
-                128..=255 => !self.should_parse_utf8(),
+                128..=247 => !self.should_parse_utf8(),
+                248..=255 => true,
             })
             .collect();
         if !prefix.is_empty() {
@@ -261,6 +282,7 @@ pub trait Input {
                         if (0b1000_0000..=0b1011_1111).contains(&c) {
                             c
                         } else {
+                            self.ungetc(c);
                             fail!()
                         }
                     }
@@ -317,23 +339,13 @@ pub trait Input {
         }
     }
 
-    fn find_truncated_utf8(&self) -> usize {
-        for i in 0..4 {
-            match self.buf()[self.buf().len() - 1 - i] {
-                0b0000_0000..=0b0111_1111 => return 0,
-                0b1100_0000..=0b1101_1111 => {
-                    return 1usize.saturating_sub(i);
-                }
-                0b1110_0000..=0b1110_1111 => {
-                    return 2usize.saturating_sub(i);
-                }
-                0b1111_0000..=0b1111_0111 => {
-                    return 3usize.saturating_sub(i);
-                }
-                0b1000_0000..=0b1011_1111 => {}
-                _ => return 0,
-            }
+    fn expected_leading_utf8_bytes(&self) -> usize { // byte length implied by the first byte of buf()
+        match self.buf()[0] { // NOTE(review): unchecked index — presumably callers guarantee buf() is non-empty; confirm
+            0b0000_0000..=0b0111_1111 => 1, // 0xxxxxxx: single-byte (ASCII) character
+            0b1100_0000..=0b1101_1111 => 2, // 110xxxxx: lead byte of a 2-byte sequence
+            0b1110_0000..=0b1110_1111 => 3, // 1110xxxx: lead byte of a 3-byte sequence
+            0b1111_0000..=0b1111_0111 => 4, // 11110xxx: lead byte of a 4-byte sequence
+            _ => 1, // 10xxxxxx continuation or 0xf8..=0xff: not a lead byte, so counted on its own
+        }
-        0
     }
 }
-- 
cgit v1.2.3