aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jesse Luehrs <doy@tozt.net> 2021-03-13 14:15:39 -0500
committer Jesse Luehrs <doy@tozt.net> 2021-03-13 14:15:39 -0500
commit 2fb5b204d83a3da79dc743fed25f9cad81367a13 (patch)
tree d4dfd5a87568ed57b2987cd2da2c28ce8ce3ef25
parent 88cce3303b5b0e0c69604c6d6d9ac603f083a540 (diff)
download textmode-2fb5b204d83a3da79dc743fed25f9cad81367a13.tar.gz
textmode-2fb5b204d83a3da79dc743fed25f9cad81367a13.zip
fix a bunch of utf8 parsing issues
-rw-r--r-- src/blocking/input.rs 27
-rw-r--r-- src/input.rs 27
-rw-r--r-- src/private.rs 58
3 files changed, 59 insertions, 53 deletions
diff --git a/src/blocking/input.rs b/src/blocking/input.rs
index b133e7f..7a35f25 100644
--- a/src/blocking/input.rs
+++ b/src/blocking/input.rs
@@ -180,30 +180,27 @@ impl Input {
}
fn fill_buf(&mut self) -> Result<bool> {
- if !self.buf_is_empty() {
- return Ok(true);
- }
-
- self.buf.resize(4096, 0);
- self.pos = 0;
- let bytes = read_stdin(&mut self.buf)?;
- if bytes == 0 {
- return Ok(false);
+ if self.buf_is_empty() {
+ self.buf.resize(4096, 0);
+ self.pos = 0;
+ let bytes = read_stdin(&mut self.buf)?;
+ if bytes == 0 {
+ return Ok(false);
+ }
+ self.buf.truncate(bytes);
}
- self.buf.truncate(bytes);
if self.parse_utf8 {
- let mut extra = self.find_truncated_utf8();
- if extra > 0 {
+ let expected_bytes = self.expected_leading_utf8_bytes();
+ if self.buf.len() < self.pos + expected_bytes {
let mut cur = self.buf.len();
- self.buf.resize(4096 + extra, 0);
- while extra > 0 {
+ self.buf.resize(4096 + expected_bytes, 0);
+ while cur < self.pos + expected_bytes {
let bytes = read_stdin(&mut self.buf[cur..])?;
if bytes == 0 {
return Ok(false);
}
cur += bytes;
- extra = extra.saturating_sub(bytes);
}
self.buf.truncate(cur);
}
diff --git a/src/input.rs b/src/input.rs
index e34187e..abcdd7f 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -192,24 +192,22 @@ impl Input {
}
async fn fill_buf(&mut self) -> Result<bool> {
- if !self.buf_is_empty() {
- return Ok(true);
- }
-
- self.buf.resize(4096, 0);
- self.pos = 0;
- let bytes = read_stdin(&mut self.stdin, &mut self.buf).await?;
- if bytes == 0 {
- return Ok(false);
+ if self.buf_is_empty() {
+ self.buf.resize(4096, 0);
+ self.pos = 0;
+ let bytes = read_stdin(&mut self.stdin, &mut self.buf).await?;
+ if bytes == 0 {
+ return Ok(false);
+ }
+ self.buf.truncate(bytes);
}
- self.buf.truncate(bytes);
if self.parse_utf8 {
- let mut extra = self.find_truncated_utf8();
- if extra > 0 {
+ let expected_bytes = self.expected_leading_utf8_bytes();
+ if self.buf.len() < self.pos + expected_bytes {
let mut cur = self.buf.len();
- self.buf.resize(4096 + extra, 0);
- while extra > 0 {
+ self.buf.resize(4096 + expected_bytes, 0);
+ while cur < self.pos + expected_bytes {
let bytes =
read_stdin(&mut self.stdin, &mut self.buf[cur..])
.await?;
@@ -217,7 +215,6 @@ impl Input {
return Ok(false);
}
cur += bytes;
- extra = extra.saturating_sub(bytes);
}
self.buf.truncate(cur);
}
diff --git a/src/private.rs b/src/private.rs
index 0b5d2f2..07dbb4c 100644
--- a/src/private.rs
+++ b/src/private.rs
@@ -39,13 +39,33 @@ pub trait Input {
.buf()
.iter()
.copied()
- .take_while(|&c| matches!(c, 32..=126 | 128..=255))
+ .take_while(|&c| matches!(c, 32..=126 | 128..=247))
.collect();
if !prefix.is_empty() {
- self.consume(prefix.len());
- match std::string::String::from_utf8(prefix) {
- Ok(s) => return Ok(Some(crate::Key::String(s))),
- Err(e) => return Ok(Some(crate::Key::Bytes(e.into_bytes()))),
+ match std::string::String::from_utf8_lossy(&prefix) {
+ std::borrow::Cow::Borrowed(s) => {
+ self.consume(s.len());
+ return Ok(Some(crate::Key::String(s.to_string())));
+ }
+ std::borrow::Cow::Owned(mut s) => {
+ for (i, window) in s.as_bytes().windows(3).enumerate() {
+ if window == [0xef, 0xbf, 0xbd] {
+ if i > 0 {
+ self.consume(i);
+ s.truncate(i);
+ return Ok(Some(crate::Key::String(s)));
+ } else {
+ // not quite correct, but figuring out how to
+ // take only the invalid utf8 seems hard (and
+ // this should come up very rarely)
+ self.consume(prefix.len());
+ return Ok(Some(crate::Key::Bytes(prefix)));
+ }
+ }
+ }
+ self.consume(s.len());
+ return Ok(Some(crate::Key::String(s)));
+ }
}
}
@@ -67,7 +87,8 @@ pub trait Input {
28..=31 => true,
32..=126 => !self.should_parse_utf8(),
127 => !self.should_parse_special_keys(),
- 128..=255 => !self.should_parse_utf8(),
+ 128..=247 => !self.should_parse_utf8(),
+ 248..=255 => true,
})
.collect();
if !prefix.is_empty() {
@@ -261,6 +282,7 @@ pub trait Input {
if (0b1000_0000..=0b1011_1111).contains(&c) {
c
} else {
+ self.ungetc(c);
fail!()
}
}
@@ -317,23 +339,13 @@ pub trait Input {
}
}
- fn find_truncated_utf8(&self) -> usize {
- for i in 0..4 {
- match self.buf()[self.buf().len() - 1 - i] {
- 0b0000_0000..=0b0111_1111 => return 0,
- 0b1100_0000..=0b1101_1111 => {
- return 1usize.saturating_sub(i);
- }
- 0b1110_0000..=0b1110_1111 => {
- return 2usize.saturating_sub(i);
- }
- 0b1111_0000..=0b1111_0111 => {
- return 3usize.saturating_sub(i);
- }
- 0b1000_0000..=0b1011_1111 => {}
- _ => return 0,
- }
+ fn expected_leading_utf8_bytes(&self) -> usize {
+ match self.buf()[0] {
+ 0b0000_0000..=0b0111_1111 => 1,
+ 0b1100_0000..=0b1101_1111 => 2,
+ 0b1110_0000..=0b1110_1111 => 3,
+ 0b1111_0000..=0b1111_0111 => 4,
+ _ => 1,
}
- 0
}
}