From 2fb5b204d83a3da79dc743fed25f9cad81367a13 Mon Sep 17 00:00:00 2001 From: Jesse Luehrs Date: Sat, 13 Mar 2021 14:15:39 -0500 Subject: fix a bunch of utf8 parsing issues --- src/blocking/input.rs | 27 +++++++++++------------- src/input.rs | 27 +++++++++++------------- src/private.rs | 58 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/src/blocking/input.rs b/src/blocking/input.rs index b133e7f..7a35f25 100644 --- a/src/blocking/input.rs +++ b/src/blocking/input.rs @@ -180,30 +180,27 @@ impl Input { } fn fill_buf(&mut self) -> Result { - if !self.buf_is_empty() { - return Ok(true); - } - - self.buf.resize(4096, 0); - self.pos = 0; - let bytes = read_stdin(&mut self.buf)?; - if bytes == 0 { - return Ok(false); + if self.buf_is_empty() { + self.buf.resize(4096, 0); + self.pos = 0; + let bytes = read_stdin(&mut self.buf)?; + if bytes == 0 { + return Ok(false); + } + self.buf.truncate(bytes); } - self.buf.truncate(bytes); if self.parse_utf8 { - let mut extra = self.find_truncated_utf8(); - if extra > 0 { + let expected_bytes = self.expected_leading_utf8_bytes(); + if self.buf.len() < self.pos + expected_bytes { let mut cur = self.buf.len(); - self.buf.resize(4096 + extra, 0); - while extra > 0 { + self.buf.resize(4096 + expected_bytes, 0); + while cur < self.pos + expected_bytes { let bytes = read_stdin(&mut self.buf[cur..])?; if bytes == 0 { return Ok(false); } cur += bytes; - extra = extra.saturating_sub(bytes); } self.buf.truncate(cur); } diff --git a/src/input.rs b/src/input.rs index e34187e..abcdd7f 100644 --- a/src/input.rs +++ b/src/input.rs @@ -192,24 +192,22 @@ impl Input { } async fn fill_buf(&mut self) -> Result { - if !self.buf_is_empty() { - return Ok(true); - } - - self.buf.resize(4096, 0); - self.pos = 0; - let bytes = read_stdin(&mut self.stdin, &mut self.buf).await?; - if bytes == 0 { - return Ok(false); + if self.buf_is_empty() { + self.buf.resize(4096, 0); + self.pos = 0; + let bytes = read_stdin(&mut self.stdin, &mut self.buf).await?; + if bytes == 0 { + return Ok(false); + } + self.buf.truncate(bytes); } - self.buf.truncate(bytes); if self.parse_utf8 { - let mut extra = self.find_truncated_utf8(); - if extra > 0 { + let expected_bytes = self.expected_leading_utf8_bytes(); + if self.buf.len() < self.pos + expected_bytes { let mut cur = self.buf.len(); - self.buf.resize(4096 + extra, 0); - while extra > 0 { + self.buf.resize(4096 + expected_bytes, 0); + while cur < self.pos + expected_bytes { let bytes = read_stdin(&mut self.stdin, &mut self.buf[cur..]) .await?; @@ -217,7 +215,6 @@ impl Input { return Ok(false); } cur += bytes; - extra = extra.saturating_sub(bytes); } self.buf.truncate(cur); } diff --git a/src/private.rs b/src/private.rs index 0b5d2f2..07dbb4c 100644 --- a/src/private.rs +++ b/src/private.rs @@ -39,13 +39,33 @@ pub trait Input { .buf() .iter() .copied() - .take_while(|&c| matches!(c, 32..=126 | 128..=255)) + .take_while(|&c| matches!(c, 32..=126 | 128..=247)) .collect(); if !prefix.is_empty() { - self.consume(prefix.len()); - match std::string::String::from_utf8(prefix) { - Ok(s) => return Ok(Some(crate::Key::String(s))), - Err(e) => return Ok(Some(crate::Key::Bytes(e.into_bytes()))), + match std::string::String::from_utf8_lossy(&prefix) { + std::borrow::Cow::Borrowed(s) => { + self.consume(s.len()); + return Ok(Some(crate::Key::String(s.to_string()))); + } + std::borrow::Cow::Owned(mut s) => { + for (i, window) in s.as_bytes().windows(3).enumerate() { + if window == [0xef, 0xbf, 0xbd] { + if i > 0 { + self.consume(i); + s.truncate(i); + return Ok(Some(crate::Key::String(s))); + } else { + // not quite correct, but figuring out how to + // take only the invalid utf8 seems hard (and + // this should come up very rarely) + self.consume(prefix.len()); + return Ok(Some(crate::Key::Bytes(prefix))); + } + } + } + self.consume(s.len()); + return Ok(Some(crate::Key::String(s))); + } } } @@ -67,7 +87,8 @@ pub trait Input { 28..=31 => true, 32..=126 => !self.should_parse_utf8(), 127 => !self.should_parse_special_keys(), - 128..=255 => !self.should_parse_utf8(), + 128..=247 => !self.should_parse_utf8(), + 248..=255 => true, }) .collect(); if !prefix.is_empty() { @@ -261,6 +282,7 @@ pub trait Input { if (0b1000_0000..=0b1011_1111).contains(&c) { c } else { + self.ungetc(c); fail!() } } @@ -317,23 +339,13 @@ pub trait Input { } } - fn find_truncated_utf8(&self) -> usize { - for i in 0..4 { - match self.buf()[self.buf().len() - 1 - i] { - 0b0000_0000..=0b0111_1111 => return 0, - 0b1100_0000..=0b1101_1111 => { - return 1usize.saturating_sub(i); - } - 0b1110_0000..=0b1110_1111 => { - return 2usize.saturating_sub(i); - } - 0b1111_0000..=0b1111_0111 => { - return 3usize.saturating_sub(i); - } - 0b1000_0000..=0b1011_1111 => {} - _ => return 0, - } + fn expected_leading_utf8_bytes(&self) -> usize { + match self.buf()[0] { + 0b0000_0000..=0b0111_1111 => 1, + 0b1100_0000..=0b1101_1111 => 2, + 0b1110_0000..=0b1110_1111 => 3, + 0b1111_0000..=0b1111_0111 => 4, + _ => 1, } - 0 } } -- cgit v1.2.3