From 2fb5b204d83a3da79dc743fed25f9cad81367a13 Mon Sep 17 00:00:00 2001 From: Jesse Luehrs Date: Sat, 13 Mar 2021 14:15:39 -0500 Subject: fix a bunch of utf8 parsing issues --- src/private.rs | 58 +++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'src/private.rs') diff --git a/src/private.rs b/src/private.rs index 0b5d2f2..07dbb4c 100644 --- a/src/private.rs +++ b/src/private.rs @@ -39,13 +39,33 @@ pub trait Input { .buf() .iter() .copied() - .take_while(|&c| matches!(c, 32..=126 | 128..=255)) + .take_while(|&c| matches!(c, 32..=126 | 128..=247)) .collect(); if !prefix.is_empty() { - self.consume(prefix.len()); - match std::string::String::from_utf8(prefix) { - Ok(s) => return Ok(Some(crate::Key::String(s))), - Err(e) => return Ok(Some(crate::Key::Bytes(e.into_bytes()))), + match std::string::String::from_utf8_lossy(&prefix) { + std::borrow::Cow::Borrowed(s) => { + self.consume(s.len()); + return Ok(Some(crate::Key::String(s.to_string()))); + } + std::borrow::Cow::Owned(mut s) => { + for (i, window) in s.as_bytes().windows(3).enumerate() { + if window == [0xef, 0xbf, 0xbd] { + if i > 0 { + self.consume(i); + s.truncate(i); + return Ok(Some(crate::Key::String(s))); + } else { + // not quite correct, but figuring out how to + // take only the invalid utf8 seems hard (and + // this should come up very rarely) + self.consume(prefix.len()); + return Ok(Some(crate::Key::Bytes(prefix))); + } + } + } + self.consume(s.len()); + return Ok(Some(crate::Key::String(s))); + } } } @@ -67,7 +87,8 @@ pub trait Input { 28..=31 => true, 32..=126 => !self.should_parse_utf8(), 127 => !self.should_parse_special_keys(), - 128..=255 => !self.should_parse_utf8(), + 128..=247 => !self.should_parse_utf8(), + 248..=255 => true, }) .collect(); if !prefix.is_empty() { @@ -261,6 +282,7 @@ pub trait Input { if (0b1000_0000..=0b1011_1111).contains(&c) { c } else { + self.ungetc(c); fail!() } } @@ -317,23 +339,13 @@ pub trait Input { } } - fn find_truncated_utf8(&self) -> usize { - for i in 0..4 { - match self.buf()[self.buf().len() - 1 - i] { - 0b0000_0000..=0b0111_1111 => return 0, - 0b1100_0000..=0b1101_1111 => { - return 1usize.saturating_sub(i); - } - 0b1110_0000..=0b1110_1111 => { - return 2usize.saturating_sub(i); - } - 0b1111_0000..=0b1111_0111 => { - return 3usize.saturating_sub(i); - } - 0b1000_0000..=0b1011_1111 => {} - _ => return 0, - } + fn expected_leading_utf8_bytes(&self) -> usize { + match self.buf()[0] { + 0b0000_0000..=0b0111_1111 => 1, + 0b1100_0000..=0b1101_1111 => 2, + 0b1110_0000..=0b1110_1111 => 3, + 0b1111_0000..=0b1111_0111 => 4, + _ => 1, } - 0 } } -- cgit v1.2.3-54-g00ecf