diff --git a/Cargo.lock b/Cargo.lock index 714bd4a..c36cf63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -62,6 +62,11 @@ name = "cfg-if" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cloudabi" version = "0.0.3" @@ -188,6 +193,14 @@ name = "encoding_index_tests" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "encoding_rs" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "env_logger" version = "0.3.5" @@ -742,6 +755,7 @@ dependencies = [ "bytes 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam-channel 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", "encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.8.26 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.29 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1106,6 +1120,7 @@ dependencies = [ "checksum bytes 0.4.12 (registry+https://github.com/rust-lang/crates.io-index)" = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" "checksum bytes 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ad1f8e949d755f9d79112b5bb46938e0ef9d3804a0b16dfab13aafcaa5f0fa72" "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +"checksum cfg-if 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum crossbeam-channel 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "c8ec7fcd21571dc78f96cc96243cab8d8f035247c3efd16c687be154c3fa9efa" "checksum crossbeam-deque 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" @@ -1120,6 +1135,7 @@ dependencies = [ "checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" "checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" "checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" +"checksum encoding_rs 0.8.26 (registry+https://github.com/rust-lang/crates.io-index)" = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum fnv 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" diff --git a/Cargo.toml b/Cargo.toml index 9c91c29..9d851c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ byteorder = "^0.5.1" bytes = "^1.0" crossbeam-channel = "^0.3" encoding = "^0.2" +encoding_rs = "^0.8" env_logger = "^0.3.2" futures = "^0.1" log = "^0.3.5" diff --git a/src/proto/value_codec.rs b/src/proto/value_codec.rs index bbd6795..e93a69b 100644 --- a/src/proto/value_codec.rs +++ b/src/proto/value_codec.rs @@ -19,8 +19,7 @@ use std::io; use std::net; use bytes::{BufMut, BytesMut}; -use encoding::all::WINDOWS_1252; -use encoding::{DecoderTrap, EncoderTrap, Encoding}; +use encoding_rs::WINDOWS_1252; use std::convert::{TryFrom, TryInto}; use thiserror::Error; @@ -225,11 +224,11 @@ impl<'a> ValueDecoder<'a> { let position = self.position; let bytes = self.consume(length)?; - let result = WINDOWS_1252.decode(bytes, DecoderTrap::Strict); + let result = WINDOWS_1252.decode_without_bom_handling_and_without_replacement(bytes); match result { - Ok(string) => Ok(string), - Err(error) => Err(ValueDecodeError::InvalidString { - cause: error.to_string(), + Some(string) => Ok(string.into_owned()), + None => Err(ValueDecodeError::InvalidString { + cause: "malformed sequence in Windows-1252-encoded string".to_string(), position: position, }), } @@ -361,12 +360,13 @@ impl<'a> ValueEncoder<'a> { // Reserve space for the length prefix. let mut prefixer = Prefixer::new(self.buffer); - // Encode the string. We are quite certain this cannot fail because we - // use EncoderTrap::Replace to replace any unencodable characters with - // '?' which is always encodable in Windows-1252. - // - // TODO: Switch to the encoding_rs crate. - let bytes = WINDOWS_1252.encode(val, EncoderTrap::Replace).unwrap(); + // Encode the string. This cannot fail because any unmappable characters + // are replaced. + let (bytes, encoding, _did_replace) = WINDOWS_1252.encode(val); + + // Encodings in full generality can have a different "output encoding" + // but that is not the case of Windows-1252. + assert_eq!(encoding, WINDOWS_1252); prefixer.suffix_mut().extend_from_slice(&bytes); @@ -747,11 +747,18 @@ pub mod tests { } // A few strings and their corresponding encodings. - const STRING_ENCODINGS: [(&'static str, &'static [u8]); 3] = [ + const STRING_ENCODINGS: [(&'static str, &'static [u8]); 4] = [ ("", &[0, 0, 0, 0]), ("hey!", &[4, 0, 0, 0, 104, 101, 121, 33]), // Windows 1252 specific codepoints. ("‘’“”€", &[5, 0, 0, 0, 145, 146, 147, 148, 128]), + // Undefined codepoints. They are not decoded to representable + // characters, but they do not generate errors either. In particular, + // they survive round-trips through the codec. + ( + "\u{81}\u{8D}\u{8F}\u{90}\u{9D}", + &[5, 0, 0, 0, 0x81, 0x8D, 0x8F, 0x90, 0x9D], + ), ]; #[test] @@ -774,14 +781,18 @@ pub mod tests { fn encode_string_with_unencodable_characters() { let mut bytes = BytesMut::new(); - ValueEncoder::new(&mut bytes) - .encode_string("忠犬ハチ公") - .unwrap(); + ValueEncoder::new(&mut bytes).encode_string("你好").unwrap(); - // Characters not in the Windows 1252 codepage are rendered as '?'. - assert_eq!(bytes, vec![5, 0, 0, 0, 63, 63, 63, 63, 63]); + // Characters not in the Windows 1252 codepage are replaced with their + // decimal representations. Thus the output is longer than the input. + assert_eq!(bytes[0..4], [16, 0, 0, 0]); + assert_eq!(&bytes[4..], b"你好"); - assert_eq!(ValueDecoder::new(&bytes).decode_string().unwrap(), "?????"); + // The replaced characters are not decoded back to their original values. + assert_eq!( + ValueDecoder::new(&bytes).decode_string().unwrap(), + "你好" + ); } #[test]