Use encoding crate in value_codec.rs.

5 years ago · a98672f282
--- a/Cargo.lock
+++ b/Cargo.lock
@ -62,6 +62,11 @@ name = "cfg-if"
 version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "cloudabi"
 version = "0.0.3"
@ -188,6 +193,14 @@ name = "encoding_index_tests"
 version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "encoding_rs"
 version = "0.8.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "cfg-if 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "env_logger"
 version = "0.3.5"
@ -742,6 +755,7 @@ dependencies = [
 "bytes 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "crossbeam-channel 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
 "encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)",
 "encoding_rs 0.8.26 (registry+https://github.com/rust-lang/crates.io-index)",
 "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
 "futures 0.1.29 (registry+https://github.com/rust-lang/crates.io-index)",
 "log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1106,6 +1120,7 @@ dependencies = [
 "checksum bytes 0.4.12 (registry+https://github.com/rust-lang/crates.io-index)" = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c"
 "checksum bytes 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ad1f8e949d755f9d79112b5bb46938e0ef9d3804a0b16dfab13aafcaa5f0fa72"
 "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
 "checksum cfg-if 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
 "checksum crossbeam-channel 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "c8ec7fcd21571dc78f96cc96243cab8d8f035247c3efd16c687be154c3fa9efa"
 "checksum crossbeam-deque 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285"
@ -1120,6 +1135,7 @@ dependencies = [
 "checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
 "checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
 "checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
 "checksum encoding_rs 0.8.26 (registry+https://github.com/rust-lang/crates.io-index)" = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283"
 "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
 "checksum fnv 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -9,6 +9,7 @@ byteorder = "^0.5.1"
 bytes = "^1.0"
 crossbeam-channel = "^0.3"
 encoding = "^0.2"
 encoding_rs = "^0.8"
 env_logger = "^0.3.2"
 futures = "^0.1"
 log = "^0.3.5"
--- a/src/proto/value_codec.rs
+++ b/src/proto/value_codec.rs
@ -19,8 +19,7 @@ use std::io;
 use std::net;
 use bytes::{BufMut, BytesMut};
 use encoding::all::WINDOWS_1252;
 use encoding::{DecoderTrap, EncoderTrap, Encoding};
 use encoding_rs::WINDOWS_1252;
 use std::convert::{TryFrom, TryInto};
 use thiserror::Error;
@ -225,11 +224,11 @@ impl<'a> ValueDecoder<'a> {
        let position = self.position;
        let bytes = self.consume(length)?;
        let result = WINDOWS_1252.decode(bytes, DecoderTrap::Strict);
        let result = WINDOWS_1252.decode_without_bom_handling_and_without_replacement(bytes);
        match result {
            Ok(string) => Ok(string),
            Err(error) => Err(ValueDecodeError::InvalidString {
                cause: error.to_string(),
            Some(string) => Ok(string.into_owned()),
            None => Err(ValueDecodeError::InvalidString {
                cause: "malformed sequence in Windows-1252-encoded string".to_string(),
                position: position,
            }),
        }
@ -361,12 +360,13 @@ impl<'a> ValueEncoder<'a> {
        // Reserve space for the length prefix.
        let mut prefixer = Prefixer::new(self.buffer);
        // Encode the string. We are quite certain this cannot fail because we
        // use EncoderTrap::Replace to replace any unencodable characters with
        // '?' which is always encodable in Windows-1252.
        //
        // TODO: Switch to the encoding_rs crate.
        let bytes = WINDOWS_1252.encode(val, EncoderTrap::Replace).unwrap();
        // Encode the string. This cannot fail because any unmappable characters
        // are replaced.
        let (bytes, encoding, _did_replace) = WINDOWS_1252.encode(val);
        // Encodings in full generality can have a different "output encoding"
        // but that is not the case of Windows-1252.
        assert_eq!(encoding, WINDOWS_1252);
        prefixer.suffix_mut().extend_from_slice(&bytes);
@ -747,11 +747,18 @@ pub mod tests {
    }
    // A few strings and their corresponding encodings.
    const STRING_ENCODINGS: [(&'static str, &'static [u8]); 3] = [
    const STRING_ENCODINGS: [(&'static str, &'static [u8]); 4] = [
        ("", &[0, 0, 0, 0]),
        ("hey!", &[4, 0, 0, 0, 104, 101, 121, 33]),
        // Windows 1252 specific codepoints.
        ("‘’“”€", &[5, 0, 0, 0, 145, 146, 147, 148, 128]),
        // Undefined codepoints. They are not decoded to representable
        // characters, but they do not generate errors either. In particular,
        // they survive round-trips through the codec.
        (
            "\u{81}\u{8D}\u{8F}\u{90}\u{9D}",
            &[5, 0, 0, 0, 0x81, 0x8D, 0x8F, 0x90, 0x9D],
        ),
    ];
    #[test]
@ -774,14 +781,18 @@ pub mod tests {
    fn encode_string_with_unencodable_characters() {
        let mut bytes = BytesMut::new();
        ValueEncoder::new(&mut bytes)
            .encode_string("忠犬ハチ公")
            .unwrap();
        ValueEncoder::new(&mut bytes).encode_string("你好").unwrap();
        // Characters not in the Windows 1252 codepage are rendered as '?'.
        assert_eq!(bytes, vec![5, 0, 0, 0, 63, 63, 63, 63, 63]);
        // Characters not in the Windows 1252 codepage are replaced with their
        // decimal representations. Thus the output is longer than the input.
        assert_eq!(bytes[0..4], [16, 0, 0, 0]);
        assert_eq!(&bytes[4..], b"&#20320;&#22909;");
        assert_eq!(ValueDecoder::new(&bytes).decode_string().unwrap(), "?????");
        // The replaced characters are not decoded back to their original values.
        assert_eq!(
            ValueDecoder::new(&bytes).decode_string().unwrap(),
            "&#20320;&#22909;"
        );
    }
    #[test]