diff options
author | chriseth <chris@ethereum.org> | 2017-06-27 20:38:03 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-06-27 20:38:03 +0800 |
commit | 36044c8c95890bfc25a199510e32a0481e8082d0 (patch) | |
tree | b39a53434983fe7a89051697872b646101d725fe | |
parent | bc31d4969ccdea8804f573bcf5104c154df9aff6 (diff) | |
parent | e715dd0b7e382b71abf50c974f943423048d138e (diff) | |
download | dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar.gz dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar.bz2 dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar.lz dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar.xz dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar.zst dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.zip |
Merge pull request #2413 from ethereum/utf8-strict-parser
Implement strict UTF-8 validation
-rw-r--r-- | Changelog.md | 2 | ||||
-rw-r--r-- | libdevcore/UTF8.cpp | 84 | ||||
-rw-r--r-- | test/libdevcore/UTF8.cpp | 216 |
3 files changed, 291 insertions, 11 deletions
diff --git a/Changelog.md b/Changelog.md index 6d9fe477..3d8701ca 100644 --- a/Changelog.md +++ b/Changelog.md @@ -14,6 +14,7 @@ Features: * Type Checker: Warn about copies in storage that might overwrite unexpectedly. * Code Generator: Added the Whiskers template system. * Remove obsolete Why3 output. + * Type Checker: Enforce strict UTF-8 validation. Bugfixes: * Code generator: Use ``REVERT`` instead of ``INVALID`` for generated input validation routines. @@ -22,6 +23,7 @@ Bugfixes: * Type Checker: Make UTF8-validation a bit more sloppy to include more valid sequences. * Type Checker: Disallow comparisons between mapping and non-internal function types. * Type Checker: Do not treat strings that look like addresses as addresses. + * Type Checker: Support valid, but incorrectly rejected UTF-8 sequences. * Fixed crash concerning non-callable types. * Unused variable warnings no longer issued for variables used inside inline assembly. * Code Generator: Fix ABI encoding of empty literal string. diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index 449ccc5d..2ae720ec 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -27,25 +27,74 @@ namespace dev { +namespace +{ -bool validateUTF8(std::string const& _input, size_t& _invalidPosition) +/// Validate byte sequence against Unicode chapter 3 Table 3-7. +bool isWellFormed(unsigned char byte1, unsigned char byte2) +{ + if (byte1 == 0xc0 || byte1 == 0xc1) + return false; + else if (byte1 >= 0xc2 && byte1 <= 0xdf) + return true; + else if (byte1 == 0xe0) + { + if (byte2 < 0xa0) + return false; + else + return true; + } + else if (byte1 >= 0xe1 && byte1 <= 0xec) + return true; + else if (byte1 == 0xed) + { + if (byte2 > 0x9f) + return false; + else + return true; + } + else if (byte1 == 0xee || byte1 == 0xef) + return true; + else if (byte1 == 0xf0) + { + if (byte2 < 0x90) + return false; + else + return true; + } + else if (byte1 >= 0xf1 && byte1 <= 0xf3) + return true; + else if (byte1 == 0xf4) + { + if (byte2 > 0x8f) + return false; + else + return true; + } + /// 0xf5 .. 0xf7 is disallowed + /// Technically anything below 0xc0 or above 0xf7 is + /// not possible to encode using Table 3-6 anyway. + return false; +} + +bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidPosition) { - const size_t length = _input.length(); bool valid = true; size_t i = 0; - for (; i < length; i++) + for (; i < _length; i++) { - if ((unsigned char)_input[i] < 0x80) + // Check for Unicode Chapter 3 Table 3-6 conformity. + if (_input[i] < 0x80) continue; size_t count = 0; - switch(_input[i] & 0xf0) { - case 0xc0: count = 1; break; - case 0xe0: count = 2; break; - case 0xf0: count = 3; break; - default: break; - } + if (_input[i] >= 0xc0 && _input[i] <= 0xdf) + count = 1; + else if (_input[i] >= 0xe0 && _input[i] <= 0xef) + count = 2; + else if (_input[i] >= 0xf0 && _input[i] <= 0xf7) + count = 3; if (count == 0) { @@ -53,7 +102,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) break; } - if ((i + count) >= length) + if ((i + count) >= _length) { valid = false; break; @@ -67,6 +116,13 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) valid = false; break; } + + // Check for Unicode Chapter 3 Table 3-7 conformity. + if ((j == 0) && !isWellFormed(_input[i - 1], _input[i])) + { + valid = false; + break; + } } } @@ -77,5 +133,11 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) return false; } +} + +bool validateUTF8(std::string const& _input, size_t& _invalidPosition) +{ + return validateUTF8(reinterpret_cast<unsigned char const*>(_input.c_str()), _input.length(), _invalidPosition); +} } diff --git a/test/libdevcore/UTF8.cpp b/test/libdevcore/UTF8.cpp new file mode 100644 index 00000000..719ada72 --- /dev/null +++ b/test/libdevcore/UTF8.cpp @@ -0,0 +1,216 @@ +/* + This file is part of solidity. + + solidity is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + solidity is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with solidity. If not, see <http://www.gnu.org/licenses/>. +*/ +/** + * Unit tests for UTF-8 validation. + */ + +#include <libdevcore/CommonData.h> +#include <libdevcore/UTF8.h> + +#include "../TestHelper.h" + +using namespace std; + +namespace dev +{ +namespace test +{ + +BOOST_AUTO_TEST_SUITE(UTF8) + +namespace { + +bool isValidUTF8(string const& _value) +{ + size_t pos; + return validateUTF8(asString(fromHex(_value)), pos); +} + +bool isInvalidUTF8(string const& _value, size_t _expectedPos) +{ + size_t pos; + if (validateUTF8(asString(fromHex(_value)), pos)) + return false; + if (pos != _expectedPos) + return false; + return true; +} + +} + +BOOST_AUTO_TEST_CASE(valid) +{ + BOOST_CHECK(isValidUTF8("00")); + BOOST_CHECK(isValidUTF8("20")); + BOOST_CHECK(isValidUTF8("7f")); + BOOST_CHECK(isValidUTF8("c281")); + BOOST_CHECK(isValidUTF8("df81")); + BOOST_CHECK(isValidUTF8("e0a081")); + BOOST_CHECK(isValidUTF8("e18081")); + BOOST_CHECK(isValidUTF8("ec8081")); + BOOST_CHECK(isValidUTF8("ed8081")); + BOOST_CHECK(isValidUTF8("ee8081")); + BOOST_CHECK(isValidUTF8("ef8081")); + BOOST_CHECK(isValidUTF8("f0908081")); + BOOST_CHECK(isValidUTF8("f3808081")); + BOOST_CHECK(isValidUTF8("f2808081")); + BOOST_CHECK(isValidUTF8("f3808081")); + BOOST_CHECK(isValidUTF8("f48e8081")); +} + +BOOST_AUTO_TEST_CASE(invalid) +{ + // anything between 0x80 and 0xc0 is disallowed + BOOST_CHECK(isInvalidUTF8("80", 0)); // invalid per table 3.6 + BOOST_CHECK(isInvalidUTF8("a0", 0)); // invalid per table 3.6 + BOOST_CHECK(isInvalidUTF8("c0", 0)); // invalid per table 3.7 + BOOST_CHECK(isInvalidUTF8("c1", 0)); // invalid per table 3.7 + BOOST_CHECK(isInvalidUTF8("c2", 0)); // too short (position is reported as the first byte) + BOOST_CHECK(isInvalidUTF8("e08081", 2)); // e0 must be followed by >= a0 + BOOST_CHECK(isInvalidUTF8("e180", 0)); // too short + BOOST_CHECK(isInvalidUTF8("ec80", 0)); // too short + BOOST_CHECK(isInvalidUTF8("f08f8001", 2)); // f0 must be followed by >= 90 + BOOST_CHECK(isInvalidUTF8("f18080", 0)); // too short + BOOST_CHECK(isInvalidUTF8("f4908081", 2)); // f4 must be followed by < 90 + // anything above 0xf7 is disallowed + BOOST_CHECK(isInvalidUTF8("f8", 0)); // invalid per table 3.7 + BOOST_CHECK(isInvalidUTF8("f9", 0)); // invalid per table 3.7 +} + +BOOST_AUTO_TEST_CASE(corpus) +{ + string source = R"( +κόσμε + +hélló + +Ā ā Ă ă Ą ą + +ƀ Ɓ Ƃ ƃ Ƅ ƅ + +ɐ ɑ ɒ ɓ ɔ ɕ + +ʰ ʱ ʲ ʳ ʴ ʵ + +̀ ́ ̂ ̃ ̄ ̅ + +ϩ Ϫ ϫ Ϭ ϭ Ϯ + +Ё Ђ Ѓ Є Ѕ І + +Ա Բ Գ Դ Ե Զ + + ק ר ש ת װ ױ + +ځ ڂ ڃ ڄ څ چ + +ऑ ऒ ओ औ क ख + +ও ঔ ক খ গ ঘ + +ਘ ਙ ਚ ਛ ਜ ਝ + +ઓ ઔ ક ખ ગ ઘ + +ଗ ଘ ଙ ଚ ଛ ଜ + +ஔ க ங ச ஜ ஞ + +ఎ ఏ ఐ ఒ ఓ ఔ + +ಓ ಔ ಕ ಖ ಗ ಘ + +ഐ ഒ ഓ ഔ ക + +ฒ ณ ด ต ถ ท + +ມ ຢ ຣ ລ ວ ສ + +༄ ༅ ༆ ༇ ༈ ༉ + +Ⴑ Ⴒ Ⴓ Ⴔ Ⴕ Ⴖ + +ᄌ ᄍ ᄎ ᄏ ᄐ + +Ḕ ḕ Ḗ ḗ Ḙ ḙ Ḛ + +ἐ ἑ ἒ ἓ ἔ ἕ + +₠ ₡ ₢ ₣ ₤ ₥ + +⃐ ⃑ ⃒ ⃓ ⃔ ⃕ ⃖ ⃗ ⃘ ⃙ ⃚ + +ℋ ℌ ℍ ℎ ℏ ℐ ℑ + +⅓ ⅔ ⅕ ⅖ ⅗ + +∬ ∭ ∮ ∯ ∰ + +⌖ ⌗ ⌘ ⌙ ⌚ ⌛ + +␀ ␁ ␂ ␃ ␄ ␅ + +⑀ ⑁ ⑂ ⑃ ⑄ + +① ② ③ ④ ⑤ + +╘ ╙ ╚ ╛ ╜ ╝ + +▁ ▂ ▃ ▄ ▅ ▆ + +▤ ▥ ▦ ▧ ▨ + +♔ ♕ ♖ ♗ ♘ ♙ + +✈ ✉ ✌ ✍ ✎ + +ぁ あ ぃ い ぅ + +ァ ア ィ イ ゥ + +ㄅ ㄆ ㄇ ㄈ ㄉ + +ㄱ ㄲ ㄳ ㄴ ㄵ + +㆚ ㆛ ㆜ ㆝ ㆞ + +㈀ ㈁ ㈂ ㈃ ㈄ + +㌀ ㌁ ㌂ ㌃ ㌄ + +乺 乻 乼 乽 乾 + +걺 걻 걼 걽 걾 + +豈 更 車 賈 滑 + +שּׁ שּׂ אַ אָ אּ + +ﮄ ﮅ ﮆ ﮇ ﮈ ﮉ + + ﺵ ﺶ ﺷ ﺸ + +「 」 、 ・ ヲ ァ ィ ゥ + )"; + size_t pos; + BOOST_CHECK(validateUTF8(source, pos)); +} + +BOOST_AUTO_TEST_SUITE_END() + +} +} |