From 569e0c53f276eccbd9452988910aa8f3b4bcf13f Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Fri, 16 Jun 2017 17:13:18 +0100 Subject: Implement strict UTF-8 validation --- Changelog.md | 3 ++- libdevcore/UTF8.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/Changelog.md b/Changelog.md index cfedf1fc..5f4ec10b 100644 --- a/Changelog.md +++ b/Changelog.md @@ -13,13 +13,14 @@ Features: * Inline Assembly: function definitions and function calls. * Code Generator: Added the Whiskers template system. * Remove obsolete Why3 output. + * Type Checker: Enforce strict UTF-8 validation. Bugfixes: * Code generator: Use ``REVERT`` instead of ``INVALID`` for generated input validation routines. * Type Checker: Fix address literals not being treated as compile-time constants. * Type Checker: Disallow invoking the same modifier multiple times. - * Type Checker: Make UTF8-validation a bit more sloppy to include more valid sequences. * Type Checker: Do not treat strings that look like addresses as addresses. + * Type Checker: Support valid, but incorrectly rejected UTF-8 sequences. * Fixed crash concerning non-callable types. * Unused variable warnings no longer issued for variables used inside inline assembly. * Code Generator: Fix ABI encoding of empty literal string. diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index 449ccc5d..793bc080 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -27,6 +27,50 @@ namespace dev { +namespace +{ + +/// Validate byte sequence against Unicode chapter 3 Table 3-7. +bool isWellFormed(unsigned char byte1, unsigned char byte2) +{ + switch (byte1) + { + case 0xc0 ... 0xc1: + return false; + case 0xc2 ... 0xdf: + break; + case 0xe0: + if (byte2 < 0xa0) + return false; + break; + case 0xe1 ... 0xec: + break; + case 0xed: + if (byte2 > 0x9f) + return false; + break; + case 0xee ... 
0xef: + break; + case 0xf0: + if (byte2 < 0x90) + return false; + break; + case 0xf1 ... 0xf3: + break; + case 0xf4: + if (byte2 > 0x8f) + return false; + break; + case 0xf5 ... 0xf7: + default: + /// Technically anything below 0xc0 or above 0xf7 is + /// not possible to encode using Table 3-6 anyway. + return false; + } + return true; +} + +} bool validateUTF8(std::string const& _input, size_t& _invalidPosition) { @@ -36,6 +80,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) for (; i < length; i++) { + // Check for Unicode Chapter 3 Table 3-6 conformity. if ((unsigned char)_input[i] < 0x80) continue; @@ -67,6 +112,13 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) valid = false; break; } + + // Check for Unicode Chapter 3 Table 3-7 conformity. + if ((j == 0) && !isWellFormed(_input[i - 1], _input[i])) + { + valid = false; + break; + } } } @@ -77,5 +129,4 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) return false; } - } -- cgit v1.2.3 From 6488f7e0795a77e4175361e3cb7270b47168a22a Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Fri, 16 Jun 2017 17:23:11 +0100 Subject: Validate first byte properly for UTF8 --- libdevcore/UTF8.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index 793bc080..ad62c8b0 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -85,11 +85,19 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) continue; size_t count = 0; - switch(_input[i] & 0xf0) { - case 0xc0: count = 1; break; - case 0xe0: count = 2; break; - case 0xf0: count = 3; break; - default: break; + switch ((unsigned char)_input[i]) + { + case 0xc0 ... 0xdf: + count = 1; + break; + case 0xe0 ... 0xef: + count = 2; + break; + case 0xf0 ... 
0xf7: + count = 3; + break; + default: + break; } if (count == 0) -- cgit v1.2.3 From c45dbab00cbe3c8af61695a1fc095612e089358a Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 22 Jun 2017 11:24:15 +0100 Subject: Rewrite validateUTF8 to use char --- libdevcore/UTF8.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index ad62c8b0..affe64b2 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -70,18 +70,15 @@ bool isWellFormed(unsigned char byte1, unsigned char byte2) return true; } -} - -bool validateUTF8(std::string const& _input, size_t& _invalidPosition) +bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidPosition) { - const size_t length = _input.length(); bool valid = true; size_t i = 0; - for (; i < length; i++) + for (; i < _length; i++) { // Check for Unicode Chapter 3 Table 3-6 conformity. - if ((unsigned char)_input[i] < 0x80) + if (_input[i] < 0x80) continue; size_t count = 0; @@ -106,7 +103,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) break; } - if ((i + count) >= length) + if ((i + count) >= _length) { valid = false; break; @@ -138,3 +135,10 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) } } + +bool validateUTF8(std::string const& _input, size_t& _invalidPosition) +{ + return validateUTF8(reinterpret_cast<unsigned char const*>(_input.c_str()), _input.length(), _invalidPosition); +} + +} -- cgit v1.2.3 From aea5f90ad38bb69e7ef4346c036815aaa16f0ab3 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 22 Jun 2017 13:37:38 +0100 Subject: Rewrite validateUTF8 using if/else --- libdevcore/UTF8.cpp | 77 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index affe64b2..2ae720ec 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -33,41 +33,48 @@ namespace /// Validate byte 
sequence against Unicode chapter 3 Table 3-7. bool isWellFormed(unsigned char byte1, unsigned char byte2) { - switch (byte1) - { - case 0xc0 ... 0xc1: + if (byte1 == 0xc0 || byte1 == 0xc1) return false; - case 0xc2 ... 0xdf: - break; - case 0xe0: + else if (byte1 >= 0xc2 && byte1 <= 0xdf) + return true; + else if (byte1 == 0xe0) + { if (byte2 < 0xa0) return false; - break; - case 0xe1 ... 0xec: - break; - case 0xed: + else + return true; + } + else if (byte1 >= 0xe1 && byte1 <= 0xec) + return true; + else if (byte1 == 0xed) + { if (byte2 > 0x9f) return false; - break; - case 0xee ... 0xef: - break; - case 0xf0: + else + return true; + } + else if (byte1 == 0xee || byte1 == 0xef) + return true; + else if (byte1 == 0xf0) + { if (byte2 < 0x90) return false; - break; - case 0xf1 ... 0xf3: - break; - case 0xf4: + else + return true; + } + else if (byte1 >= 0xf1 && byte1 <= 0xf3) + return true; + else if (byte1 == 0xf4) + { if (byte2 > 0x8f) return false; - break; - case 0xf5 ... 0xf7: - default: - /// Technically anything below 0xc0 or above 0xf7 is - /// not possible to encode using Table 3-6 anyway. - return false; + else + return true; } - return true; + /// 0xf5 .. 0xf7 is disallowed + /// Technically anything below 0xc0 or above 0xf7 is + /// not possible to encode using Table 3-6 anyway. + return false; } bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidPosition) @@ -82,20 +89,12 @@ bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidP continue; size_t count = 0; - switch ((unsigned char)_input[i]) - { - case 0xc0 ... 0xdf: - count = 1; - break; - case 0xe0 ... 0xef: - count = 2; - break; - case 0xf0 ... 
0xf7: - count = 3; - break; - default: - break; - } + if (_input[i] >= 0xc0 && _input[i] <= 0xdf) + count = 1; + else if (_input[i] >= 0xe0 && _input[i] <= 0xef) + count = 2; + else if (_input[i] >= 0xf0 && _input[i] <= 0xf7) + count = 3; if (count == 0) { -- cgit v1.2.3 From 551e19e88436591502c9f423400fe4e58a689032 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Mon, 26 Jun 2017 14:28:24 +0100 Subject: Add comprehensive test for UTF-8 validation --- test/libdevcore/UTF8.cpp | 103 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 test/libdevcore/UTF8.cpp diff --git a/test/libdevcore/UTF8.cpp b/test/libdevcore/UTF8.cpp new file mode 100644 index 00000000..4b7a4372 --- /dev/null +++ b/test/libdevcore/UTF8.cpp @@ -0,0 +1,103 @@ +/* + This file is part of solidity. + + solidity is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + solidity is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with solidity. If not, see <http://www.gnu.org/licenses/>. +*/ +/** + * Unit tests for UTF-8 validation. 
+ */ + +#include <libdevcore/CommonData.h> +#include <libdevcore/UTF8.h> + +#include "../TestHelper.h" + +using namespace std; + +namespace dev +{ +namespace test +{ + +BOOST_AUTO_TEST_SUITE(UTF8) + +namespace { + +bool isValidUTF8(string const& _value) +{ + size_t pos; + return validateUTF8(asString(fromHex(_value)), pos); +} + +bool isInvalidUTF8(string const& _value, size_t _expectedPos) +{ + size_t pos; + if (validateUTF8(asString(fromHex(_value)), pos)) + return false; + if (pos != _expectedPos) + return false; + return true; +} + +} + +BOOST_AUTO_TEST_CASE(valid) +{ + BOOST_CHECK(isValidUTF8("00")); + BOOST_CHECK(isValidUTF8("20")); + BOOST_CHECK(isValidUTF8("7f")); + BOOST_CHECK(isValidUTF8("c281")); + BOOST_CHECK(isValidUTF8("df81")); + BOOST_CHECK(isValidUTF8("e0a081")); + BOOST_CHECK(isValidUTF8("e18081")); + BOOST_CHECK(isValidUTF8("ec8081")); + BOOST_CHECK(isValidUTF8("ed8081")); + BOOST_CHECK(isValidUTF8("ee8081")); + BOOST_CHECK(isValidUTF8("ef8081")); + BOOST_CHECK(isValidUTF8("f0908081")); + BOOST_CHECK(isValidUTF8("f3808081")); + BOOST_CHECK(isValidUTF8("f2808081")); + BOOST_CHECK(isValidUTF8("f3808081")); + BOOST_CHECK(isValidUTF8("f48e8081")); +} + +BOOST_AUTO_TEST_CASE(invalid) +{ + // anything between 0x80 and 0xc0 is disallowed + BOOST_CHECK(isInvalidUTF8("80", 0)); // invalid per table 3.6 + BOOST_CHECK(isInvalidUTF8("a0", 0)); // invalid per table 3.6 + BOOST_CHECK(isInvalidUTF8("c0", 0)); // invalid per table 3.7 + BOOST_CHECK(isInvalidUTF8("c1", 0)); // invalid per table 3.7 + BOOST_CHECK(isInvalidUTF8("c2", 0)); // too short (position is reported as the first byte) + BOOST_CHECK(isInvalidUTF8("e08081", 2)); // e0 must be followed by >= a0 + BOOST_CHECK(isInvalidUTF8("e180", 0)); // too short + BOOST_CHECK(isInvalidUTF8("ec80", 0)); // too short + BOOST_CHECK(isInvalidUTF8("f08f8001", 2)); // f0 must be followed by >= 90 + BOOST_CHECK(isInvalidUTF8("f18080", 0)); // too short + BOOST_CHECK(isInvalidUTF8("f4908081", 2)); // f4 must be followed by < 90 + // anything above 0xf7 is 
disallowed + BOOST_CHECK(isInvalidUTF8("f8", 0)); // invalid per table 3.7 + BOOST_CHECK(isInvalidUTF8("f9", 0)); // invalid per table 3.7 +} + +BOOST_AUTO_TEST_CASE(corpus) +{ + string source = R"( + )"; +} + +BOOST_AUTO_TEST_SUITE_END() + +} +} -- cgit v1.2.3 From ec15df2aa76c4df532126ec34761b268a1e78b2d Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Mon, 26 Jun 2017 14:29:00 +0100 Subject: Add a text corpus to the UTF-8 tests --- test/libdevcore/UTF8.cpp | 113 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/test/libdevcore/UTF8.cpp b/test/libdevcore/UTF8.cpp index 4b7a4372..719ada72 100644 --- a/test/libdevcore/UTF8.cpp +++ b/test/libdevcore/UTF8.cpp @@ -94,7 +94,120 @@ BOOST_AUTO_TEST_CASE(invalid) BOOST_AUTO_TEST_CASE(corpus) { string source = R"( +κόσμε + +hélló + +Ā ā Ă ă Ą ą + +ƀ Ɓ Ƃ ƃ Ƅ ƅ + +ɐ ɑ ɒ ɓ ɔ ɕ + +ʰ ʱ ʲ ʳ ʴ ʵ + +̀ ́ ̂ ̃ ̄ ̅ + +ϩ Ϫ ϫ Ϭ ϭ Ϯ + +Ё Ђ Ѓ Є Ѕ І + +Ա Բ Գ Դ Ե Զ + + ק ר ש ת װ ױ + +ځ ڂ ڃ ڄ څ چ + +ऑ ऒ ओ औ क ख + +ও ঔ ক খ গ ঘ + +ਘ ਙ ਚ ਛ ਜ ਝ + +ઓ ઔ ક ખ ગ ઘ + +ଗ ଘ ଙ ଚ ଛ ଜ + +ஔ க ங ச ஜ ஞ + +ఎ ఏ ఐ ఒ ఓ ఔ + +ಓ ಔ ಕ ಖ ಗ ಘ + +ഐ ഒ ഓ ഔ ക + +ฒ ณ ด ต ถ ท + +ມ ຢ ຣ ລ ວ ສ + +༄ ༅ ༆ ༇ ༈ ༉ + +Ⴑ Ⴒ Ⴓ Ⴔ Ⴕ Ⴖ + +ᄌ ᄍ ᄎ ᄏ ᄐ + +Ḕ ḕ Ḗ ḗ Ḙ ḙ Ḛ + +ἐ ἑ ἒ ἓ ἔ ἕ + +₠ ₡ ₢ ₣ ₤ ₥ + +⃐ ⃑ ⃒ ⃓ ⃔ ⃕ ⃖ ⃗ ⃘ ⃙ ⃚ + +ℋ ℌ ℍ ℎ ℏ ℐ ℑ + +⅓ ⅔ ⅕ ⅖ ⅗ + +∬ ∭ ∮ ∯ ∰ + +⌖ ⌗ ⌘ ⌙ ⌚ ⌛ + +␀ ␁ ␂ ␃ ␄ ␅ + +⑀ ⑁ ⑂ ⑃ ⑄ + +① ② ③ ④ ⑤ + +╘ ╙ ╚ ╛ ╜ ╝ + +▁ ▂ ▃ ▄ ▅ ▆ + +▤ ▥ ▦ ▧ ▨ + +♔ ♕ ♖ ♗ ♘ ♙ + +✈ ✉ ✌ ✍ ✎ + +ぁ あ ぃ い ぅ + +ァ ア ィ イ ゥ + +ㄅ ㄆ ㄇ ㄈ ㄉ + +ㄱ ㄲ ㄳ ㄴ ㄵ + +㆚ ㆛ ㆜ ㆝ ㆞ + +㈀ ㈁ ㈂ ㈃ ㈄ + +㌀ ㌁ ㌂ ㌃ ㌄ + +乺 乻 乼 乽 乾 + +걺 걻 걼 걽 걾 + +豈 更 車 賈 滑 + +שּׁ שּׂ אַ אָ אּ + +ﮄ ﮅ ﮆ ﮇ ﮈ ﮉ + + ﺵ ﺶ ﺷ ﺸ + +「 」 、 ・ ヲ ァ ィ ゥ )"; + size_t pos; + BOOST_CHECK(validateUTF8(source, pos)); } BOOST_AUTO_TEST_SUITE_END() -- cgit v1.2.3