From 569e0c53f276eccbd9452988910aa8f3b4bcf13f Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Fri, 16 Jun 2017 17:13:18 +0100 Subject: Implement strict UTF-8 validation --- libdevcore/UTF8.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'libdevcore/UTF8.cpp') diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index 449ccc5d..793bc080 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -27,6 +27,50 @@ namespace dev { +namespace +{ + +/// Validate byte sequence against Unicode chapter 3 Table 3-7. +bool isWellFormed(unsigned char byte1, unsigned char byte2) +{ + switch (byte1) + { + case 0xc0 ... 0xc1: + return false; + case 0xc2 ... 0xdf: + break; + case 0xe0: + if (byte2 < 0xa0) + return false; + break; + case 0xe1 ... 0xec: + break; + case 0xed: + if (byte2 > 0x9f) + return false; + break; + case 0xee ... 0xef: + break; + case 0xf0: + if (byte2 < 0x90) + return false; + break; + case 0xf1 ... 0xf3: + break; + case 0xf4: + if (byte2 > 0x8f) + return false; + break; + case 0xf5 ... 0xf7: + default: + /// Technically anything below 0xc0 or above 0xf7 is + /// not possible to encode using Table 3-6 anyway. + return false; + } + return true; +} + +} bool validateUTF8(std::string const& _input, size_t& _invalidPosition) { @@ -36,6 +80,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) for (; i < length; i++) { + // Check for Unicode Chapter 3 Table 3-6 conformity. if ((unsigned char)_input[i] < 0x80) continue; @@ -67,6 +112,13 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) valid = false; break; } + + // Check for Unicode Chapter 3 Table 3-7 conformity. 
+ if ((j == 0) && !isWellFormed(_input[i - 1], _input[i])) + { + valid = false; + break; + } } } @@ -77,5 +129,4 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) return false; } - } -- cgit v1.2.3 From 6488f7e0795a77e4175361e3cb7270b47168a22a Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Fri, 16 Jun 2017 17:23:11 +0100 Subject: Validate first byte properly for UTF8 --- libdevcore/UTF8.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'libdevcore/UTF8.cpp') diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index 793bc080..ad62c8b0 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -85,11 +85,19 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) continue; size_t count = 0; - switch(_input[i] & 0xf0) { - case 0xc0: count = 1; break; - case 0xe0: count = 2; break; - case 0xf0: count = 3; break; - default: break; + switch ((unsigned char)_input[i]) + { + case 0xc0 ... 0xdf: + count = 1; + break; + case 0xe0 ... 0xef: + count = 2; + break; + case 0xf0 ... 
0xf7: + count = 3; + break; + default: + break; } if (count == 0) -- cgit v1.2.3 From c45dbab00cbe3c8af61695a1fc095612e089358a Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 22 Jun 2017 11:24:15 +0100 Subject: Rewrite validateUTF8 to use char --- libdevcore/UTF8.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'libdevcore/UTF8.cpp') diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index ad62c8b0..affe64b2 100644 --- a/libdevcore/UTF8.cpp +++ b/libdevcore/UTF8.cpp @@ -70,18 +70,15 @@ bool isWellFormed(unsigned char byte1, unsigned char byte2) return true; } -} - -bool validateUTF8(std::string const& _input, size_t& _invalidPosition) +bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidPosition) { - const size_t length = _input.length(); bool valid = true; size_t i = 0; - for (; i < length; i++) + for (; i < _length; i++) { // Check for Unicode Chapter 3 Table 3-6 conformity. - if ((unsigned char)_input[i] < 0x80) + if (_input[i] < 0x80) continue; size_t count = 0; @@ -106,7 +103,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) break; } - if ((i + count) >= length) + if ((i + count) >= _length) { valid = false; break; @@ -138,3 +135,10 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition) } } + +bool validateUTF8(std::string const& _input, size_t& _invalidPosition) +{ + return validateUTF8(reinterpret_cast<const unsigned char*>(_input.c_str()), _input.length(), _invalidPosition); +} + +} -- cgit v1.2.3 From aea5f90ad38bb69e7ef4346c036815aaa16f0ab3 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 22 Jun 2017 13:37:38 +0100 Subject: Rewrite validateUTF8 using if/else --- libdevcore/UTF8.cpp | 77 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) (limited to 'libdevcore/UTF8.cpp') diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp index affe64b2..2ae720ec 100644 --- a/libdevcore/UTF8.cpp +++ 
b/libdevcore/UTF8.cpp @@ -33,41 +33,48 @@ namespace /// Validate byte sequence against Unicode chapter 3 Table 3-7. bool isWellFormed(unsigned char byte1, unsigned char byte2) { - switch (byte1) - { - case 0xc0 ... 0xc1: + if (byte1 == 0xc0 || byte1 == 0xc1) return false; - case 0xc2 ... 0xdf: - break; - case 0xe0: + else if (byte1 >= 0xc2 && byte1 <= 0xdf) + return true; + else if (byte1 == 0xe0) + { if (byte2 < 0xa0) return false; - break; - case 0xe1 ... 0xec: - break; - case 0xed: + else + return true; + } + else if (byte1 >= 0xe1 && byte1 <= 0xec) + return true; + else if (byte1 == 0xed) + { if (byte2 > 0x9f) return false; - break; - case 0xee ... 0xef: - break; - case 0xf0: + else + return true; + } + else if (byte1 == 0xee || byte1 == 0xef) + return true; + else if (byte1 == 0xf0) + { if (byte2 < 0x90) return false; - break; - case 0xf1 ... 0xf3: - break; - case 0xf4: + else + return true; + } + else if (byte1 >= 0xf1 && byte1 <= 0xf3) + return true; + else if (byte1 == 0xf4) + { if (byte2 > 0x8f) return false; - break; - case 0xf5 ... 0xf7: - default: - /// Technically anything below 0xc0 or above 0xf7 is - /// not possible to encode using Table 3-6 anyway. - return false; + else + return true; } - return true; + /// 0xf5 .. 0xf7 is disallowed + /// Technically anything below 0xc0 or above 0xf7 is + /// not possible to encode using Table 3-6 anyway. + return false; } bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidPosition) @@ -82,20 +89,12 @@ bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidP continue; size_t count = 0; - switch ((unsigned char)_input[i]) - { - case 0xc0 ... 0xdf: - count = 1; - break; - case 0xe0 ... 0xef: - count = 2; - break; - case 0xf0 ... 
0xf7: - count = 3; - break; - default: - break; - } + if (_input[i] >= 0xc0 && _input[i] <= 0xdf) + count = 1; + else if (_input[i] >= 0xe0 && _input[i] <= 0xef) + count = 2; + else if (_input[i] >= 0xf0 && _input[i] <= 0xf7) + count = 3; if (count == 0) { -- cgit v1.2.3