diff options
Diffstat (limited to 'libdevcore/UTF8.cpp')
-rw-r--r-- | libdevcore/UTF8.cpp | 84 |
1 files changed, 73 insertions, 11 deletions
diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp
index 449ccc5d..2ae720ec 100644
--- a/libdevcore/UTF8.cpp
+++ b/libdevcore/UTF8.cpp
@@ -27,25 +27,74 @@
 namespace dev
 {
 
+namespace
+{
 
-bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
+/// Validate byte sequence against Unicode chapter 3 Table 3-7.
+bool isWellFormed(unsigned char byte1, unsigned char byte2)
+{
+    if (byte1 == 0xc0 || byte1 == 0xc1)
+        return false;
+    else if (byte1 >= 0xc2 && byte1 <= 0xdf)
+        return true;
+    else if (byte1 == 0xe0)
+    {
+        if (byte2 < 0xa0)
+            return false;
+        else
+            return true;
+    }
+    else if (byte1 >= 0xe1 && byte1 <= 0xec)
+        return true;
+    else if (byte1 == 0xed)
+    {
+        if (byte2 > 0x9f)
+            return false;
+        else
+            return true;
+    }
+    else if (byte1 == 0xee || byte1 == 0xef)
+        return true;
+    else if (byte1 == 0xf0)
+    {
+        if (byte2 < 0x90)
+            return false;
+        else
+            return true;
+    }
+    else if (byte1 >= 0xf1 && byte1 <= 0xf3)
+        return true;
+    else if (byte1 == 0xf4)
+    {
+        if (byte2 > 0x8f)
+            return false;
+        else
+            return true;
+    }
+    /// 0xf5 .. 0xf7 is disallowed
+    /// Technically anything below 0xc0 or above 0xf7 is
+    /// not possible to encode using Table 3-6 anyway.
+    return false;
+}
+
+bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidPosition)
 {
-    const size_t length = _input.length();
     bool valid = true;
     size_t i = 0;
 
-    for (; i < length; i++)
+    for (; i < _length; i++)
     {
-        if ((unsigned char)_input[i] < 0x80)
+        // Check for Unicode Chapter 3 Table 3-6 conformity.
+        if (_input[i] < 0x80)
             continue;
 
         size_t count = 0;
-        switch(_input[i] & 0xf0) {
-            case 0xc0: count = 1; break;
-            case 0xe0: count = 2; break;
-            case 0xf0: count = 3; break;
-            default: break;
-        }
+        if (_input[i] >= 0xc0 && _input[i] <= 0xdf)
+            count = 1;
+        else if (_input[i] >= 0xe0 && _input[i] <= 0xef)
+            count = 2;
+        else if (_input[i] >= 0xf0 && _input[i] <= 0xf7)
+            count = 3;
 
         if (count == 0)
         {
@@ -53,7 +102,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
             break;
         }
 
-        if ((i + count) >= length)
+        if ((i + count) >= _length)
         {
             valid = false;
             break;
@@ -67,6 +116,13 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
                 valid = false;
                 break;
             }
+
+            // Check for Unicode Chapter 3 Table 3-7 conformity.
+            if ((j == 0) && !isWellFormed(_input[i - 1], _input[i]))
+            {
+                valid = false;
+                break;
+            }
         }
     }
 
@@ -77,5 +133,11 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
     return false;
 }
 
+}
+
+bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
+{
+    return validateUTF8(reinterpret_cast<unsigned char const*>(_input.c_str()), _input.length(), _invalidPosition);
+}
 
 }