From 87821c53c3a73d3e35a0e50a7c159d9aa5d6b253 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Wed, 14 Nov 2018 14:59:30 +0100 Subject: Isolating files shared between Yul- and Solidity language frontend. --- liblangutil/Scanner.cpp | 920 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 920 insertions(+) create mode 100644 liblangutil/Scanner.cpp (limited to 'liblangutil/Scanner.cpp') diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp new file mode 100644 index 00000000..246f5ea6 --- /dev/null +++ b/liblangutil/Scanner.cpp @@ -0,0 +1,920 @@ +/* + This file is part of solidity. + + solidity is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + solidity is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with solidity. If not, see . + + This file is derived from the file "scanner.cc", which was part of the + V8 project. The original copyright header follows: + + Copyright 2006-2012, the V8 project authors. All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/** + * @author Christian + * @date 2014 + * Solidity scanner. + */ + +#include +#include +#include +#include + +using namespace std; + +namespace dev +{ +namespace solidity +{ + +namespace +{ +bool isDecimalDigit(char c) +{ + return '0' <= c && c <= '9'; +} +bool isHexDigit(char c) +{ + return isDecimalDigit(c) + || ('a' <= c && c <= 'f') + || ('A' <= c && c <= 'F'); +} +bool isLineTerminator(char c) +{ + return c == '\n'; +} +bool isWhiteSpace(char c) +{ + return c == ' ' || c == '\n' || c == '\t' || c == '\r'; +} +bool isIdentifierStart(char c) +{ + return c == '_' || c == '$' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); +} +bool isIdentifierPart(char c) +{ + return isIdentifierStart(c) || isDecimalDigit(c); +} +int hexValue(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else return -1; +} +} // end anonymous namespace + + + +/// Scoped helper for literal recording. Automatically drops the literal +/// if aborting the scanning before it's complete. +enum LiteralType { + LITERAL_TYPE_STRING, + LITERAL_TYPE_NUMBER, // not really different from string type in behaviour + LITERAL_TYPE_COMMENT +}; + +class LiteralScope +{ +public: + explicit LiteralScope(Scanner* _self, enum LiteralType _type): m_type(_type) + , m_scanner(_self) + , m_complete(false) + { + if (_type == LITERAL_TYPE_COMMENT) + m_scanner->m_nextSkippedComment.literal.clear(); + else + m_scanner->m_nextToken.literal.clear(); + } + ~LiteralScope() + { + if (!m_complete) + { + if (m_type == LITERAL_TYPE_COMMENT) + m_scanner->m_nextSkippedComment.literal.clear(); + else + m_scanner->m_nextToken.literal.clear(); + } + } + void complete() { m_complete = true; } + +private: + enum LiteralType m_type; + Scanner* m_scanner; + bool m_complete; +}; // end of LiteralScope class + + +void Scanner::reset(CharStream const& _source, string const& _sourceName) +{ + m_source = _source; + m_sourceName = make_shared(_sourceName); + reset(); +} + +void Scanner::reset() +{ + m_source.reset(); + m_char = m_source.get(); + skipWhitespace(); + scanToken(); + next(); +} + +bool Scanner::scanHexByte(char& o_scannedByte) +{ + char x = 0; + for (int i = 0; i < 2; i++) + { + int d = hexValue(m_char); + if (d < 0) + { + rollback(i); + return false; + } + x = x * 16 + d; + advance(); + } + o_scannedByte = x; + return true; +} + +bool Scanner::scanUnicode(unsigned & o_codepoint) +{ + unsigned x = 0; + for (int i = 0; i < 4; i++) + { + int d = hexValue(m_char); + if (d < 0) + { + rollback(i); + return false; + } + x = x * 16 + d; + advance(); + } + o_codepoint = x; + return true; +} + +// This supports codepoints between 0000 and FFFF. +void Scanner::addUnicodeAsUTF8(unsigned codepoint) +{ + if (codepoint <= 0x7f) + addLiteralChar(codepoint); + else if (codepoint <= 0x7ff) + { + addLiteralChar(0xc0 | (codepoint >> 6)); + addLiteralChar(0x80 | (codepoint & 0x3f)); + } + else + { + addLiteralChar(0xe0 | (codepoint >> 12)); + addLiteralChar(0x80 | ((codepoint >> 6) & 0x3f)); + addLiteralChar(0x80 | (codepoint & 0x3f)); + } +} + +// Ensure that tokens can be stored in a byte. +BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100); + +Token Scanner::next() +{ + m_currentToken = m_nextToken; + m_skippedComment = m_nextSkippedComment; + scanToken(); + + return m_currentToken.token; +} + +Token Scanner::selectToken(char _next, Token _then, Token _else) +{ + advance(); + if (m_char == _next) + return selectToken(_then); + else + return _else; +} + +bool Scanner::skipWhitespace() +{ + int const startPosition = sourcePos(); + while (isWhiteSpace(m_char)) + advance(); + // Return whether or not we skipped any characters. + return sourcePos() != startPosition; +} + +void Scanner::skipWhitespaceExceptUnicodeLinebreak() +{ + while (isWhiteSpace(m_char) && !isUnicodeLinebreak()) + advance(); +} + +Token Scanner::skipSingleLineComment() +{ + // Line terminator is not part of the comment. If it is a + // non-ascii line terminator, it will result in a parser error. + while (!isUnicodeLinebreak()) + if (!advance()) break; + + return Token::Whitespace; +} + +Token Scanner::scanSingleLineDocComment() +{ + LiteralScope literal(this, LITERAL_TYPE_COMMENT); + advance(); //consume the last '/' at /// + + skipWhitespaceExceptUnicodeLinebreak(); + + while (!isSourcePastEndOfInput()) + { + if (isLineTerminator(m_char)) + { + // check if next line is also a documentation comment + skipWhitespace(); + if (!m_source.isPastEndOfInput(3) && + m_source.get(0) == '/' && + m_source.get(1) == '/' && + m_source.get(2) == '/') + { + addCommentLiteralChar('\n'); + m_char = m_source.advanceAndGet(3); + } + else + break; // next line is not a documentation comment, we are done + + } + else if (isUnicodeLinebreak()) + // Any line terminator that is not '\n' is considered to end the + // comment. + break; + addCommentLiteralChar(m_char); + advance(); + } + literal.complete(); + return Token::CommentLiteral; +} + +Token Scanner::skipMultiLineComment() +{ + advance(); + while (!isSourcePastEndOfInput()) + { + char ch = m_char; + advance(); + + // If we have reached the end of the multi-line comment, we + // consume the '/' and insert a whitespace. This way all + // multi-line comments are treated as whitespace. + if (ch == '*' && m_char == '/') + { + m_char = ' '; + return Token::Whitespace; + } + } + // Unterminated multi-line comment. + return Token::Illegal; +} + +Token Scanner::scanMultiLineDocComment() +{ + LiteralScope literal(this, LITERAL_TYPE_COMMENT); + bool endFound = false; + bool charsAdded = false; + + while (isWhiteSpace(m_char) && !isLineTerminator(m_char)) + advance(); + + while (!isSourcePastEndOfInput()) + { + //handle newlines in multline comments + if (isLineTerminator(m_char)) + { + skipWhitespace(); + if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '*') + { // it is unknown if this leads to the end of the comment + addCommentLiteralChar('*'); + advance(); + } + else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) != '/') + { // skip first '*' in subsequent lines + if (charsAdded) + addCommentLiteralChar('\n'); + m_char = m_source.advanceAndGet(2); + } + else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/') + { // if after newline the comment ends, don't insert the newline + m_char = m_source.advanceAndGet(2); + endFound = true; + break; + } + else if (charsAdded) + addCommentLiteralChar('\n'); + } + + if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/') + { + m_char = m_source.advanceAndGet(2); + endFound = true; + break; + } + addCommentLiteralChar(m_char); + charsAdded = true; + advance(); + } + literal.complete(); + if (!endFound) + return Token::Illegal; + else + return Token::CommentLiteral; +} + +Token Scanner::scanSlash() +{ + int firstSlashPosition = sourcePos(); + advance(); + if (m_char == '/') + { + if (!advance()) /* double slash comment directly before EOS */ + return Token::Whitespace; + else if (m_char == '/') + { + // doxygen style /// comment + Token comment; + m_nextSkippedComment.location.start = firstSlashPosition; + comment = scanSingleLineDocComment(); + m_nextSkippedComment.location.end = sourcePos(); + m_nextSkippedComment.token = comment; + return Token::Whitespace; + } + else + return skipSingleLineComment(); + } + else if (m_char == '*') + { + // doxygen style /** natspec comment + if (!advance()) /* slash star comment before EOS */ + return Token::Illegal; + else if (m_char == '*') + { + advance(); //consume the last '*' at /** + + // "/**/" + if (m_char == '/') + { + advance(); //skip the closing slash + return Token::Whitespace; + } + // we actually have a multiline documentation comment + Token comment; + m_nextSkippedComment.location.start = firstSlashPosition; + comment = scanMultiLineDocComment(); + m_nextSkippedComment.location.end = sourcePos(); + m_nextSkippedComment.token = comment; + if (comment == Token::Illegal) + return Token::Illegal; + else + return Token::Whitespace; + } + else + return skipMultiLineComment(); + } + else if (m_char == '=') + return selectToken(Token::AssignDiv); + else + return Token::Div; +} + +void Scanner::scanToken() +{ + m_nextToken.literal.clear(); + m_nextToken.extendedTokenInfo = make_tuple(0, 0); + m_nextSkippedComment.literal.clear(); + m_nextSkippedComment.extendedTokenInfo = make_tuple(0, 0); + + Token token; + // M and N are for the purposes of grabbing different type sizes + unsigned m; + unsigned n; + do + { + // Remember the position of the next token + m_nextToken.location.start = sourcePos(); + switch (m_char) + { + case '"': + case '\'': + token = scanString(); + break; + case '<': + // < <= << <<= + advance(); + if (m_char == '=') + token = selectToken(Token::LessThanOrEqual); + else if (m_char == '<') + token = selectToken('=', Token::AssignShl, Token::SHL); + else + token = Token::LessThan; + break; + case '>': + // > >= >> >>= >>> >>>= + advance(); + if (m_char == '=') + token = selectToken(Token::GreaterThanOrEqual); + else if (m_char == '>') + { + // >> >>= >>> >>>= + advance(); + if (m_char == '=') + token = selectToken(Token::AssignSar); + else if (m_char == '>') + token = selectToken('=', Token::AssignShr, Token::SHR); + else + token = Token::SAR; + } + else + token = Token::GreaterThan; + break; + case '=': + // = == => + advance(); + if (m_char == '=') + token = selectToken(Token::Equal); + else if (m_char == '>') + token = selectToken(Token::Arrow); + else + token = Token::Assign; + break; + case '!': + // ! != + advance(); + if (m_char == '=') + token = selectToken(Token::NotEqual); + else + token = Token::Not; + break; + case '+': + // + ++ += + advance(); + if (m_char == '+') + token = selectToken(Token::Inc); + else if (m_char == '=') + token = selectToken(Token::AssignAdd); + else + token = Token::Add; + break; + case '-': + // - -- -= + advance(); + if (m_char == '-') + token = selectToken(Token::Dec); + else if (m_char == '=') + token = selectToken(Token::AssignSub); + else + token = Token::Sub; + break; + case '*': + // * ** *= + advance(); + if (m_char == '*') + token = selectToken(Token::Exp); + else if (m_char == '=') + token = selectToken(Token::AssignMul); + else + token = Token::Mul; + break; + case '%': + // % %= + token = selectToken('=', Token::AssignMod, Token::Mod); + break; + case '/': + // / // /* /= + token = scanSlash(); + break; + case '&': + // & && &= + advance(); + if (m_char == '&') + token = selectToken(Token::And); + else if (m_char == '=') + token = selectToken(Token::AssignBitAnd); + else + token = Token::BitAnd; + break; + case '|': + // | || |= + advance(); + if (m_char == '|') + token = selectToken(Token::Or); + else if (m_char == '=') + token = selectToken(Token::AssignBitOr); + else + token = Token::BitOr; + break; + case '^': + // ^ ^= + token = selectToken('=', Token::AssignBitXor, Token::BitXor); + break; + case '.': + // . Number + advance(); + if (isDecimalDigit(m_char)) + token = scanNumber('.'); + else + token = Token::Period; + break; + case ':': + token = selectToken(Token::Colon); + break; + case ';': + token = selectToken(Token::Semicolon); + break; + case ',': + token = selectToken(Token::Comma); + break; + case '(': + token = selectToken(Token::LParen); + break; + case ')': + token = selectToken(Token::RParen); + break; + case '[': + token = selectToken(Token::LBrack); + break; + case ']': + token = selectToken(Token::RBrack); + break; + case '{': + token = selectToken(Token::LBrace); + break; + case '}': + token = selectToken(Token::RBrace); + break; + case '?': + token = selectToken(Token::Conditional); + break; + case '~': + token = selectToken(Token::BitNot); + break; + default: + if (isIdentifierStart(m_char)) + { + tie(token, m, n) = scanIdentifierOrKeyword(); + + // Special case for hexadecimal literals + if (token == Token::Hex) + { + // reset + m = 0; + n = 0; + + // Special quoted hex string must follow + if (m_char == '"' || m_char == '\'') + token = scanHexString(); + else + token = Token::IllegalHex; + } + } + else if (isDecimalDigit(m_char)) + token = scanNumber(); + else if (skipWhitespace()) + token = Token::Whitespace; + else if (isSourcePastEndOfInput()) + token = Token::EOS; + else + token = selectToken(Token::Illegal); + break; + } + // Continue scanning for tokens as long as we're just skipping + // whitespace. + } + while (token == Token::Whitespace); + m_nextToken.location.end = sourcePos(); + m_nextToken.token = token; + m_nextToken.extendedTokenInfo = make_tuple(m, n); +} + +bool Scanner::scanEscape() +{ + char c = m_char; + advance(); + // Skip escaped newlines. + if (isLineTerminator(c)) + return true; + switch (c) + { + case '\'': // fall through + case '"': // fall through + case '\\': + break; + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + case 'u': + { + unsigned codepoint; + if (!scanUnicode(codepoint)) + return false; + addUnicodeAsUTF8(codepoint); + return true; + } + case 'x': + if (!scanHexByte(c)) + return false; + break; + default: + return false; + } + + addLiteralChar(c); + return true; +} + +bool Scanner::isUnicodeLinebreak() +{ + if (0x0a <= m_char && m_char <= 0x0d) + // line feed, vertical tab, form feed, carriage return + return true; + else if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85) + // NEL - U+0085, C2 85 in utf8 + return true; + else if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && ( + uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9 + )) + // LS - U+2028, E2 80 A8 in utf8 + // PS - U+2029, E2 80 A9 in utf8 + return true; + else + return false; +} + +Token Scanner::scanString() +{ + char const quote = m_char; + advance(); // consume quote + LiteralScope literal(this, LITERAL_TYPE_STRING); + while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak()) + { + char c = m_char; + advance(); + if (c == '\\') + { + if (isSourcePastEndOfInput() || !scanEscape()) + return Token::Illegal; + } + else + addLiteralChar(c); + } + if (m_char != quote) + return Token::Illegal; + literal.complete(); + advance(); // consume quote + return Token::StringLiteral; +} + +Token Scanner::scanHexString() +{ + char const quote = m_char; + advance(); // consume quote + LiteralScope literal(this, LITERAL_TYPE_STRING); + while (m_char != quote && !isSourcePastEndOfInput()) + { + char c = m_char; + if (!scanHexByte(c)) + return Token::IllegalHex; + addLiteralChar(c); + } + if (m_char != quote) + return Token::IllegalHex; + literal.complete(); + advance(); // consume quote + return Token::StringLiteral; +} + +// Parse for regex [:digit:]+(_[:digit:]+)* +void Scanner::scanDecimalDigits() +{ + // MUST begin with a decimal digit. + if (!isDecimalDigit(m_char)) + return; + + // May continue with decimal digit or underscore for grouping. + do addLiteralCharAndAdvance(); + while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_')); + + // Defer further validation of underscore to SyntaxChecker. +} + +Token Scanner::scanNumber(char _charSeen) +{ + enum { DECIMAL, HEX, BINARY } kind = DECIMAL; + LiteralScope literal(this, LITERAL_TYPE_NUMBER); + if (_charSeen == '.') + { + // we have already seen a decimal point of the float + addLiteralChar('.'); + if (m_char == '_') + return Token::Illegal; + scanDecimalDigits(); // we know we have at least one digit + } + else + { + solAssert(_charSeen == 0, ""); + // if the first character is '0' we must check for octals and hex + if (m_char == '0') + { + addLiteralCharAndAdvance(); + // either 0, 0exxx, 0Exxx, 0.xxx or a hex number + if (m_char == 'x') + { + // hex number + kind = HEX; + addLiteralCharAndAdvance(); + if (!isHexDigit(m_char)) + return Token::Illegal; // we must have at least one hex digit after 'x' + + while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation + addLiteralCharAndAdvance(); + } + else if (isDecimalDigit(m_char)) + // We do not allow octal numbers + return Token::Illegal; + } + // Parse decimal digits and allow trailing fractional part. + if (kind == DECIMAL) + { + scanDecimalDigits(); // optional + if (m_char == '.') + { + if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') + { + // Assume the input may be a floating point number with leading '_' in fraction part. + // Recover by consuming it all but returning `Illegal` right away. + addLiteralCharAndAdvance(); // '.' + addLiteralCharAndAdvance(); // '_' + scanDecimalDigits(); + } + if (m_source.isPastEndOfInput() || !isDecimalDigit(m_source.get(1))) + { + // A '.' has to be followed by a number. + literal.complete(); + return Token::Number; + } + addLiteralCharAndAdvance(); + scanDecimalDigits(); + } + } + } + // scan exponent, if any + if (m_char == 'e' || m_char == 'E') + { + solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number"); + if (kind != DECIMAL) + return Token::Illegal; + else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') + { + // Recover from wrongly placed underscore as delimiter in literal with scientific + // notation by consuming until the end. + addLiteralCharAndAdvance(); // 'e' + addLiteralCharAndAdvance(); // '_' + scanDecimalDigits(); + literal.complete(); + return Token::Number; + } + // scan exponent + addLiteralCharAndAdvance(); // 'e' | 'E' + if (m_char == '+' || m_char == '-') + addLiteralCharAndAdvance(); + if (!isDecimalDigit(m_char)) + return Token::Illegal; // we must have at least one decimal digit after 'e'/'E' + scanDecimalDigits(); + } + // The source character immediately following a numeric literal must + // not be an identifier start or a decimal digit; see ECMA-262 + // section 7.8.3, page 17 (note that we read only one decimal digit + // if the value is 0). + if (isDecimalDigit(m_char) || isIdentifierStart(m_char)) + return Token::Illegal; + literal.complete(); + return Token::Number; +} + +tuple Scanner::scanIdentifierOrKeyword() +{ + solAssert(isIdentifierStart(m_char), ""); + LiteralScope literal(this, LITERAL_TYPE_STRING); + addLiteralCharAndAdvance(); + // Scan the rest of the identifier characters. + while (isIdentifierPart(m_char)) //get full literal + addLiteralCharAndAdvance(); + literal.complete(); + return TokenTraits::fromIdentifierOrKeyword(m_nextToken.literal); +} + +char CharStream::advanceAndGet(size_t _chars) +{ + if (isPastEndOfInput()) + return 0; + m_position += _chars; + if (isPastEndOfInput()) + return 0; + return m_source[m_position]; +} + +char CharStream::rollback(size_t _amount) +{ + solAssert(m_position >= _amount, ""); + m_position -= _amount; + return get(); +} + +string CharStream::lineAtPosition(int _position) const +{ + // if _position points to \n, it returns the line before the \n + using size_type = string::size_type; + size_type searchStart = min(m_source.size(), _position); + if (searchStart > 0) + searchStart--; + size_type lineStart = m_source.rfind('\n', searchStart); + if (lineStart == string::npos) + lineStart = 0; + else + lineStart++; + return m_source.substr(lineStart, min(m_source.find('\n', lineStart), + m_source.size()) - lineStart); +} + +tuple CharStream::translatePositionToLineColumn(int _position) const +{ + using size_type = string::size_type; + size_type searchPosition = min(m_source.size(), _position); + int lineNumber = count(m_source.begin(), m_source.begin() + searchPosition, '\n'); + size_type lineStart; + if (searchPosition == 0) + lineStart = 0; + else + { + lineStart = m_source.rfind('\n', searchPosition - 1); + lineStart = lineStart == string::npos ? 0 : lineStart + 1; + } + return tuple(lineNumber, searchPosition - lineStart); +} + + +} +} -- cgit v1.2.3 From 2518b5314c0c16dcdbbc0b093f5b716e9e85dbc2 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Wed, 14 Nov 2018 15:18:55 +0100 Subject: Splitting out CharStream from Scanner. --- liblangutil/Scanner.cpp | 50 ------------------------------------------------- 1 file changed, 50 deletions(-) (limited to 'liblangutil/Scanner.cpp') diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index 246f5ea6..beb39a4f 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -866,55 +866,5 @@ tuple Scanner::scanIdentifierOrKeyword() return TokenTraits::fromIdentifierOrKeyword(m_nextToken.literal); } -char CharStream::advanceAndGet(size_t _chars) -{ - if (isPastEndOfInput()) - return 0; - m_position += _chars; - if (isPastEndOfInput()) - return 0; - return m_source[m_position]; -} - -char CharStream::rollback(size_t _amount) -{ - solAssert(m_position >= _amount, ""); - m_position -= _amount; - return get(); -} - -string CharStream::lineAtPosition(int _position) const -{ - // if _position points to \n, it returns the line before the \n - using size_type = string::size_type; - size_type searchStart = min(m_source.size(), _position); - if (searchStart > 0) - searchStart--; - size_type lineStart = m_source.rfind('\n', searchStart); - if (lineStart == string::npos) - lineStart = 0; - else - lineStart++; - return m_source.substr(lineStart, min(m_source.find('\n', lineStart), - m_source.size()) - lineStart); -} - -tuple CharStream::translatePositionToLineColumn(int _position) const -{ - using size_type = string::size_type; - size_type searchPosition = min(m_source.size(), _position); - int lineNumber = count(m_source.begin(), m_source.begin() + searchPosition, '\n'); - size_type lineStart; - if (searchPosition == 0) - lineStart = 0; - else - { - lineStart = m_source.rfind('\n', searchPosition - 1); - lineStart = lineStart == string::npos ? 0 : lineStart + 1; - } - return tuple(lineNumber, searchPosition - lineStart); -} - - } } -- cgit v1.2.3 From d67322a1861d60a88151f7c25d6c3478a9a39acf Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Wed, 14 Nov 2018 17:11:55 +0100 Subject: Introduce namespace `langutil` in liblangutil directory. Also: - Use {}-style list initialisation for SourceLocation construction - Introduce new system includes - Changes the API of the Scanner to take source as value (with move) as opposed to as a reference --- liblangutil/Scanner.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'liblangutil/Scanner.cpp') diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index beb39a4f..3d7527d4 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -50,16 +50,14 @@ * Solidity scanner. */ -#include -#include #include #include +#include +#include using namespace std; -namespace dev -{ -namespace solidity +namespace langutil { namespace @@ -143,10 +141,10 @@ private: }; // end of LiteralScope class -void Scanner::reset(CharStream const& _source, string const& _sourceName) +void Scanner::reset(CharStream _source, string _sourceName) { - m_source = _source; - m_sourceName = make_shared(_sourceName); + m_source = std::move(_source); + m_sourceName = make_shared(std::move(_sourceName)); reset(); } @@ -866,5 +864,5 @@ tuple Scanner::scanIdentifierOrKeyword() return TokenTraits::fromIdentifierOrKeyword(m_nextToken.literal); } -} + } -- cgit v1.2.3 From e454737a3cf389ee400a9ef1d9f252c579a2ceea Mon Sep 17 00:00:00 2001 From: Lazaridis Date: Thu, 22 Nov 2018 18:37:19 +0200 Subject: adapt to latest code changes --- liblangutil/Scanner.cpp | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'liblangutil/Scanner.cpp') diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index 3d7527d4..091e9b89 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -311,7 +311,7 @@ Token Scanner::skipMultiLineComment() } } // Unterminated multi-line comment. - return Token::Illegal; + return Token::IllegalCommentTerminator; } Token Scanner::scanMultiLineDocComment() @@ -362,7 +362,7 @@ Token Scanner::scanMultiLineDocComment() } literal.complete(); if (!endFound) - return Token::Illegal; + return Token::IllegalCommentTerminator; else return Token::CommentLiteral; } @@ -392,7 +392,7 @@ Token Scanner::scanSlash() { // doxygen style /** natspec comment if (!advance()) /* slash star comment before EOS */ - return Token::Illegal; + return Token::IllegalCommentTerminator; else if (m_char == '*') { advance(); //consume the last '*' at /** @@ -409,8 +409,9 @@ Token Scanner::scanSlash() comment = scanMultiLineDocComment(); m_nextSkippedComment.location.end = sourcePos(); m_nextSkippedComment.token = comment; - if (comment == Token::Illegal) - return Token::Illegal; + // @todo possibly: if (comment.isIllegal) return comment; to pass all errors + if (comment == Token::IllegalCommentTerminator) + return Token::IllegalCommentTerminator; else return Token::Whitespace; } @@ -620,6 +621,7 @@ void Scanner::scanToken() else if (isSourcePastEndOfInput()) token = Token::EOS; else + // @todo verfiy if this is actually an "IllegalUnknown" case token = selectToken(Token::Illegal); break; } @@ -713,13 +715,13 @@ Token Scanner::scanString() if (c == '\\') { if (isSourcePastEndOfInput() || !scanEscape()) - return Token::Illegal; + return Token::IllegalStringEscape; } else addLiteralChar(c); } if (m_char != quote) - return Token::Illegal; + return Token::IllegalStringEndQuote; literal.complete(); advance(); // consume quote return Token::StringLiteral; @@ -767,7 +769,8 @@ Token Scanner::scanNumber(char _charSeen) // we have already seen a decimal point of the float addLiteralChar('.'); if (m_char == '_') - return Token::Illegal; + // @todo add test-case (change of return value did not break test) + return Token::IllegalNumberSeparator; scanDecimalDigits(); // we know we have at least one digit } else @@ -784,14 +787,14 @@ Token Scanner::scanNumber(char _charSeen) kind = HEX; addLiteralCharAndAdvance(); if (!isHexDigit(m_char)) - return Token::Illegal; // we must have at least one hex digit after 'x' + return Token::IllegalHexDigit; // we must have at least one hex digit after 'x' while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation addLiteralCharAndAdvance(); } else if (isDecimalDigit(m_char)) // We do not allow octal numbers - return Token::Illegal; + return Token::IllegalOctalNotAllowed; } // Parse decimal digits and allow trailing fractional part. if (kind == DECIMAL) @@ -823,7 +826,8 @@ Token Scanner::scanNumber(char _charSeen) { solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number"); if (kind != DECIMAL) - return Token::Illegal; + // @todo add test (change introduced no failing) + return Token::IllegalExponent; else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') { // Recover from wrongly placed underscore as delimiter in literal with scientific @@ -839,7 +843,7 @@ Token Scanner::scanNumber(char _charSeen) if (m_char == '+' || m_char == '-') addLiteralCharAndAdvance(); if (!isDecimalDigit(m_char)) - return Token::Illegal; // we must have at least one decimal digit after 'e'/'E' + return Token::IllegalExponent; // we must have at least one decimal digit after 'e'/'E' scanDecimalDigits(); } // The source character immediately following a numeric literal must @@ -847,7 +851,7 @@ Token Scanner::scanNumber(char _charSeen) // section 7.8.3, page 17 (note that we read only one decimal digit // if the value is 0). if (isDecimalDigit(m_char) || isIdentifierStart(m_char)) - return Token::Illegal; + return Token::IllegalNumberEnd; literal.complete(); return Token::Number; } -- cgit v1.2.3 From e4106bd06eebce9e17d51858a37bf82566b7f640 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Fri, 23 Nov 2018 16:47:34 +0100 Subject: Change scanner error diagnostics to be non-intrusive to the token API. This also implicitly eliminates the magic-token Token::IllegalHex, and streamlines error diagnostics over a custom enum class. --- liblangutil/Scanner.cpp | 70 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 22 deletions(-) (limited to 'liblangutil/Scanner.cpp') diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index 091e9b89..215171b3 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -53,6 +53,7 @@ #include #include #include +#include #include using namespace std; @@ -100,7 +101,32 @@ int hexValue(char c) } } // end anonymous namespace +std::string to_string(ScannerError _errorCode) +{ + switch (_errorCode) + { + case ScannerError::NoError: return "No error."; + case ScannerError::IllegalToken: return "Invalid token."; + case ScannerError::IllegalHexString: return "Expected even number of hex-nibbles within double-quotes."; + case ScannerError::IllegalHexDigit: return "Hexadecimal digit missing or invalid."; + case ScannerError::IllegalCommentTerminator: return "Expected multi-line comment-terminator."; + case ScannerError::IllegalEscapeSequence: return "Invalid escape sequence."; + case ScannerError::IllegalStringEndQuote: return "Expected string end-quote."; + case ScannerError::IllegalNumberSeparator: return "Invalid use of number separator '_'."; + case ScannerError::IllegalExponent: return "Invalid exponent."; + case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number."; + case ScannerError::OctalNotAllowed: return "Octal numbers not allowed."; + default: + solAssert(false, "Unhandled case in to_string(ScannerError)"); + return ""; + } +} +std::ostream& operator<<(std::ostream& os, ScannerError _errorCode) +{ + os << to_string(_errorCode); + return os; +} /// Scoped helper for literal recording. Automatically drops the literal /// if aborting the scanning before it's complete. @@ -311,7 +337,7 @@ Token Scanner::skipMultiLineComment() } } // Unterminated multi-line comment. - return Token::IllegalCommentTerminator; + return setError(ScannerError::IllegalCommentTerminator); } Token Scanner::scanMultiLineDocComment() @@ -362,7 +388,7 @@ Token Scanner::scanMultiLineDocComment() } literal.complete(); if (!endFound) - return Token::IllegalCommentTerminator; + return setError(ScannerError::IllegalCommentTerminator); else return Token::CommentLiteral; } @@ -392,7 +418,7 @@ Token Scanner::scanSlash() { // doxygen style /** natspec comment if (!advance()) /* slash star comment before EOS */ - return Token::IllegalCommentTerminator; + return setError(ScannerError::IllegalCommentTerminator); else if (m_char == '*') { advance(); //consume the last '*' at /** @@ -409,9 +435,8 @@ Token Scanner::scanSlash() comment = scanMultiLineDocComment(); m_nextSkippedComment.location.end = sourcePos(); m_nextSkippedComment.token = comment; - // @todo possibly: if (comment.isIllegal) return comment; to pass all errors - if (comment == Token::IllegalCommentTerminator) - return Token::IllegalCommentTerminator; + if (comment == Token::Illegal) + return Token::Illegal; // error already set else return Token::Whitespace; } @@ -426,6 +451,7 @@ Token Scanner::scanSlash() void Scanner::scanToken() { + m_nextToken.error = ScannerError::NoError; m_nextToken.literal.clear(); m_nextToken.extendedTokenInfo = make_tuple(0, 0); m_nextSkippedComment.literal.clear(); @@ -611,7 +637,7 @@ void Scanner::scanToken() if (m_char == '"' || m_char == '\'') token = scanHexString(); else - token = Token::IllegalHex; + token = setError(ScannerError::IllegalToken); } } else if (isDecimalDigit(m_char)) @@ -621,8 +647,7 @@ void Scanner::scanToken() else if (isSourcePastEndOfInput()) token = Token::EOS; else - // @todo verfiy if this is actually an "IllegalUnknown" case - token = selectToken(Token::Illegal); + token = selectErrorToken(ScannerError::IllegalToken); break; } // Continue scanning for tokens as long as we're just skipping @@ -715,13 +740,13 @@ Token Scanner::scanString() if (c == '\\') { if (isSourcePastEndOfInput() || !scanEscape()) - return Token::IllegalStringEscape; + return setError(ScannerError::IllegalEscapeSequence); } else addLiteralChar(c); } if (m_char != quote) - return Token::IllegalStringEndQuote; + return setError(ScannerError::IllegalStringEndQuote); literal.complete(); advance(); // consume quote return Token::StringLiteral; @@ -736,11 +761,14 @@ Token Scanner::scanHexString() { char c = m_char; if (!scanHexByte(c)) - return Token::IllegalHex; + // can only return false if hex-byte is incomplete (only one hex digit instead of two) + return setError(ScannerError::IllegalHexString); addLiteralChar(c); } + if (m_char != quote) - return Token::IllegalHex; + return setError(ScannerError::IllegalStringEndQuote); + literal.complete(); advance(); // consume quote return Token::StringLiteral; @@ -769,8 +797,7 @@ Token Scanner::scanNumber(char _charSeen) // we have already seen a decimal point of the float addLiteralChar('.'); if (m_char == '_') - // @todo add test-case (change of return value did not break test) - return Token::IllegalNumberSeparator; + return setError(ScannerError::IllegalToken); scanDecimalDigits(); // we know we have at least one digit } else @@ -787,14 +814,14 @@ Token Scanner::scanNumber(char _charSeen) kind = HEX; addLiteralCharAndAdvance(); if (!isHexDigit(m_char)) - return Token::IllegalHexDigit; // we must have at least one hex digit after 'x' + return setError(ScannerError::IllegalHexDigit); // we must have at least one hex digit after 'x' while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation addLiteralCharAndAdvance(); } else if (isDecimalDigit(m_char)) // We do not allow octal numbers - return Token::IllegalOctalNotAllowed; + return setError(ScannerError::OctalNotAllowed); } // Parse decimal digits and allow trailing fractional part. if (kind == DECIMAL) @@ -826,8 +853,7 @@ Token Scanner::scanNumber(char _charSeen) { solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number"); if (kind != DECIMAL) - // @todo add test (change introduced no failing) - return Token::IllegalExponent; + return setError(ScannerError::IllegalExponent); else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') { // Recover from wrongly placed underscore as delimiter in literal with scientific @@ -842,8 +868,8 @@ Token Scanner::scanNumber(char _charSeen) addLiteralCharAndAdvance(); // 'e' | 'E' if (m_char == '+' || m_char == '-') addLiteralCharAndAdvance(); - if (!isDecimalDigit(m_char)) - return Token::IllegalExponent; // we must have at least one decimal digit after 'e'/'E' + if (!isDecimalDigit(m_char)) // we must have at least one decimal digit after 'e'/'E' + return setError(ScannerError::IllegalExponent); scanDecimalDigits(); } // The source character immediately following a numeric literal must @@ -851,7 +877,7 @@ Token Scanner::scanNumber(char _charSeen) // section 7.8.3, page 17 (note that we read only one decimal digit // if the value is 0). if (isDecimalDigit(m_char) || isIdentifierStart(m_char)) - return Token::IllegalNumberEnd; + return setError(ScannerError::IllegalNumberEnd); literal.complete(); return Token::Number; } -- cgit v1.2.3 From c48a5264be4221873fe02cac57f6a41a32010fea Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Wed, 28 Nov 2018 16:19:22 +0100 Subject: liblangutil: SourceLocation: adds (shared) pointer to underlying CharStream source, eliminating sourceName Also, adapted affecting code to those changes. --- liblangutil/Scanner.cpp | 51 ++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'liblangutil/Scanner.cpp') diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index 215171b3..5c0f356e 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -169,15 +169,22 @@ private: void Scanner::reset(CharStream _source, string _sourceName) { - m_source = std::move(_source); + m_source = make_shared(std::move(_source)); m_sourceName = make_shared(std::move(_sourceName)); reset(); } +void Scanner::reset(std::shared_ptr _source) +{ + solAssert(_source.get() != nullptr, "You MUST provide a CharStream when resetting."); + m_source = _source; + reset(); +} + void Scanner::reset() { - m_source.reset(); - m_char = m_source.get(); + m_source->reset(); + m_char = m_source->get(); skipWhitespace(); scanToken(); next(); @@ -296,13 +303,13 @@ Token Scanner::scanSingleLineDocComment() { // check if next line is also a documentation comment skipWhitespace(); - if (!m_source.isPastEndOfInput(3) && - m_source.get(0) == '/' && - m_source.get(1) == '/' && - m_source.get(2) == '/') + if (!m_source->isPastEndOfInput(3) && + m_source->get(0) == '/' && + m_source->get(1) == '/' && + m_source->get(2) == '/') { addCommentLiteralChar('\n'); - m_char = m_source.advanceAndGet(3); + m_char = m_source->advanceAndGet(3); } else break; // next line is not a documentation comment, we are done @@ -355,20 +362,20 @@ Token Scanner::scanMultiLineDocComment() if (isLineTerminator(m_char)) { skipWhitespace(); - if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '*') + if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '*') { // it is unknown if this leads to the end of the comment addCommentLiteralChar('*'); advance(); } - else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) != '/') + else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) != '/') { // skip first '*' in subsequent lines if (charsAdded) addCommentLiteralChar('\n'); - m_char = m_source.advanceAndGet(2); + m_char = m_source->advanceAndGet(2); } - else if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/') + else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/') { // if after newline the comment ends, don't insert the newline - m_char = m_source.advanceAndGet(2); + m_char = m_source->advanceAndGet(2); endFound = true; break; } @@ -376,9 +383,9 @@ Token Scanner::scanMultiLineDocComment() addCommentLiteralChar('\n'); } - if (!m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/') + if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/') { - m_char = m_source.advanceAndGet(2); + m_char = m_source->advanceAndGet(2); endFound = true; break; } @@ -715,11 +722,11 @@ bool Scanner::isUnicodeLinebreak() if (0x0a <= m_char && m_char <= 0x0d) // line feed, vertical tab, form feed, carriage return return true; - else if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85) + else if (!m_source->isPastEndOfInput(1) && uint8_t(m_source->get(0)) == 0xc2 && uint8_t(m_source->get(1)) == 0x85) // NEL - U+0085, C2 85 in utf8 return true; - else if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && ( - uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9 + else if (!m_source->isPastEndOfInput(2) && uint8_t(m_source->get(0)) == 0xe2 && uint8_t(m_source->get(1)) == 0x80 && ( + uint8_t(m_source->get(2)) == 0xa8 || uint8_t(m_source->get(2)) == 0xa9 )) // LS - U+2028, E2 80 A8 in utf8 // PS - U+2029, E2 80 A9 in utf8 @@ -783,7 +790,7 @@ void Scanner::scanDecimalDigits() // May continue with decimal digit or underscore for grouping. do addLiteralCharAndAdvance(); - while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_')); + while (!m_source->isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_')); // Defer further validation of underscore to SyntaxChecker. } @@ -829,7 +836,7 @@ Token Scanner::scanNumber(char _charSeen) scanDecimalDigits(); // optional if (m_char == '.') { - if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') + if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_') { // Assume the input may be a floating point number with leading '_' in fraction part. // Recover by consuming it all but returning `Illegal` right away. @@ -837,7 +844,7 @@ Token Scanner::scanNumber(char _charSeen) addLiteralCharAndAdvance(); // '_' scanDecimalDigits(); } - if (m_source.isPastEndOfInput() || !isDecimalDigit(m_source.get(1))) + if (m_source->isPastEndOfInput() || !isDecimalDigit(m_source->get(1))) { // A '.' has to be followed by a number. literal.complete(); @@ -854,7 +861,7 @@ Token Scanner::scanNumber(char _charSeen) solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number"); if (kind != DECIMAL) return setError(ScannerError::IllegalExponent); - else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_') + else if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_') { // Recover from wrongly placed underscore as delimiter in literal with scientific // notation by consuming until the end. -- cgit v1.2.3 From 435f7b3b72157e884344adbc7b62033bd08bb51c Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Thu, 29 Nov 2018 01:58:15 +0100 Subject: liblangutil: Scanner: remove superfluous sourceName field (it's in CharStream already) Also, ParserBase::sourceName() was dead code. Eliminating it should increase test coverage (how sneaky) :-) --- liblangutil/Scanner.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'liblangutil/Scanner.cpp') diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index 5c0f356e..ac298bd5 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -167,10 +167,9 @@ private: }; // end of LiteralScope class -void Scanner::reset(CharStream _source, string _sourceName) +void Scanner::reset(CharStream _source) { m_source = make_shared(std::move(_source)); - m_sourceName = make_shared(std::move(_sourceName)); reset(); } -- cgit v1.2.3