aboutsummaryrefslogtreecommitdiffstats
path: root/liblangutil/Scanner.cpp
diff options
context:
space:
mode:
authorchriseth <chris@ethereum.org>2018-12-03 22:48:03 +0800
committerGitHub <noreply@github.com>2018-12-03 22:48:03 +0800
commitc8a2cb62832afb2dc09ccee6fd42c1516dfdb981 (patch)
tree7977e9dcbbc215088c05b847f849871ef5d4ae66 /liblangutil/Scanner.cpp
parent1d4f565a64988a3400847d2655ca24f73f234bc6 (diff)
parent590be1d84cea9850ce69b68be3dc5294b39041e5 (diff)
downloaddexon-solidity-c8a2cb62832afb2dc09ccee6fd42c1516dfdb981.tar
dexon-solidity-c8a2cb62832afb2dc09ccee6fd42c1516dfdb981.tar.gz
dexon-solidity-c8a2cb62832afb2dc09ccee6fd42c1516dfdb981.tar.bz2
dexon-solidity-c8a2cb62832afb2dc09ccee6fd42c1516dfdb981.tar.lz
dexon-solidity-c8a2cb62832afb2dc09ccee6fd42c1516dfdb981.tar.xz
dexon-solidity-c8a2cb62832afb2dc09ccee6fd42c1516dfdb981.tar.zst
dexon-solidity-c8a2cb62832afb2dc09ccee6fd42c1516dfdb981.zip
Merge pull request #5571 from ethereum/develop
Version 0.5.1
Diffstat (limited to 'liblangutil/Scanner.cpp')
-rw-r--r--liblangutil/Scanner.cpp904
1 files changed, 904 insertions, 0 deletions
diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp
new file mode 100644
index 00000000..ac298bd5
--- /dev/null
+++ b/liblangutil/Scanner.cpp
@@ -0,0 +1,904 @@
+/*
+ This file is part of solidity.
+
+ solidity is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ solidity is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with solidity. If not, see <http://www.gnu.org/licenses/>.
+
+ This file is derived from the file "scanner.cc", which was part of the
+ V8 project. The original copyright header follows:
+
+ Copyright 2006-2012, the V8 project authors. All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/**
+ * @author Christian <c@ethdev.com>
+ * @date 2014
+ * Solidity scanner.
+ */
+
+#include <liblangutil/Exceptions.h>
+#include <liblangutil/Scanner.h>
+#include <algorithm>
+#include <ostream>
+#include <tuple>
+
+using namespace std;
+
+namespace langutil
+{
+
+namespace
+{
+bool isDecimalDigit(char c)
+{
+ return '0' <= c && c <= '9';
+}
+bool isHexDigit(char c)
+{
+ return isDecimalDigit(c)
+ || ('a' <= c && c <= 'f')
+ || ('A' <= c && c <= 'F');
+}
+bool isLineTerminator(char c)
+{
+ return c == '\n';
+}
+bool isWhiteSpace(char c)
+{
+ return c == ' ' || c == '\n' || c == '\t' || c == '\r';
+}
+bool isIdentifierStart(char c)
+{
+ return c == '_' || c == '$' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
+}
+bool isIdentifierPart(char c)
+{
+ return isIdentifierStart(c) || isDecimalDigit(c);
+}
+int hexValue(char c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else if (c >= 'a' && c <= 'f')
+ return c - 'a' + 10;
+ else if (c >= 'A' && c <= 'F')
+ return c - 'A' + 10;
+ else return -1;
+}
+} // end anonymous namespace
+
+std::string to_string(ScannerError _errorCode)
+{
+ switch (_errorCode)
+ {
+ case ScannerError::NoError: return "No error.";
+ case ScannerError::IllegalToken: return "Invalid token.";
+ case ScannerError::IllegalHexString: return "Expected even number of hex-nibbles within double-quotes.";
+ case ScannerError::IllegalHexDigit: return "Hexadecimal digit missing or invalid.";
+ case ScannerError::IllegalCommentTerminator: return "Expected multi-line comment-terminator.";
+ case ScannerError::IllegalEscapeSequence: return "Invalid escape sequence.";
+ case ScannerError::IllegalStringEndQuote: return "Expected string end-quote.";
+ case ScannerError::IllegalNumberSeparator: return "Invalid use of number separator '_'.";
+ case ScannerError::IllegalExponent: return "Invalid exponent.";
+ case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
+ case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
+ default:
+ solAssert(false, "Unhandled case in to_string(ScannerError)");
+ return "";
+ }
+}
+
+std::ostream& operator<<(std::ostream& os, ScannerError _errorCode)
+{
+ os << to_string(_errorCode);
+ return os;
+}
+
+/// Scoped helper for literal recording. Automatically drops the literal
+/// if aborting the scanning before it's complete.
+enum LiteralType {
+ LITERAL_TYPE_STRING,
+ LITERAL_TYPE_NUMBER, // not really different from string type in behaviour
+ LITERAL_TYPE_COMMENT
+};
+
+class LiteralScope
+{
+public:
+ explicit LiteralScope(Scanner* _self, enum LiteralType _type): m_type(_type)
+ , m_scanner(_self)
+ , m_complete(false)
+ {
+ if (_type == LITERAL_TYPE_COMMENT)
+ m_scanner->m_nextSkippedComment.literal.clear();
+ else
+ m_scanner->m_nextToken.literal.clear();
+ }
+ ~LiteralScope()
+ {
+ if (!m_complete)
+ {
+ if (m_type == LITERAL_TYPE_COMMENT)
+ m_scanner->m_nextSkippedComment.literal.clear();
+ else
+ m_scanner->m_nextToken.literal.clear();
+ }
+ }
+ void complete() { m_complete = true; }
+
+private:
+ enum LiteralType m_type;
+ Scanner* m_scanner;
+ bool m_complete;
+}; // end of LiteralScope class
+
+
+void Scanner::reset(CharStream _source)
+{
+ m_source = make_shared<CharStream>(std::move(_source));
+ reset();
+}
+
+void Scanner::reset(std::shared_ptr<CharStream> _source)
+{
+ solAssert(_source.get() != nullptr, "You MUST provide a CharStream when resetting.");
+ m_source = _source;
+ reset();
+}
+
+void Scanner::reset()
+{
+ m_source->reset();
+ m_char = m_source->get();
+ skipWhitespace();
+ scanToken();
+ next();
+}
+
+bool Scanner::scanHexByte(char& o_scannedByte)
+{
+ char x = 0;
+ for (int i = 0; i < 2; i++)
+ {
+ int d = hexValue(m_char);
+ if (d < 0)
+ {
+ rollback(i);
+ return false;
+ }
+ x = x * 16 + d;
+ advance();
+ }
+ o_scannedByte = x;
+ return true;
+}
+
+bool Scanner::scanUnicode(unsigned & o_codepoint)
+{
+ unsigned x = 0;
+ for (int i = 0; i < 4; i++)
+ {
+ int d = hexValue(m_char);
+ if (d < 0)
+ {
+ rollback(i);
+ return false;
+ }
+ x = x * 16 + d;
+ advance();
+ }
+ o_codepoint = x;
+ return true;
+}
+
+// This supports codepoints between 0000 and FFFF.
+void Scanner::addUnicodeAsUTF8(unsigned codepoint)
+{
+ if (codepoint <= 0x7f)
+ addLiteralChar(codepoint);
+ else if (codepoint <= 0x7ff)
+ {
+ addLiteralChar(0xc0 | (codepoint >> 6));
+ addLiteralChar(0x80 | (codepoint & 0x3f));
+ }
+ else
+ {
+ addLiteralChar(0xe0 | (codepoint >> 12));
+ addLiteralChar(0x80 | ((codepoint >> 6) & 0x3f));
+ addLiteralChar(0x80 | (codepoint & 0x3f));
+ }
+}
+
+// Ensure that tokens can be stored in a byte.
+BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100);
+
+Token Scanner::next()
+{
+ m_currentToken = m_nextToken;
+ m_skippedComment = m_nextSkippedComment;
+ scanToken();
+
+ return m_currentToken.token;
+}
+
+Token Scanner::selectToken(char _next, Token _then, Token _else)
+{
+ advance();
+ if (m_char == _next)
+ return selectToken(_then);
+ else
+ return _else;
+}
+
+bool Scanner::skipWhitespace()
+{
+ int const startPosition = sourcePos();
+ while (isWhiteSpace(m_char))
+ advance();
+ // Return whether or not we skipped any characters.
+ return sourcePos() != startPosition;
+}
+
+void Scanner::skipWhitespaceExceptUnicodeLinebreak()
+{
+ while (isWhiteSpace(m_char) && !isUnicodeLinebreak())
+ advance();
+}
+
+Token Scanner::skipSingleLineComment()
+{
+ // Line terminator is not part of the comment. If it is a
+ // non-ascii line terminator, it will result in a parser error.
+ while (!isUnicodeLinebreak())
+ if (!advance()) break;
+
+ return Token::Whitespace;
+}
+
+Token Scanner::scanSingleLineDocComment()
+{
+ LiteralScope literal(this, LITERAL_TYPE_COMMENT);
+ advance(); //consume the last '/' at ///
+
+ skipWhitespaceExceptUnicodeLinebreak();
+
+ while (!isSourcePastEndOfInput())
+ {
+ if (isLineTerminator(m_char))
+ {
+ // check if next line is also a documentation comment
+ skipWhitespace();
+ if (!m_source->isPastEndOfInput(3) &&
+ m_source->get(0) == '/' &&
+ m_source->get(1) == '/' &&
+ m_source->get(2) == '/')
+ {
+ addCommentLiteralChar('\n');
+ m_char = m_source->advanceAndGet(3);
+ }
+ else
+ break; // next line is not a documentation comment, we are done
+
+ }
+ else if (isUnicodeLinebreak())
+ // Any line terminator that is not '\n' is considered to end the
+ // comment.
+ break;
+ addCommentLiteralChar(m_char);
+ advance();
+ }
+ literal.complete();
+ return Token::CommentLiteral;
+}
+
+Token Scanner::skipMultiLineComment()
+{
+ advance();
+ while (!isSourcePastEndOfInput())
+ {
+ char ch = m_char;
+ advance();
+
+ // If we have reached the end of the multi-line comment, we
+ // consume the '/' and insert a whitespace. This way all
+ // multi-line comments are treated as whitespace.
+ if (ch == '*' && m_char == '/')
+ {
+ m_char = ' ';
+ return Token::Whitespace;
+ }
+ }
+ // Unterminated multi-line comment.
+ return setError(ScannerError::IllegalCommentTerminator);
+}
+
+Token Scanner::scanMultiLineDocComment()
+{
+ LiteralScope literal(this, LITERAL_TYPE_COMMENT);
+ bool endFound = false;
+ bool charsAdded = false;
+
+ while (isWhiteSpace(m_char) && !isLineTerminator(m_char))
+ advance();
+
+ while (!isSourcePastEndOfInput())
+ {
+ //handle newlines in multline comments
+ if (isLineTerminator(m_char))
+ {
+ skipWhitespace();
+ if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '*')
+ { // it is unknown if this leads to the end of the comment
+ addCommentLiteralChar('*');
+ advance();
+ }
+ else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) != '/')
+ { // skip first '*' in subsequent lines
+ if (charsAdded)
+ addCommentLiteralChar('\n');
+ m_char = m_source->advanceAndGet(2);
+ }
+ else if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/')
+ { // if after newline the comment ends, don't insert the newline
+ m_char = m_source->advanceAndGet(2);
+ endFound = true;
+ break;
+ }
+ else if (charsAdded)
+ addCommentLiteralChar('\n');
+ }
+
+ if (!m_source->isPastEndOfInput(1) && m_source->get(0) == '*' && m_source->get(1) == '/')
+ {
+ m_char = m_source->advanceAndGet(2);
+ endFound = true;
+ break;
+ }
+ addCommentLiteralChar(m_char);
+ charsAdded = true;
+ advance();
+ }
+ literal.complete();
+ if (!endFound)
+ return setError(ScannerError::IllegalCommentTerminator);
+ else
+ return Token::CommentLiteral;
+}
+
+Token Scanner::scanSlash()
+{
+ int firstSlashPosition = sourcePos();
+ advance();
+ if (m_char == '/')
+ {
+ if (!advance()) /* double slash comment directly before EOS */
+ return Token::Whitespace;
+ else if (m_char == '/')
+ {
+ // doxygen style /// comment
+ Token comment;
+ m_nextSkippedComment.location.start = firstSlashPosition;
+ comment = scanSingleLineDocComment();
+ m_nextSkippedComment.location.end = sourcePos();
+ m_nextSkippedComment.token = comment;
+ return Token::Whitespace;
+ }
+ else
+ return skipSingleLineComment();
+ }
+ else if (m_char == '*')
+ {
+ // doxygen style /** natspec comment
+ if (!advance()) /* slash star comment before EOS */
+ return setError(ScannerError::IllegalCommentTerminator);
+ else if (m_char == '*')
+ {
+ advance(); //consume the last '*' at /**
+
+ // "/**/"
+ if (m_char == '/')
+ {
+ advance(); //skip the closing slash
+ return Token::Whitespace;
+ }
+ // we actually have a multiline documentation comment
+ Token comment;
+ m_nextSkippedComment.location.start = firstSlashPosition;
+ comment = scanMultiLineDocComment();
+ m_nextSkippedComment.location.end = sourcePos();
+ m_nextSkippedComment.token = comment;
+ if (comment == Token::Illegal)
+ return Token::Illegal; // error already set
+ else
+ return Token::Whitespace;
+ }
+ else
+ return skipMultiLineComment();
+ }
+ else if (m_char == '=')
+ return selectToken(Token::AssignDiv);
+ else
+ return Token::Div;
+}
+
+void Scanner::scanToken()
+{
+ m_nextToken.error = ScannerError::NoError;
+ m_nextToken.literal.clear();
+ m_nextToken.extendedTokenInfo = make_tuple(0, 0);
+ m_nextSkippedComment.literal.clear();
+ m_nextSkippedComment.extendedTokenInfo = make_tuple(0, 0);
+
+ Token token;
+ // M and N are for the purposes of grabbing different type sizes
+ unsigned m;
+ unsigned n;
+ do
+ {
+ // Remember the position of the next token
+ m_nextToken.location.start = sourcePos();
+ switch (m_char)
+ {
+ case '"':
+ case '\'':
+ token = scanString();
+ break;
+ case '<':
+ // < <= << <<=
+ advance();
+ if (m_char == '=')
+ token = selectToken(Token::LessThanOrEqual);
+ else if (m_char == '<')
+ token = selectToken('=', Token::AssignShl, Token::SHL);
+ else
+ token = Token::LessThan;
+ break;
+ case '>':
+ // > >= >> >>= >>> >>>=
+ advance();
+ if (m_char == '=')
+ token = selectToken(Token::GreaterThanOrEqual);
+ else if (m_char == '>')
+ {
+ // >> >>= >>> >>>=
+ advance();
+ if (m_char == '=')
+ token = selectToken(Token::AssignSar);
+ else if (m_char == '>')
+ token = selectToken('=', Token::AssignShr, Token::SHR);
+ else
+ token = Token::SAR;
+ }
+ else
+ token = Token::GreaterThan;
+ break;
+ case '=':
+ // = == =>
+ advance();
+ if (m_char == '=')
+ token = selectToken(Token::Equal);
+ else if (m_char == '>')
+ token = selectToken(Token::Arrow);
+ else
+ token = Token::Assign;
+ break;
+ case '!':
+ // ! !=
+ advance();
+ if (m_char == '=')
+ token = selectToken(Token::NotEqual);
+ else
+ token = Token::Not;
+ break;
+ case '+':
+ // + ++ +=
+ advance();
+ if (m_char == '+')
+ token = selectToken(Token::Inc);
+ else if (m_char == '=')
+ token = selectToken(Token::AssignAdd);
+ else
+ token = Token::Add;
+ break;
+ case '-':
+ // - -- -=
+ advance();
+ if (m_char == '-')
+ token = selectToken(Token::Dec);
+ else if (m_char == '=')
+ token = selectToken(Token::AssignSub);
+ else
+ token = Token::Sub;
+ break;
+ case '*':
+ // * ** *=
+ advance();
+ if (m_char == '*')
+ token = selectToken(Token::Exp);
+ else if (m_char == '=')
+ token = selectToken(Token::AssignMul);
+ else
+ token = Token::Mul;
+ break;
+ case '%':
+ // % %=
+ token = selectToken('=', Token::AssignMod, Token::Mod);
+ break;
+ case '/':
+ // / // /* /=
+ token = scanSlash();
+ break;
+ case '&':
+ // & && &=
+ advance();
+ if (m_char == '&')
+ token = selectToken(Token::And);
+ else if (m_char == '=')
+ token = selectToken(Token::AssignBitAnd);
+ else
+ token = Token::BitAnd;
+ break;
+ case '|':
+ // | || |=
+ advance();
+ if (m_char == '|')
+ token = selectToken(Token::Or);
+ else if (m_char == '=')
+ token = selectToken(Token::AssignBitOr);
+ else
+ token = Token::BitOr;
+ break;
+ case '^':
+ // ^ ^=
+ token = selectToken('=', Token::AssignBitXor, Token::BitXor);
+ break;
+ case '.':
+ // . Number
+ advance();
+ if (isDecimalDigit(m_char))
+ token = scanNumber('.');
+ else
+ token = Token::Period;
+ break;
+ case ':':
+ token = selectToken(Token::Colon);
+ break;
+ case ';':
+ token = selectToken(Token::Semicolon);
+ break;
+ case ',':
+ token = selectToken(Token::Comma);
+ break;
+ case '(':
+ token = selectToken(Token::LParen);
+ break;
+ case ')':
+ token = selectToken(Token::RParen);
+ break;
+ case '[':
+ token = selectToken(Token::LBrack);
+ break;
+ case ']':
+ token = selectToken(Token::RBrack);
+ break;
+ case '{':
+ token = selectToken(Token::LBrace);
+ break;
+ case '}':
+ token = selectToken(Token::RBrace);
+ break;
+ case '?':
+ token = selectToken(Token::Conditional);
+ break;
+ case '~':
+ token = selectToken(Token::BitNot);
+ break;
+ default:
+ if (isIdentifierStart(m_char))
+ {
+ tie(token, m, n) = scanIdentifierOrKeyword();
+
+ // Special case for hexadecimal literals
+ if (token == Token::Hex)
+ {
+ // reset
+ m = 0;
+ n = 0;
+
+ // Special quoted hex string must follow
+ if (m_char == '"' || m_char == '\'')
+ token = scanHexString();
+ else
+ token = setError(ScannerError::IllegalToken);
+ }
+ }
+ else if (isDecimalDigit(m_char))
+ token = scanNumber();
+ else if (skipWhitespace())
+ token = Token::Whitespace;
+ else if (isSourcePastEndOfInput())
+ token = Token::EOS;
+ else
+ token = selectErrorToken(ScannerError::IllegalToken);
+ break;
+ }
+ // Continue scanning for tokens as long as we're just skipping
+ // whitespace.
+ }
+ while (token == Token::Whitespace);
+ m_nextToken.location.end = sourcePos();
+ m_nextToken.token = token;
+ m_nextToken.extendedTokenInfo = make_tuple(m, n);
+}
+
+bool Scanner::scanEscape()
+{
+ char c = m_char;
+ advance();
+ // Skip escaped newlines.
+ if (isLineTerminator(c))
+ return true;
+ switch (c)
+ {
+ case '\'': // fall through
+ case '"': // fall through
+ case '\\':
+ break;
+ case 'b':
+ c = '\b';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'v':
+ c = '\v';
+ break;
+ case 'u':
+ {
+ unsigned codepoint;
+ if (!scanUnicode(codepoint))
+ return false;
+ addUnicodeAsUTF8(codepoint);
+ return true;
+ }
+ case 'x':
+ if (!scanHexByte(c))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ addLiteralChar(c);
+ return true;
+}
+
+bool Scanner::isUnicodeLinebreak()
+{
+ if (0x0a <= m_char && m_char <= 0x0d)
+ // line feed, vertical tab, form feed, carriage return
+ return true;
+ else if (!m_source->isPastEndOfInput(1) && uint8_t(m_source->get(0)) == 0xc2 && uint8_t(m_source->get(1)) == 0x85)
+ // NEL - U+0085, C2 85 in utf8
+ return true;
+ else if (!m_source->isPastEndOfInput(2) && uint8_t(m_source->get(0)) == 0xe2 && uint8_t(m_source->get(1)) == 0x80 && (
+ uint8_t(m_source->get(2)) == 0xa8 || uint8_t(m_source->get(2)) == 0xa9
+ ))
+ // LS - U+2028, E2 80 A8 in utf8
+ // PS - U+2029, E2 80 A9 in utf8
+ return true;
+ else
+ return false;
+}
+
+Token Scanner::scanString()
+{
+ char const quote = m_char;
+ advance(); // consume quote
+ LiteralScope literal(this, LITERAL_TYPE_STRING);
+ while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
+ {
+ char c = m_char;
+ advance();
+ if (c == '\\')
+ {
+ if (isSourcePastEndOfInput() || !scanEscape())
+ return setError(ScannerError::IllegalEscapeSequence);
+ }
+ else
+ addLiteralChar(c);
+ }
+ if (m_char != quote)
+ return setError(ScannerError::IllegalStringEndQuote);
+ literal.complete();
+ advance(); // consume quote
+ return Token::StringLiteral;
+}
+
+Token Scanner::scanHexString()
+{
+ char const quote = m_char;
+ advance(); // consume quote
+ LiteralScope literal(this, LITERAL_TYPE_STRING);
+ while (m_char != quote && !isSourcePastEndOfInput())
+ {
+ char c = m_char;
+ if (!scanHexByte(c))
+ // can only return false if hex-byte is incomplete (only one hex digit instead of two)
+ return setError(ScannerError::IllegalHexString);
+ addLiteralChar(c);
+ }
+
+ if (m_char != quote)
+ return setError(ScannerError::IllegalStringEndQuote);
+
+ literal.complete();
+ advance(); // consume quote
+ return Token::StringLiteral;
+}
+
+// Parse for regex [:digit:]+(_[:digit:]+)*
+void Scanner::scanDecimalDigits()
+{
+ // MUST begin with a decimal digit.
+ if (!isDecimalDigit(m_char))
+ return;
+
+ // May continue with decimal digit or underscore for grouping.
+ do addLiteralCharAndAdvance();
+ while (!m_source->isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_'));
+
+ // Defer further validation of underscore to SyntaxChecker.
+}
+
+Token Scanner::scanNumber(char _charSeen)
+{
+ enum { DECIMAL, HEX, BINARY } kind = DECIMAL;
+ LiteralScope literal(this, LITERAL_TYPE_NUMBER);
+ if (_charSeen == '.')
+ {
+ // we have already seen a decimal point of the float
+ addLiteralChar('.');
+ if (m_char == '_')
+ return setError(ScannerError::IllegalToken);
+ scanDecimalDigits(); // we know we have at least one digit
+ }
+ else
+ {
+ solAssert(_charSeen == 0, "");
+ // if the first character is '0' we must check for octals and hex
+ if (m_char == '0')
+ {
+ addLiteralCharAndAdvance();
+ // either 0, 0exxx, 0Exxx, 0.xxx or a hex number
+ if (m_char == 'x')
+ {
+ // hex number
+ kind = HEX;
+ addLiteralCharAndAdvance();
+ if (!isHexDigit(m_char))
+ return setError(ScannerError::IllegalHexDigit); // we must have at least one hex digit after 'x'
+
+ while (isHexDigit(m_char) || m_char == '_') // We keep the underscores for later validation
+ addLiteralCharAndAdvance();
+ }
+ else if (isDecimalDigit(m_char))
+ // We do not allow octal numbers
+ return setError(ScannerError::OctalNotAllowed);
+ }
+ // Parse decimal digits and allow trailing fractional part.
+ if (kind == DECIMAL)
+ {
+ scanDecimalDigits(); // optional
+ if (m_char == '.')
+ {
+ if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_')
+ {
+ // Assume the input may be a floating point number with leading '_' in fraction part.
+ // Recover by consuming it all but returning `Illegal` right away.
+ addLiteralCharAndAdvance(); // '.'
+ addLiteralCharAndAdvance(); // '_'
+ scanDecimalDigits();
+ }
+ if (m_source->isPastEndOfInput() || !isDecimalDigit(m_source->get(1)))
+ {
+ // A '.' has to be followed by a number.
+ literal.complete();
+ return Token::Number;
+ }
+ addLiteralCharAndAdvance();
+ scanDecimalDigits();
+ }
+ }
+ }
+ // scan exponent, if any
+ if (m_char == 'e' || m_char == 'E')
+ {
+ solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number");
+ if (kind != DECIMAL)
+ return setError(ScannerError::IllegalExponent);
+ else if (!m_source->isPastEndOfInput(1) && m_source->get(1) == '_')
+ {
+ // Recover from wrongly placed underscore as delimiter in literal with scientific
+ // notation by consuming until the end.
+ addLiteralCharAndAdvance(); // 'e'
+ addLiteralCharAndAdvance(); // '_'
+ scanDecimalDigits();
+ literal.complete();
+ return Token::Number;
+ }
+ // scan exponent
+ addLiteralCharAndAdvance(); // 'e' | 'E'
+ if (m_char == '+' || m_char == '-')
+ addLiteralCharAndAdvance();
+ if (!isDecimalDigit(m_char)) // we must have at least one decimal digit after 'e'/'E'
+ return setError(ScannerError::IllegalExponent);
+ scanDecimalDigits();
+ }
+ // The source character immediately following a numeric literal must
+ // not be an identifier start or a decimal digit; see ECMA-262
+ // section 7.8.3, page 17 (note that we read only one decimal digit
+ // if the value is 0).
+ if (isDecimalDigit(m_char) || isIdentifierStart(m_char))
+ return setError(ScannerError::IllegalNumberEnd);
+ literal.complete();
+ return Token::Number;
+}
+
+tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
+{
+ solAssert(isIdentifierStart(m_char), "");
+ LiteralScope literal(this, LITERAL_TYPE_STRING);
+ addLiteralCharAndAdvance();
+ // Scan the rest of the identifier characters.
+ while (isIdentifierPart(m_char)) //get full literal
+ addLiteralCharAndAdvance();
+ literal.complete();
+ return TokenTraits::fromIdentifierOrKeyword(m_nextToken.literal);
+}
+
+
+}