1 files changed, 0 insertions, 1139 deletions
diff --git a/src/google/protobuf/io/tokenizer.cc b/src/google/protobuf/io/tokenizer.cc
deleted file mode 100644
index b3550dfb..00000000
--- a/src/google/protobuf/io/tokenizer.cc
+++ /dev/null
@@ -1,1139 +0,0 @@
-// Protocol Buffers - Google's data interchange format
-// Copyright 2008 Google Inc.  All rights reserved.
-// https://developers.google.com/protocol-buffers/
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Author: kenton@google.com (Kenton Varda)
-//  Based on original Protocol Buffers design by
-//  Sanjay Ghemawat, Jeff Dean, and others.
-//
-// Here we have a hand-written lexer.  At first you might ask yourself,
-// "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
-// yes I am crazy, but that's beside the point.  There are actually reasons
-// why I ended up writing this this way.
-//
-// The traditional approach to lexing is to use lex to generate a lexer for
-// you.  Unfortunately, lex's output is ridiculously ugly and difficult to
-// integrate cleanly with C++ code, especially abstract code or code meant
-// as a library.  Better parser-generators exist but would add dependencies
-// which most users won't already have, which we'd like to avoid.  (GNU flex
-// has a C++ output option, but it's still ridiculously ugly, non-abstract,
-// and not library-friendly.)
-//
-// The next approach that any good software engineer should look at is to
-// use regular expressions.  And, indeed, I did.  I have code which
-// implements this same class using regular expressions.  It's about 200
-// lines shorter.  However:
-// - Rather than error messages telling you "This string has an invalid
-//   escape sequence at line 5, column 45", you get error messages like
-//   "Parse error on line 5".  Giving more precise errors requires adding
-//   a lot of code that ends up basically as complex as the hand-coded
-//   version anyway.
-// - The regular expression to match a string literal looks like this:
-//     kString  = new RE("(\"([^\"\\\\]|"              // non-escaped
-//                       "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
-//                       "\\\\x[0-9a-fA-F])*\"|"       // hex escape
-//                       "\'([^\'\\\\]|"        // Also support single-quotes.
-//                       "\\\\[abfnrtv?\"'\\\\0-7]|"
-//                       "\\\\x[0-9a-fA-F])*\')");
-//   Verifying the correctness of this line noise is actually harder than
-//   verifying the correctness of ConsumeString(), defined below.  I'm not
-//   even confident that the above is correct, after staring at it for some
-//   time.
-// - PCRE is fast, but there's still more overhead involved than the code
-//   below.
-// - Sadly, regular expressions are not part of the C standard library, so
-//   using them would require depending on some other library.  For the
-//   open source release, this could be really annoying.  Nobody likes
-//   downloading one piece of software just to find that they need to
-//   download something else to make it work, and in all likelihood
-//   people downloading Protocol Buffers will already be doing so just
-//   to make something else work.  We could include a copy of PCRE with
-//   our code, but that obligates us to keep it up-to-date and just seems
-//   like a big waste just to save 200 lines of code.
-//
-// On a similar but unrelated note, I'm even scared to use ctype.h.
-// Apparently functions like isalpha() are locale-dependent.  So, if we used
-// that, then if this code is being called from some program that doesn't
-// have its locale set to "C", it would behave strangely.  We can't just set
-// the locale to "C" ourselves since we might break the calling program that
-// way, particularly if it is multi-threaded.  WTF?  Someone please let me
-// (Kenton) know if I'm missing something here...
-//
-// I'd love to hear about other alternatives, though, as this code isn't
-// exactly pretty.
-
-#include <google/protobuf/io/tokenizer.h>
-#include <google/protobuf/stubs/common.h>
-#include <google/protobuf/stubs/logging.h>
-#include <google/protobuf/stubs/stringprintf.h>
-#include <google/protobuf/io/strtod.h>
-#include <google/protobuf/io/zero_copy_stream.h>
-#include <google/protobuf/stubs/strutil.h>
-#include <google/protobuf/stubs/stl_util.h>
-
-namespace google {
-namespace protobuf {
-namespace io {
-namespace {
-
-// As mentioned above, I don't trust ctype.h due to the presence of "locales".
-// So, I have written replacement functions here.  Someone please smack me if
-// this is a bad idea or if there is some way around this.
-//
-// These "character classes" are designed to be used in template methods.
-// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
-// whitespace.
-
-// Note:  No class is allowed to contain '\0', since this is used to mark end-
-//   of-input and is handled specially.
-
-#define CHARACTER_CLASS(NAME, EXPRESSION)      \
-  class NAME {                                 \
-   public:                                     \
-    static inline bool InClass(char c) {       \
-      return EXPRESSION;                       \
-    }                                          \
-  }
-
-CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
-                            c == '\r' || c == '\v' || c == '\f');
-CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
-                                     c == '\r' || c == '\v' || c == '\f');
-
-CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
-
-CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
-CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
-CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
-                          ('a' <= c && c <= 'f') ||
-                          ('A' <= c && c <= 'F'));
-
-CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
-                        ('A' <= c && c <= 'Z') ||
-                        (c == '_'));
-
-CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
-                              ('A' <= c && c <= 'Z') ||
-                              ('0' <= c && c <= '9') ||
-                              (c == '_'));
-
-CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
-                        c == 'r' || c == 't' || c == 'v' || c == '\\' ||
-                        c == '?' || c == '\'' || c == '\"');
-
-#undef CHARACTER_CLASS
-
-// Given a char, interpret it as a numeric digit and return its value.
-// This supports any number base up to 36.
-inline int DigitValue(char digit) {
-  if ('0' <= digit && digit <= '9') return digit - '0';
-  if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
-  if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
-  return -1;
-}
-
-// Inline because it's only used in one place.
-inline char TranslateEscape(char c) {
-  switch (c) {
-    case 'a':  return '\a';
-    case 'b':  return '\b';
-    case 'f':  return '\f';
-    case 'n':  return '\n';
-    case 'r':  return '\r';
-    case 't':  return '\t';
-    case 'v':  return '\v';
-    case '\\': return '\\';
-    case '?':  return '\?';    // Trigraphs = :(
-    case '\'': return '\'';
-    case '"':  return '\"';
-
-    // We expect escape sequences to have been validated separately.
-    default:   return '?';
-  }
-}
-
-}  // anonymous namespace
-
-ErrorCollector::~ErrorCollector() {}
-
-// ===================================================================
-
-Tokenizer::Tokenizer(ZeroCopyInputStream* input,
-                     ErrorCollector* error_collector)
-  : input_(input),
-    error_collector_(error_collector),
-    buffer_(NULL),
-    buffer_size_(0),
-    buffer_pos_(0),
-    read_error_(false),
-    line_(0),
-    column_(0),
-    record_target_(NULL),
-    record_start_(-1),
-    allow_f_after_float_(false),
-    comment_style_(CPP_COMMENT_STYLE),
-    require_space_after_number_(true),
-    allow_multiline_strings_(false) {
-
-  current_.line = 0;
-  current_.column = 0;
-  current_.end_column = 0;
-  current_.type = TYPE_START;
-
-  Refresh();
-}
-
-Tokenizer::~Tokenizer() {
-  // If we had any buffer left unread, return it to the underlying stream
-  // so that someone else can read it.
-  if (buffer_size_ > buffer_pos_) {
-    input_->BackUp(buffer_size_ - buffer_pos_);
-  }
-}
-
-// -------------------------------------------------------------------
-// Internal helpers.
-
-void Tokenizer::NextChar() {
-  // Update our line and column counters based on the character being
-  // consumed.
-  if (current_char_ == '\n') {
-    ++line_;
-    column_ = 0;
-  } else if (current_char_ == '\t') {
-    column_ += kTabWidth - column_ % kTabWidth;
-  } else {
-    ++column_;
-  }
-
-  // Advance to the next character.
-  ++buffer_pos_;
-  if (buffer_pos_ < buffer_size_) {
-    current_char_ = buffer_[buffer_pos_];
-  } else {
-    Refresh();
-  }
-}
-
-void Tokenizer::Refresh() {
-  if (read_error_) {
-    current_char_ = '\0';
-    return;
-  }
-
-  // If we're in a token, append the rest of the buffer to it.
-  if (record_target_ != NULL && record_start_ < buffer_size_) {
-    record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
-    record_start_ = 0;
-  }
-
-  const void* data = NULL;
-  buffer_ = NULL;
-  buffer_pos_ = 0;
-  do {
-    if (!input_->Next(&data, &buffer_size_)) {
-      // end of stream (or read error)
-      buffer_size_ = 0;
-      read_error_ = true;
-      current_char_ = '\0';
-      return;
-    }
-  } while (buffer_size_ == 0);
-
-  buffer_ = static_cast<const char*>(data);
-
-  current_char_ = buffer_[0];
-}
-
-inline void Tokenizer::RecordTo(string* target) {
-  record_target_ = target;
-  record_start_ = buffer_pos_;
-}
-
-inline void Tokenizer::StopRecording() {
-  // Note:  The if() is necessary because some STL implementations crash when
-  //   you call string::append(NULL, 0), presumably because they are trying to
-  //   be helpful by detecting the NULL pointer, even though there's nothing
-  //   wrong with reading zero bytes from NULL.
-  if (buffer_pos_ != record_start_) {
-    record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
-  }
-  record_target_ = NULL;
-  record_start_ = -1;
-}
-
-inline void Tokenizer::StartToken() {
-  current_.type = TYPE_START;    // Just for the sake of initializing it.
-  current_.text.clear();
-  current_.line = line_;
-  current_.column = column_;
-  RecordTo(&current_.text);
-}
-
-inline void Tokenizer::EndToken() {
-  StopRecording();
-  current_.end_column = column_;
-}
-
-// -------------------------------------------------------------------
-// Helper methods that consume characters.
-
-template<typename CharacterClass>
-inline bool Tokenizer::LookingAt() {
-  return CharacterClass::InClass(current_char_);
-}
-
-template<typename CharacterClass>
-inline bool Tokenizer::TryConsumeOne() {
-  if (CharacterClass::InClass(current_char_)) {
-    NextChar();
-    return true;
-  } else {
-    return false;
-  }
-}
-
-inline bool Tokenizer::TryConsume(char c) {
-  if (current_char_ == c) {
-    NextChar();
-    return true;
-  } else {
-    return false;
-  }
-}
-
-template<typename CharacterClass>
-inline void Tokenizer::ConsumeZeroOrMore() {
-  while (CharacterClass::InClass(current_char_)) {
-    NextChar();
-  }
-}
-
-template<typename CharacterClass>
-inline void Tokenizer::ConsumeOneOrMore(const char* error) {
-  if (!CharacterClass::InClass(current_char_)) {
-    AddError(error);
-  } else {
-    do {
-      NextChar();
-    } while (CharacterClass::InClass(current_char_));
-  }
-}
-
-// -------------------------------------------------------------------
-// Methods that read whole patterns matching certain kinds of tokens
-// or comments.
-
-void Tokenizer::ConsumeString(char delimiter) {
-  while (true) {
-    switch (current_char_) {
-      case '\0':
-        AddError("Unexpected end of string.");
-        return;
-
-      case '\n': {
-        if (!allow_multiline_strings_) {
-          AddError("String literals cannot cross line boundaries.");
-          return;
-        }
-        NextChar();
-        break;
-      }
-
-      case '\\': {
-        // An escape sequence.
-        NextChar();
-        if (TryConsumeOne<Escape>()) {
-          // Valid escape sequence.
-        } else if (TryConsumeOne<OctalDigit>()) {
-          // Possibly followed by two more octal digits, but these will
-          // just be consumed by the main loop anyway so we don't need
-          // to do so explicitly here.
-        } else if (TryConsume('x')) {
-          if (!TryConsumeOne<HexDigit>()) {
-            AddError("Expected hex digits for escape sequence.");
-          }
-          // Possibly followed by another hex digit, but again we don't care.
-        } else if (TryConsume('u')) {
-          if (!TryConsumeOne<HexDigit>() ||
-              !TryConsumeOne<HexDigit>() ||
-              !TryConsumeOne<HexDigit>() ||
-              !TryConsumeOne<HexDigit>()) {
-            AddError("Expected four hex digits for \\u escape sequence.");
-          }
-        } else if (TryConsume('U')) {
-          // We expect 8 hex digits; but only the range up to 0x10ffff is
-          // legal.
-          if (!TryConsume('0') ||
-              !TryConsume('0') ||
-              !(TryConsume('0') || TryConsume('1')) ||
-              !TryConsumeOne<HexDigit>() ||
-              !TryConsumeOne<HexDigit>() ||
-              !TryConsumeOne<HexDigit>() ||
-              !TryConsumeOne<HexDigit>() ||
-              !TryConsumeOne<HexDigit>()) {
-            AddError("Expected eight hex digits up to 10ffff for \\U escape "
-                     "sequence");
-          }
-        } else {
-          AddError("Invalid escape sequence in string literal.");
-        }
-        break;
-      }
-
-      default: {
-        if (current_char_ == delimiter) {
-          NextChar();
-          return;
-        }
-        NextChar();
-        break;
-      }
-    }
-  }
-}
-
-Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
-                                              bool started_with_dot) {
-  bool is_float = false;
-
-  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
-    // A hex number (started with "0x").
-    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");
-
-  } else if (started_with_zero && LookingAt<Digit>()) {
-    // An octal number (had a leading zero).
-    ConsumeZeroOrMore<OctalDigit>();
-    if (LookingAt<Digit>()) {
-      AddError("Numbers starting with leading zero must be in octal.");
-      ConsumeZeroOrMore<Digit>();
-    }
-
-  } else {
-    // A decimal number.
-    if (started_with_dot) {
-      is_float = true;
-      ConsumeZeroOrMore<Digit>();
-    } else {
-      ConsumeZeroOrMore<Digit>();
-
-      if (TryConsume('.')) {
-        is_float = true;
-        ConsumeZeroOrMore<Digit>();
-      }
-    }
-
-    if (TryConsume('e') || TryConsume('E')) {
-      is_float = true;
-      TryConsume('-') || TryConsume('+');
-      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
-    }
-
-    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
-      is_float = true;
-    }
-  }
-
-  if (LookingAt<Letter>() && require_space_after_number_) {
-    AddError("Need space between number and identifier.");
-  } else if (current_char_ == '.') {
-    if (is_float) {
-      AddError(
-        "Already saw decimal point or exponent; can't have another one.");
-    } else {
-      AddError("Hex and octal numbers must be integers.");
-    }
-  }
-
-  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
-}
-
-void Tokenizer::ConsumeLineComment(string* content) {
-  if (content != NULL) RecordTo(content);
-
-  while (current_char_ != '\0' && current_char_ != '\n') {
-    NextChar();
-  }
-  TryConsume('\n');
-
-  if (content != NULL) StopRecording();
-}
-
-void Tokenizer::ConsumeBlockComment(string* content) {
-  int start_line = line_;
-  int start_column = column_ - 2;
-
-  if (content != NULL) RecordTo(content);
-
-  while (true) {
-    while (current_char_ != '\0' &&
-           current_char_ != '*' &&
-           current_char_ != '/' &&
-           current_char_ != '\n') {
-      NextChar();
-    }
-
-    if (TryConsume('\n')) {
-      if (content != NULL) StopRecording();
-
-      // Consume leading whitespace and asterisk;
-      ConsumeZeroOrMore<WhitespaceNoNewline>();
-      if (TryConsume('*')) {
-        if (TryConsume('/')) {
-          // End of comment.
-          break;
-        }
-      }
-
-      if (content != NULL) RecordTo(content);
-    } else if (TryConsume('*') && TryConsume('/')) {
-      // End of comment.
-      if (content != NULL) {
-        StopRecording();
-        // Strip trailing "*/".
-        content->erase(content->size() - 2);
-      }
-      break;
-    } else if (TryConsume('/') && current_char_ == '*') {
-      // Note:  We didn't consume the '*' because if there is a '/' after it
-      //   we want to interpret that as the end of the comment.
-      AddError(
-        "\"/*\" inside block comment.  Block comments cannot be nested.");
-    } else if (current_char_ == '\0') {
-      AddError("End-of-file inside block comment.");
-      error_collector_->AddError(
-        start_line, start_column, "  Comment started here.");
-      if (content != NULL) StopRecording();
-      break;
-    }
-  }
-}
-
-Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
-  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
-    if (TryConsume('/')) {
-      return LINE_COMMENT;
-    } else if (TryConsume('*')) {
-      return BLOCK_COMMENT;
-    } else {
-      // Oops, it was just a slash.  Return it.
-      current_.type = TYPE_SYMBOL;
-      current_.text = "/";
-      current_.line = line_;
-      current_.column = column_ - 1;
-      current_.end_column = column_;
-      return SLASH_NOT_COMMENT;
-    }
-  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
-    return LINE_COMMENT;
-  } else {
-    return NO_COMMENT;
-  }
-}
-
-// -------------------------------------------------------------------
-
-bool Tokenizer::Next() {
-  previous_ = current_;
-
-  while (!read_error_) {
-    ConsumeZeroOrMore<Whitespace>();
-
-    switch (TryConsumeCommentStart()) {
-      case LINE_COMMENT:
-        ConsumeLineComment(NULL);
-        continue;
-      case BLOCK_COMMENT:
-        ConsumeBlockComment(NULL);
-        continue;
-      case SLASH_NOT_COMMENT:
-        return true;
-      case NO_COMMENT:
-        break;
-    }
-
-    // Check for EOF before continuing.
-    if (read_error_) break;
-
-    if (LookingAt<Unprintable>() || current_char_ == '\0') {
-      AddError("Invalid control characters encountered in text.");
-      NextChar();
-      // Skip more unprintable characters, too.  But, remember that '\0' is
-      // also what current_char_ is set to after EOF / read error.  We have
-      // to be careful not to go into an infinite loop of trying to consume
-      // it, so make sure to check read_error_ explicitly before consuming
-      // '\0'.
-      while (TryConsumeOne<Unprintable>() ||
-             (!read_error_ && TryConsume('\0'))) {
-        // Ignore.
-      }
-
-    } else {
-      // Reading some sort of token.
-      StartToken();
-
-      if (TryConsumeOne<Letter>()) {
-        ConsumeZeroOrMore<Alphanumeric>();
-        current_.type = TYPE_IDENTIFIER;
-      } else if (TryConsume('0')) {
-        current_.type = ConsumeNumber(true, false);
-      } else if (TryConsume('.')) {
-        // This could be the beginning of a floating-point number, or it could
-        // just be a '.' symbol.
-
-        if (TryConsumeOne<Digit>()) {
-          // It's a floating-point number.
-          if (previous_.type == TYPE_IDENTIFIER &&
-              current_.line == previous_.line &&
-              current_.column == previous_.end_column) {
-            // We don't accept syntax like "blah.123".
-            error_collector_->AddError(line_, column_ - 2,
-              "Need space between identifier and decimal point.");
-          }
-          current_.type = ConsumeNumber(false, true);
-        } else {
-          current_.type = TYPE_SYMBOL;
-        }
-      } else if (TryConsumeOne<Digit>()) {
-        current_.type = ConsumeNumber(false, false);
-      } else if (TryConsume('\"')) {
-        ConsumeString('\"');
-        current_.type = TYPE_STRING;
-      } else if (TryConsume('\'')) {
-        ConsumeString('\'');
-        current_.type = TYPE_STRING;
-      } else {
-        // Check if the high order bit is set.
-        if (current_char_ & 0x80) {
-          error_collector_->AddError(line_, column_,
-              StringPrintf("Interpreting non ascii codepoint %d.",
-                           static_cast<unsigned char>(current_char_)));
-        }
-        NextChar();
-        current_.type = TYPE_SYMBOL;
-      }
-
-      EndToken();
-      return true;
-    }
-  }
-
-  // EOF
-  current_.type = TYPE_END;
-  current_.text.clear();
-  current_.line = line_;
-  current_.column = column_;
-  current_.end_column = column_;
-  return false;
-}
-
-namespace {
-
-// Helper class for collecting comments and putting them in the right places.
-//
-// This basically just buffers the most recent comment until it can be decided
-// exactly where that comment should be placed.  When Flush() is called, the
-// current comment goes into either prev_trailing_comments or detached_comments.
-// When the CommentCollector is destroyed, the last buffered comment goes into
-// next_leading_comments.
-class CommentCollector {
- public:
-  CommentCollector(string* prev_trailing_comments,
-                   vector<string>* detached_comments,
-                   string* next_leading_comments)
-      : prev_trailing_comments_(prev_trailing_comments),
-        detached_comments_(detached_comments),
-        next_leading_comments_(next_leading_comments),
-        has_comment_(false),
-        is_line_comment_(false),
-        can_attach_to_prev_(true) {
-    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
-    if (detached_comments != NULL) detached_comments->clear();
-    if (next_leading_comments != NULL) next_leading_comments->clear();
-  }
-
-  ~CommentCollector() {
-    // Whatever is in the buffer is a leading comment.
-    if (next_leading_comments_ != NULL && has_comment_) {
-      comment_buffer_.swap(*next_leading_comments_);
-    }
-  }
-
-  // About to read a line comment.  Get the comment buffer pointer in order to
-  // read into it.
-  string* GetBufferForLineComment() {
-    // We want to combine with previous line comments, but not block comments.
-    if (has_comment_ && !is_line_comment_) {
-      Flush();
-    }
-    has_comment_ = true;
-    is_line_comment_ = true;
-    return &comment_buffer_;
-  }
-
-  // About to read a block comment.  Get the comment buffer pointer in order to
-  // read into it.
-  string* GetBufferForBlockComment() {
-    if (has_comment_) {
-      Flush();
-    }
-    has_comment_ = true;
-    is_line_comment_ = false;
-    return &comment_buffer_;
-  }
-
-  void ClearBuffer() {
-    comment_buffer_.clear();
-    has_comment_ = false;
-  }
-
-  // Called once we know that the comment buffer is complete and is *not*
-  // connected to the next token.
-  void Flush() {
-    if (has_comment_) {
-      if (can_attach_to_prev_) {
-        if (prev_trailing_comments_ != NULL) {
-          prev_trailing_comments_->append(comment_buffer_);
-        }
-        can_attach_to_prev_ = false;
-      } else {
-        if (detached_comments_ != NULL) {
-          detached_comments_->push_back(comment_buffer_);
-        }
-      }
-      ClearBuffer();
-    }
-  }
-
-  void DetachFromPrev() {
-    can_attach_to_prev_ = false;
-  }
-
- private:
-  string* prev_trailing_comments_;
-  vector<string>* detached_comments_;
-  string* next_leading_comments_;
-
-  string comment_buffer_;
-
-  // True if any comments were read into comment_buffer_.  This can be true even
-  // if comment_buffer_ is empty, namely if the comment was "/**/".
-  bool has_comment_;
-
-  // Is the comment in the comment buffer a line comment?
-  bool is_line_comment_;
-
-  // Is it still possible that we could be reading a comment attached to the
-  // previous token?
-  bool can_attach_to_prev_;
-};
-
-} // namespace
-
-bool Tokenizer::NextWithComments(string* prev_trailing_comments,
-                                 vector<string>* detached_comments,
-                                 string* next_leading_comments) {
-  CommentCollector collector(prev_trailing_comments, detached_comments,
-                             next_leading_comments);
-
-  if (current_.type == TYPE_START) {
-    // Ignore unicode byte order mark(BOM) if it appears at the file
-    // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
-    if (TryConsume((char)0xEF)) {
-      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
-        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
-                 "Only UTF-8 is accepted for proto file.");
-        return false;
-      }
-    }
-    collector.DetachFromPrev();
-  } else {
-    // A comment appearing on the same line must be attached to the previous
-    // declaration.
-    ConsumeZeroOrMore<WhitespaceNoNewline>();
-    switch (TryConsumeCommentStart()) {
-      case LINE_COMMENT:
-        ConsumeLineComment(collector.GetBufferForLineComment());
-
-        // Don't allow comments on subsequent lines to be attached to a trailing
-        // comment.
-        collector.Flush();
-        break;
-      case BLOCK_COMMENT:
-        ConsumeBlockComment(collector.GetBufferForBlockComment());
-
-        ConsumeZeroOrMore<WhitespaceNoNewline>();
-        if (!TryConsume('\n')) {
-          // Oops, the next token is on the same line.  If we recorded a comment
-          // we really have no idea which token it should be attached to.
-          collector.ClearBuffer();
-          return Next();
-        }
-
-        // Don't allow comments on subsequent lines to be attached to a trailing
-        // comment.
-        collector.Flush();
-        break;
-      case SLASH_NOT_COMMENT:
-        return true;
-      case NO_COMMENT:
-        if (!TryConsume('\n')) {
-          // The next token is on the same line.  There are no comments.
-          return Next();
-        }
-        break;
-    }
-  }
-
-  // OK, we are now on the line *after* the previous token.
-  while (true) {
-    ConsumeZeroOrMore<WhitespaceNoNewline>();
-
-    switch (TryConsumeCommentStart()) {
-      case LINE_COMMENT:
-        ConsumeLineComment(collector.GetBufferForLineComment());
-        break;
-      case BLOCK_COMMENT:
-        ConsumeBlockComment(collector.GetBufferForBlockComment());
-
-        // Consume the rest of the line so that we don't interpret it as a
-        // blank line the next time around the loop.
-        ConsumeZeroOrMore<WhitespaceNoNewline>();
-        TryConsume('\n');
-        break;
-      case SLASH_NOT_COMMENT:
-        return true;
-      case NO_COMMENT:
-        if (TryConsume('\n')) {
-          // Completely blank line.
-          collector.Flush();
-          collector.DetachFromPrev();
-        } else {
-          bool result = Next();
-          if (!result ||
-              current_.text == "}" ||
-              current_.text == "]" ||
-              current_.text == ")") {
-            // It looks like we're at the end of a scope.  In this case it
-            // makes no sense to attach a comment to the following token.
-            collector.Flush();
-          }
-          return result;
-        }
-        break;
-    }
-  }
-}
-
-// -------------------------------------------------------------------
-// Token-parsing helpers.  Remember that these don't need to report
-// errors since any errors should already have been reported while
-// tokenizing.  Also, these can assume that whatever text they
-// are given is text that the tokenizer actually parsed as a token
-// of the given type.
-
-bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
-                             uint64* output) {
-  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
-  // is non-standard.  I hate the C standard library.  :(
-
-//  return strtoull(text.c_str(), NULL, 0);
-
-  const char* ptr = text.c_str();
-  int base = 10;
-  if (ptr[0] == '0') {
-    if (ptr[1] == 'x' || ptr[1] == 'X') {
-      // This is hex.
-      base = 16;
-      ptr += 2;
-    } else {
-      // This is octal.
-      base = 8;
-    }
-  }
-
-  uint64 result = 0;
-  for (; *ptr != '\0'; ptr++) {
-    int digit = DigitValue(*ptr);
-    if (digit < 0 || digit >= base) {
-      // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
-      // token, but Tokenizer still think it's integer.
-      return false;
-    }
-    if (digit > max_value || result > (max_value - digit) / base) {
-      // Overflow.
-      return false;
-    }
-    result = result * base + digit;
-  }
-
-  *output = result;
-  return true;
-}
-
-double Tokenizer::ParseFloat(const string& text) {
-  const char* start = text.c_str();
-  char* end;
-  double result = NoLocaleStrtod(start, &end);
-
-  // "1e" is not a valid float, but if the tokenizer reads it, it will
-  // report an error but still return it as a valid token.  We need to
-  // accept anything the tokenizer could possibly return, error or not.
-  if (*end == 'e' || *end == 'E') {
-    ++end;
-    if (*end == '-' || *end == '+') ++end;
-  }
-
-  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
-  // suffixed with the letter 'f'.
-  if (*end == 'f' || *end == 'F') {
-    ++end;
-  }
-
-  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
-    << " Tokenizer::ParseFloat() passed text that could not have been"
-       " tokenized as a float: " << CEscape(text);
-  return result;
-}
-
-// Appends the UTF-8 encoding of |code_point| to |*output|.
-//
-// Code points up to 0x1fffff are written as one to four bytes.  Anything
-// larger does not fit the four-byte form and is appended as the literal text
-// "\U" followed by the value as eight hex digits.
-static void AppendUTF8(uint32 code_point, string* output) {
-  // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
-  // normally only defined up to there as well.
-  if (code_point > 0x1fffff) {
-    StringAppendF(output, "\\U%08x", code_point);
-    return;
-  }
-
-  // Build the sequence lead byte first.  The lead byte carries the length
-  // marker plus the topmost payload bits; every following byte carries the
-  // 10xxxxxx continuation marker plus the next six payload bits.
-  char encoded[4];
-  int count = 0;
-  if (code_point <= 0x7f) {
-    encoded[count++] = static_cast<char>(code_point);
-  } else if (code_point <= 0x07ff) {
-    encoded[count++] = static_cast<char>(0xc0 | (code_point >> 6));
-    encoded[count++] = static_cast<char>(0x80 | (code_point & 0x3f));
-  } else if (code_point <= 0xffff) {
-    encoded[count++] = static_cast<char>(0xe0 | (code_point >> 12));
-    encoded[count++] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f));
-    encoded[count++] = static_cast<char>(0x80 | (code_point & 0x3f));
-  } else {
-    encoded[count++] = static_cast<char>(0xf0 | (code_point >> 18));
-    encoded[count++] = static_cast<char>(0x80 | ((code_point >> 12) & 0x3f));
-    encoded[count++] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f));
-    encoded[count++] = static_cast<char>(0x80 | (code_point & 0x3f));
-  }
-  output->append(encoded, count);
-}
-
-// Try to read <len> hex digits from ptr, and stuff the numeric result into
-// *result. Returns true only if that many hex digits were successfully
-// consumed. Returns false if <len> is zero, or if the end of the string or a
-// character that is not a hex digit is reached first; in that case *result
-// holds a partial value and should not be used.
-static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
-  *result = 0;
-  if (len == 0) return false;
-  for (const char* end = ptr + len; ptr < end; ++ptr) {
-    if (*ptr == '\0') return false;
-    // DigitValue() does not vouch for the character being a hex digit: the
-    // integer parser in this file has to reject both negative results and
-    // results that are too large for its base.  Apply the same guard here so
-    // that a stray character fails the read instead of corrupting *result.
-    const int digit = DigitValue(*ptr);
-    if (digit < 0 || digit >= 16) return false;
-    *result = (*result << 4) + digit;
-  }
-  return true;
-}
-
-// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
-// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
-// surrogate. These numbers are in a reserved range of Unicode code points, so
-// if we encounter such a pair we know how to parse it and convert it into a
-// single code point.
-//
-// Both ranges are half-open: each kMin* constant is the first value of its
-// range and each kMax* constant is one past the last, so head surrogates are
-// 0xd800...0xdbff and trail surrogates are 0xdc00...0xdfff.
-static const uint32 kMinHeadSurrogate = 0xd800;
-static const uint32 kMaxHeadSurrogate = 0xdc00;
-static const uint32 kMinTrailSurrogate = 0xdc00;
-static const uint32 kMaxTrailSurrogate = 0xe000;
-
-// Returns true if |code_point| is at least kMinHeadSurrogate and strictly
-// below kMaxHeadSurrogate.
-static inline bool IsHeadSurrogate(uint32 code_point) {
-  if (code_point < kMinHeadSurrogate) return false;
-  return code_point < kMaxHeadSurrogate;
-}
-
-// Returns true if |code_point| is at least kMinTrailSurrogate and strictly
-// below kMaxTrailSurrogate.
-static inline bool IsTrailSurrogate(uint32 code_point) {
-  if (code_point < kMinTrailSurrogate) return false;
-  return code_point < kMaxTrailSurrogate;
-}
-
-// Combine a head and trail surrogate into a single Unicode code point.
-// The offset of the head surrogate within its range supplies the bits above
-// the low ten, the offset of the trail surrogate supplies the low ten bits,
-// and the result is based at 0x10000. Both arguments are DCHECKed to be
-// surrogates of the right kind.
-static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
-  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
-  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
-  const uint32 high_bits = (head_surrogate - kMinHeadSurrogate) << 10;
-  const uint32 low_bits = trail_surrogate - kMinTrailSurrogate;
-  return 0x10000 + (high_bits | low_bits);
-}
-
-// Maps the letter that introduces a Unicode escape to the number of hex
-// digits expected after it: 4 for 'u', 8 for 'U', and 0 for any other
-// character.
-static inline int UnicodeLength(char key) {
-  switch (key) {
-    case 'u':
-      return 4;
-    case 'U':
-      return 8;
-    default:
-      return 0;
-  }
-}
-
-// Parses the Unicode escape whose introducing 'u' or 'U' is at |ptr|.
-// On success, stores the decoded value in *code_point and returns a pointer
-// to the first char beyond the escape. On failure, returns |ptr| itself.
-//
-// A head surrogate that is immediately followed by a "\u" escape holding a
-// trail surrogate is merged with it into one code point, and both escapes
-// are consumed.
-static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
-  // Read the digits that follow the introducing letter.
-  const int digit_count = UnicodeLength(*ptr);
-  const char* cursor = ptr + 1;
-  if (!ReadHexDigits(cursor, digit_count, code_point)) {
-    return ptr;
-  }
-  cursor += digit_count;
-
-  // Only a head surrogate directly followed by "\u" can start a UTF-16 pair.
-  // Trail surrogates may only use \u, not \U.
-  const bool trail_escape_follows =
-      IsHeadSurrogate(*code_point) && cursor[0] == '\\' && cursor[1] == 'u';
-  if (!trail_escape_follows) {
-    return cursor;
-  }
-
-  uint32 trail = 0;
-  if (ReadHexDigits(cursor + 2, 4, &trail) && IsTrailSurrogate(trail)) {
-    *code_point = AssembleUTF16(*code_point, trail);
-    cursor += 6;  // Skip the backslash, the 'u' and the four digits.
-  }
-  // Otherwise the lone head surrogate is emitted as a code point.  It's
-  // bogus, but so is the string.
-  return cursor;
-}
-
-// Appends the unescaped contents of the string token |text| to |*output|.
-// The text string must begin and end with single or double quote characters:
-// the first character is skipped as the opening quote, and a final character
-// that matches it is dropped as the closing quote.
-void Tokenizer::ParseStringAppend(const string& text, string* output) {
-  // An empty token has no opening quote, so it is invalid: report it and
-  // leave |*output| untouched.
-  if (text.empty()) {
-    GOOGLE_LOG(DFATAL)
-      << " Tokenizer::ParseStringAppend() passed text that could not"
-         " have been tokenized as a string: " << CEscape(text);
-    return;
-  }
-
-  // Make room for the new characters up front.  reserve() is only called
-  // when it would grow the buffer, because a call asking for less than the
-  // current capacity might downsize the output.
-  const size_t needed = output->size() + text.size();
-  if (output->capacity() < needed) {
-    output->reserve(needed);
-  }
-
-  // Walk the characters after the opening quote, copying plain characters
-  // and decoding escape sequences.  Invalid escape sequences and other
-  // errors were already reported while tokenizing, so for those inputs the
-  // result does not need to be meaningful.
-  const char open_quote = text[0];
-  for (const char* cur = text.c_str() + 1; *cur != '\0'; ++cur) {
-    // A backslash only starts an escape if something follows it.
-    const bool is_escape = (*cur == '\\') && (cur[1] != '\0');
-    if (!is_escape) {
-      // Copy everything except the final quote matching the starting quote.
-      const bool is_closing_quote = (*cur == open_quote) && (cur[1] == '\0');
-      if (!is_closing_quote) {
-        output->push_back(*cur);
-      }
-      continue;
-    }
-
-    // Step onto the character that selects the kind of escape.
-    ++cur;
-    const char kind = *cur;
-
-    if (OctalDigit::InClass(kind)) {
-      // An octal escape.  May be one, two, or three digits.
-      int value = DigitValue(kind);
-      for (int extra = 0; extra < 2 && OctalDigit::InClass(cur[1]); ++extra) {
-        ++cur;
-        value = value * 8 + DigitValue(*cur);
-      }
-      output->push_back(static_cast<char>(value));
-
-    } else if (kind == 'x') {
-      // A hex escape.  May be zero, one, or two digits.  (The zero case
-      // will have been caught as an error earlier.)
-      int value = 0;
-      for (int seen = 0; seen < 2 && HexDigit::InClass(cur[1]); ++seen) {
-        ++cur;
-        value = value * 16 + DigitValue(*cur);
-      }
-      output->push_back(static_cast<char>(value));
-
-    } else if (kind == 'u' || kind == 'U') {
-      uint32 code_point;
-      const char* next = FetchUnicodePoint(cur, &code_point);
-      if (next != cur) {
-        AppendUTF8(code_point, output);
-        cur = next - 1;  // The loop increment then steps onto |next|.
-      } else {
-        // Failure: Just dump out what we saw, don't try to parse it.
-        output->push_back(*cur);
-      }
-
-    } else {
-      // Some other escape code.
-      output->push_back(TranslateEscape(kind));
-    }
-  }
-}
-
-// Returns true if CharacterClass::InClass() accepts every character of |s|.
-// An empty string contains no rejected character, so it yields true.
-template<typename CharacterClass>
-static bool AllInClass(const string& s) {
-  // Index with the string's own unsigned size type.  A signed int counter
-  // would be compared against the unsigned s.size() and could not reach
-  // past INT_MAX characters.
-  for (string::size_type i = 0; i < s.size(); ++i) {
-    if (!CharacterClass::InClass(s[i]))
-      return false;
-  }
-  return true;
-}
-
-// Returns true if |text| is non-empty, its first character is in the Letter
-// class, and every remaining character is in the Alphanumeric class.
-bool Tokenizer::IsIdentifier(const string& text) {
-  // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
-  if (text.empty()) {
-    return false;
-  }
-  return Letter::InClass(text.at(0)) &&
-      AllInClass<Alphanumeric>(text.substr(1));
-}
-
-}  // namespace io
-}  // namespace protobuf
-}  // namespace google