BWAPI
Undermind/proxy/cpp/include/google/protobuf/io/tokenizer.h
Go to the documentation of this file.
00001 // Protocol Buffers - Google's data interchange format
00002 // Copyright 2008 Google Inc.  All rights reserved.
00003 // http://code.google.com/p/protobuf/
00004 //
00005 // Redistribution and use in source and binary forms, with or without
00006 // modification, are permitted provided that the following conditions are
00007 // met:
00008 //
00009 //     * Redistributions of source code must retain the above copyright
00010 // notice, this list of conditions and the following disclaimer.
00011 //     * Redistributions in binary form must reproduce the above
00012 // copyright notice, this list of conditions and the following disclaimer
00013 // in the documentation and/or other materials provided with the
00014 // distribution.
00015 //     * Neither the name of Google Inc. nor the names of its
00016 // contributors may be used to endorse or promote products derived from
00017 // this software without specific prior written permission.
00018 //
00019 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00020 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00021 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00022 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00023 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00024 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00025 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00026 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00027 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00028 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00029 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 
00031 // Author: kenton@google.com (Kenton Varda)
00032 //  Based on original Protocol Buffers design by
00033 //  Sanjay Ghemawat, Jeff Dean, and others.
00034 //
00035 // Class for parsing tokenized text from a ZeroCopyInputStream.
00036 
00037 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
00038 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
00039 
00040 #include <string>
00041 #include <google/protobuf/stubs/common.h>
00042 
00043 namespace google {
00044 namespace protobuf {
00045 namespace io {
00046 
00047 class ZeroCopyInputStream;     // zero_copy_stream.h
00048 
00049 // Defined in this file.
00050 class ErrorCollector;
00051 class Tokenizer;
00052 
00053 // Abstract interface for an object which collects the errors that occur
00054 // during parsing.  A typical implementation might simply print the errors
00055 // to stdout.  Subclasses must implement AddError(); AddWarning() is optional.
00056 class LIBPROTOBUF_EXPORT ErrorCollector {
00057  public:
00058   inline ErrorCollector() {}
00059   virtual ~ErrorCollector();
00060 
00061   // Indicates that there was an error in the input at the given line and
00062   // column numbers.  The numbers are zero-based, so you may want to add
00063   // 1 to each before printing them.
00064   virtual void AddError(int line, int column, const string& message) = 0;
00065 
00066   // Indicates that there was a warning in the input at the given line and
00067   // column numbers (zero-based, so you may want to add 1 to each before
00068   // printing them).  Unlike AddError(), the default implementation does nothing.
00069   virtual void AddWarning(int line, int column, const string& message) { }
00070 
00071  private:
00072   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
00073 };
00074 
00075 // This class converts a stream of raw text into a stream of tokens for
00076 // the protocol definition parser to parse.  The tokens recognized are
00077 // similar to those that make up the C language; see the TokenType enum for
00078 // precise descriptions.  Whitespace and comments are skipped.  By default,
00079 // C- and C++-style comments are recognized, but other styles can be used by
00080 // calling set_comment_style().
00081 class LIBPROTOBUF_EXPORT Tokenizer {
00082  public:
00083   // Construct a Tokenizer that reads and tokenizes text from the given
00084   // input stream and writes errors to the given error_collector.
00085   // The caller keeps ownership of input and error_collector.
00086   Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
00087   ~Tokenizer();
00088 
00089   enum TokenType {
00090     TYPE_START,       // Next() has not yet been called.
00091     TYPE_END,         // End of input reached.  "text" is empty.
00092 
00093     TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
00094                       // starting with a digit.  It is an error for a number
00095                       // to be followed by an identifier with no space in
00096                       // between.
00097     TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
00098                       // the digits are decimal, but a prefix of "0x" indicates
00099                       // a hex number and a leading zero indicates octal, just
00100                       // like with C numeric literals.  A leading negative sign
00101                       // is NOT included in the token; it's up to the parser to
00102                       // interpret the unary minus operator on its own.
00103     TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
00104                       // an exponent.  Always in decimal.  Again, never
00105                       // negative.
00106     TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
00107                       // or double quotes can be used, but they must match.
00108                       // A string literal cannot cross a line break.
00109     TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
00110                       // Symbols are always a single character, so "!+$%" is
00111                       // four tokens.
00112   };
00113 
00114   // Structure representing a token read from the token stream.
00115   struct Token {
00116     TokenType type;
00117     string text;       // The exact text of the token as it appeared in
00118                        // the input.  e.g. tokens of TYPE_STRING will still
00119                        // be escaped and in quotes.
00120 
00121     // "line" and "column" specify the position of the first character of
00122     // the token within the input stream.  They are zero-based.
00123     int line;
00124     int column;
00125   };
00126 
00127   // Get the current token.  This is updated when Next() is called.  Before
00128   // the first call to Next(), current() has type TYPE_START and no contents.
00129   const Token& current();
00130 
00131   // Advance to the next token.  Returns false if the end of the input is
00132   // reached.
00133   bool Next();
00134 
00135   // Parse helpers ---------------------------------------------------
00136 
00137   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
00138   // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
00139   // result is undefined (possibly an assert failure).
00140   static double ParseFloat(const string& text);
00141 
00142   // Parses a TYPE_STRING token.  This never fails, so long as the text actually
00143   // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
00144   // result is undefined (possibly an assert failure).
00145   static void ParseString(const string& text, string* output);
00146 
00147   // Identical to ParseString, but appends to output.
00148   static void ParseStringAppend(const string& text, string* output);
00149 
00150   // Parses a TYPE_INTEGER token.  Returns false if the result would be
00151   // greater than max_value.  Otherwise, returns true and sets *output to the
00152   // result.  If the text is not from a Token of type TYPE_INTEGER originally
00153   // parsed by a Tokenizer, the result is undefined (possibly an assert
00154   // failure).
00155   static bool ParseInteger(const string& text, uint64 max_value,
00156                            uint64* output);
00157 
00158   // Options ---------------------------------------------------------
00159 
00160   // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
00161   // which would otherwise be integers but which have the 'f' suffix will be
00162   // forced to be interpreted as floats.  For all other purposes, the 'f' is
00163   // ignored.
00164   void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
00165 
00166   // Valid values for set_comment_style().
00167   enum CommentStyle {
00168     // Line comments begin with "//", block comments are delimited by "/*" and
00169     // "*/".
00170     CPP_COMMENT_STYLE,
00171     // Line comments begin with "#".  No way to write block comments.
00172     SH_COMMENT_STYLE
00173   };
00174 
00175   // Sets the comment style; see CommentStyle above for the valid values.
00176   void set_comment_style(CommentStyle style) { comment_style_ = style; }
00177 
00178   // -----------------------------------------------------------------
00179  private:
00180   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
00181 
00182   Token current_;           // Returned by current().
00183 
00184   ZeroCopyInputStream* input_;        // Supplies buffer_ (see below).
00185   ErrorCollector* error_collector_;   // Target of AddError() below.
00186 
00187   char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
00188   const char* buffer_;      // Current buffer returned from input_.
00189   int buffer_size_;         // Size of buffer_.
00190   int buffer_pos_;          // Current position within the buffer.
00191   bool read_error_;         // Did we previously encounter a read error?
00192 
00193   // Line and column number of current_char_ within the whole input stream.
00194   int line_;
00195   int column_;
00196 
00197   // Position in buffer_ where StartToken() was called.  If the token
00198   // started in the previous buffer, this is zero, and current_.text already
00199   // contains the part of the token from the previous buffer.  If not
00200   // currently parsing a token, this is -1.
00201   int token_start_;
00202 
00203   // Options.
00204   bool allow_f_after_float_;
00205   CommentStyle comment_style_;
00206 
00207   // Since we count columns we need to interpret tabs somehow.  We'll take
00208   // the standard 8-character definition for lack of any way to do better.
00209   static const int kTabWidth = 8;
00210 
00211   // -----------------------------------------------------------------
00212   // Helper methods.
00213 
00214   // Consume this character and advance to the next one.
00215   void NextChar();
00216 
00217   // Read a new buffer from the input.
00218   void Refresh();
00219 
00220   // Called when the current character is the first character of a new
00221   // token (not including whitespace or comments).
00222   inline void StartToken();
00223   // Called when the current character is the first character after the
00224   // end of the last token.  After this returns, current_.text will
00225   // contain all text consumed since StartToken() was called.
00226   inline void EndToken();
00227 
00228   // Convenience method to report an error at the current line_ and column_.
00229   void AddError(const string& message) {
00230     error_collector_->AddError(line_, column_, message);
00231   }
00232 
00233   // -----------------------------------------------------------------
00234   // The following four methods are used to consume tokens of specific
00235   // types.  They are actually used to consume all characters *after*
00236   // the first, since the calling function consumes the first character
00237   // in order to decide what kind of token is being read.
00238 
00239   // Read and consume a string, ending when the given delimiter is
00240   // consumed.
00241   void ConsumeString(char delimiter);
00242 
00243   // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
00244   // depending on what was read.  This needs to know if the first
00245   // character was a zero in order to correctly recognize hex and octal
00246   // numbers.
00247   // It also needs to know if the first character was a '.' in order to
00248   // parse floating point numbers correctly.
00249   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
00250 
00251   // Consume the rest of a line.
00252   void ConsumeLineComment();
00253   // Consume until "*/".
00254   void ConsumeBlockComment();
00255 
00256   // -----------------------------------------------------------------
00257   // These helper methods make the parsing code more readable.  The
00258   // "character classes" referred to are defined at the top of the .cc file.
00259   // Basically it is a C++ class with one method:
00260   //   static bool InClass(char c);
00261   // The method returns true if c is a member of this "class", like "Letter"
00262   // or "Digit".
00263 
00264   // Returns true if the current character is of the given character
00265   // class, but does not consume anything.
00266   template<typename CharacterClass>
00267   inline bool LookingAt();
00268 
00269   // If the current character is in the given class, consume it and return
00270   // true.  Otherwise return false.
00271   // e.g. TryConsumeOne<Letter>()
00272   template<typename CharacterClass>
00273   inline bool TryConsumeOne();
00274 
00275   // Like above, but try to consume the specific character indicated.
00276   inline bool TryConsume(char c);
00277 
00278   // Consume zero or more of the given character class.
00279   template<typename CharacterClass>
00280   inline void ConsumeZeroOrMore();
00281 
00282   // Consume one or more of the given character class or log the given
00283   // error message.
00284   // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
00285   template<typename CharacterClass>
00286   inline void ConsumeOneOrMore(const char* error);
00287 };
00288 
00289 // inline methods ====================================================
00290 inline const Tokenizer::Token& Tokenizer::current() {
00291   return current_;  // Token updated by Next(); TYPE_START before the first call.
00292 }
00293 
00294 inline void Tokenizer::ParseString(const string& text, string* output) {
00295   output->clear();  // Drop old contents first: ParseStringAppend() only appends.
00296   ParseStringAppend(text, output);
00297 }
00298 
00299 }  // namespace io
00300 }  // namespace protobuf
00301 
00302 }  // namespace google
00303 #endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines