BWAPI
|
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
00034 // 00035 // Class for parsing tokenized text from a ZeroCopyInputStream. 00036 00037 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 00038 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 00039 00040 #include <string> 00041 #include <google/protobuf/stubs/common.h> 00042 00043 namespace google { 00044 namespace protobuf { 00045 namespace io { 00046 00047 class ZeroCopyInputStream; // zero_copy_stream.h 00048 00049 // Defined in this file. 00050 class ErrorCollector; 00051 class Tokenizer; 00052 00053 // Abstract interface for an object which collects the errors that occur 00054 // during parsing. A typical implementation might simply print the errors 00055 // to stdout. 00056 class LIBPROTOBUF_EXPORT ErrorCollector { 00057 public: 00058 inline ErrorCollector() {} 00059 virtual ~ErrorCollector(); 00060 00061 // Indicates that there was an error in the input at the given line and 00062 // column numbers. The numbers are zero-based, so you may want to add 00063 // 1 to each before printing them. 00064 virtual void AddError(int line, int column, const string& message) = 0; 00065 00066 // Indicates that there was a warning in the input at the given line and 00067 // column numbers. The numbers are zero-based, so you may want to add 00068 // 1 to each before printing them. 00069 virtual void AddWarning(int line, int column, const string& message) { } 00070 00071 private: 00072 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector); 00073 }; 00074 00075 // This class converts a stream of raw text into a stream of tokens for 00076 // the protocol definition parser to parse. The tokens recognized are 00077 // similar to those that make up the C language; see the TokenType enum for 00078 // precise descriptions. Whitespace and comments are skipped. By default, 00079 // C- and C++-style comments are recognized, but other styles can be used by 00080 // calling set_comment_style(). 
00081 class LIBPROTOBUF_EXPORT Tokenizer { 00082 public: 00083 // Construct a Tokenizer that reads and tokenizes text from the given 00084 // input stream and writes errors to the given error_collector. 00085 // The caller keeps ownership of input and error_collector. 00086 Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector); 00087 ~Tokenizer(); 00088 00089 enum TokenType { 00090 TYPE_START, // Next() has not yet been called. 00091 TYPE_END, // End of input reached. "text" is empty. 00092 00093 TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not 00094 // starting with a digit. It is an error for a number 00095 // to be followed by an identifier with no space in 00096 // between. 00097 TYPE_INTEGER, // A sequence of digits representing an integer. Normally 00098 // the digits are decimal, but a prefix of "0x" indicates 00099 // a hex number and a leading zero indicates octal, just 00100 // like with C numeric literals. A leading negative sign 00101 // is NOT included in the token; it's up to the parser to 00102 // interpret the unary minus operator on its own. 00103 TYPE_FLOAT, // A floating point literal, with a fractional part and/or 00104 // an exponent. Always in decimal. Again, never 00105 // negative. 00106 TYPE_STRING, // A quoted sequence of escaped characters. Either single 00107 // or double quotes can be used, but they must match. 00108 // A string literal cannot cross a line break. 00109 TYPE_SYMBOL, // Any other printable character, like '!' or '+'. 00110 // Symbols are always a single character, so "!+$%" is 00111 // four tokens. 00112 }; 00113 00114 // Structure representing a token read from the token stream. 00115 struct Token { 00116 TokenType type; 00117 string text; // The exact text of the token as it appeared in 00118 // the input. e.g. tokens of TYPE_STRING will still 00119 // be escaped and in quotes. 
00120 00121 // "line" and "column" specify the position of the first character of 00122 // the token within the input stream. They are zero-based. 00123 int line; 00124 int column; 00125 }; 00126 00127 // Get the current token. This is updated when Next() is called. Before 00128 // the first call to Next(), current() has type TYPE_START and no contents. 00129 const Token& current(); 00130 00131 // Advance to the next token. Returns false if the end of the input is 00132 // reached. 00133 bool Next(); 00134 00135 // Parse helpers --------------------------------------------------- 00136 00137 // Parses a TYPE_FLOAT token. This never fails, so long as the text actually 00138 // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the 00139 // result is undefined (possibly an assert failure). 00140 static double ParseFloat(const string& text); 00141 00142 // Parses a TYPE_STRING token. This never fails, so long as the text actually 00143 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the 00144 // result is undefined (possibly an assert failure). 00145 static void ParseString(const string& text, string* output); 00146 00147 // Identical to ParseString, but appends to output. 00148 static void ParseStringAppend(const string& text, string* output); 00149 00150 // Parses a TYPE_INTEGER token. Returns false if the result would be 00151 // greater than max_value. Otherwise, returns true and sets *output to the 00152 // result. If the text is not from a Token of type TYPE_INTEGER originally 00153 // parsed by a Tokenizer, the result is undefined (possibly an assert 00154 // failure). 00155 static bool ParseInteger(const string& text, uint64 max_value, 00156 uint64* output); 00157 00158 // Options --------------------------------------------------------- 00159 00160 // Set true to allow floats to be suffixed with the letter 'f'. 
Tokens 00161 // which would otherwise be integers but which have the 'f' suffix will be 00162 // forced to be interpreted as floats. For all other purposes, the 'f' is 00163 // ignored. 00164 void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; } 00165 00166 // Valid values for set_comment_style(). 00167 enum CommentStyle { 00168 // Line comments begin with "//", block comments are delimited by "/*" and 00169 // "*/". 00170 CPP_COMMENT_STYLE, 00171 // Line comments begin with "#". No way to write block comments. 00172 SH_COMMENT_STYLE 00173 }; 00174 00175 // Sets the comment style. 00176 void set_comment_style(CommentStyle style) { comment_style_ = style; } 00177 00178 // ----------------------------------------------------------------- 00179 private: 00180 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer); 00181 00182 Token current_; // Returned by current(). 00183 00184 ZeroCopyInputStream* input_; 00185 ErrorCollector* error_collector_; 00186 00187 char current_char_; // == buffer_[buffer_pos_], updated by NextChar(). 00188 const char* buffer_; // Current buffer returned from input_. 00189 int buffer_size_; // Size of buffer_. 00190 int buffer_pos_; // Current position within the buffer. 00191 bool read_error_; // Did we previously encounter a read error? 00192 00193 // Line and column number of current_char_ within the whole input stream. 00194 int line_; 00195 int column_; 00196 00197 // Position in buffer_ where StartToken() was called. If the token 00198 // started in the previous buffer, this is zero, and current_.text already 00199 // contains the part of the token from the previous buffer. If not 00200 // currently parsing a token, this is -1. 00201 int token_start_; 00202 00203 // Options. 00204 bool allow_f_after_float_; 00205 CommentStyle comment_style_; 00206 00207 // Since we count columns we need to interpret tabs somehow. We'll take 00208 // the standard 8-character definition for lack of any way to do better. 
00209 static const int kTabWidth = 8; 00210 00211 // ----------------------------------------------------------------- 00212 // Helper methods. 00213 00214 // Consume this character and advance to the next one. 00215 void NextChar(); 00216 00217 // Read a new buffer from the input. 00218 void Refresh(); 00219 00220 // Called when the current character is the first character of a new 00221 // token (not including whitespace or comments). 00222 inline void StartToken(); 00223 // Called when the current character is the first character after the 00224 // end of the last token. After this returns, current_.text will 00225 // contain all text consumed since StartToken() was called. 00226 inline void EndToken(); 00227 00228 // Convenience method to add an error at the current line and column. 00229 void AddError(const string& message) { 00230 error_collector_->AddError(line_, column_, message); 00231 } 00232 00233 // ----------------------------------------------------------------- 00234 // The following four methods are used to consume tokens of specific 00235 // types. They are actually used to consume all characters *after* 00236 // the first, since the calling function consumes the first character 00237 // in order to decide what kind of token is being read. 00238 00239 // Read and consume a string, ending when the given delimiter is 00240 // consumed. 00241 void ConsumeString(char delimiter); 00242 00243 // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER 00244 // depending on what was read. This needs to know if the first 00245 // character was a zero in order to correctly recognize hex and octal 00246 // numbers. 00247 // It also needs to know if the first characted was a . to parse floating 00248 // point correctly. 00249 TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot); 00250 00251 // Consume the rest of a line. 00252 void ConsumeLineComment(); 00253 // Consume until "*/". 
00254 void ConsumeBlockComment(); 00255 00256 // ----------------------------------------------------------------- 00257 // These helper methods make the parsing code more readable. The 00258 // "character classes" refered to are defined at the top of the .cc file. 00259 // Basically it is a C++ class with one method: 00260 // static bool InClass(char c); 00261 // The method returns true if c is a member of this "class", like "Letter" 00262 // or "Digit". 00263 00264 // Returns true if the current character is of the given character 00265 // class, but does not consume anything. 00266 template<typename CharacterClass> 00267 inline bool LookingAt(); 00268 00269 // If the current character is in the given class, consume it and return 00270 // true. Otherwise return false. 00271 // e.g. TryConsumeOne<Letter>() 00272 template<typename CharacterClass> 00273 inline bool TryConsumeOne(); 00274 00275 // Like above, but try to consume the specific character indicated. 00276 inline bool TryConsume(char c); 00277 00278 // Consume zero or more of the given character class. 00279 template<typename CharacterClass> 00280 inline void ConsumeZeroOrMore(); 00281 00282 // Consume one or more of the given character class or log the given 00283 // error message. 00284 // e.g. ConsumeOneOrMore<Digit>("Expected digits."); 00285 template<typename CharacterClass> 00286 inline void ConsumeOneOrMore(const char* error); 00287 }; 00288 00289 // inline methods ==================================================== 00290 inline const Tokenizer::Token& Tokenizer::current() { 00291 return current_; 00292 } 00293 00294 inline void Tokenizer::ParseString(const string& text, string* output) { 00295 output->clear(); 00296 ParseStringAppend(text, output); 00297 } 00298 00299 } // namespace io 00300 } // namespace protobuf 00301 00302 } // namespace google 00303 #endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__