textfile.cc

Go to the documentation of this file.
00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00043 #include "fastlib/file/textfile.h"
00044 //#include "textfile.h"
00045 
00046 #include <ctype.h>
00047 
00048 /*
00049 char *TextTokenizer::ReadLine() {
00050   char *buf = NULL;
00051   size_t size = 0;
00052   size_t len = 0;
00053   const size_t extra = 64;
00054   int c;
00055   
00056   for (;;) {
00057     c = getc(f_);
00058     
00059     if (unlikely(c == '\r')) {
00060       c = getc(f_);
00061       if (c != '\n') {
00062         ungetc(c, f_);
00063       }
00064       break;
00065     } else if (unlikely(c == '\n')) {
00066       break;
00067     } else if (unlikely(c == EOF)) {
00068       if (len == 0) {
00069         return NULL;
00070       } else {
00071         break;
00072       }
00073     }
00074     
00075     len++;
00076     
00077     if (size <= len) {
00078       size = len * 2 + extra;
00079       buf = mem::Realloc(buf, size);
00080     }
00081     
00082     buf[len-1] = c;
00083   }
00084   
00085   if (len == 0) {
00086     // special case: empty line
00087     buf = mem::Alloc<char>(1);
00088   }
00089   
00090   buf[len] = '\0';
00091   
00092   return buf;
00093 }
00094 */
00095 
00096 void TextLineReader::Error(const char *format, ...) {
00097   va_list vl;
00098   
00099   // TODO: Use a warning propagation system
00100   fprintf(stderr, ".| %d: %s\nX|  `-> ", line_num_, line_.c_str());
00101   
00102   va_start(vl, format);
00103   vfprintf(stderr, format, vl);
00104   va_end(vl);
00105   
00106   fprintf(stderr, "\n");
00107 }
00108 
00109 success_t TextLineReader::Open(const char *fname) {
00110   f_ = fopen(fname, "r");
00111   line_num_ = 0;
00112   has_line_ = false;
00113   line_.Init();
00114   
00115   if (unlikely(f_ == NULL)) {
00116     return SUCCESS_FAIL;
00117   } else {
00118     Gobble();
00119     return SUCCESS_PASS;
00120   }
00121 }  
00122 
00123 bool TextLineReader::Gobble() {
00124   char *ptr = ReadLine_();
00125   
00126   line_.Destruct();
00127   
00128   if (likely(ptr != NULL)) {
00129     line_.Steal(ptr);
00130     has_line_ = true;
00131     line_num_++;
00132     return true;
00133   } else {
00134     line_.Init();
00135     has_line_ = false;
00136     return false;
00137   }
00138 }
00139  
00140 char *TextLineReader::ReadLine_() {
00141   char *buf = NULL;
00142   size_t size = 1;
00143   size_t len = 0;
00144 #ifdef DEBUG
00145   const size_t extra = 10;
00146 #else
00147   const size_t extra = 80;
00148 #endif
00149   
00150   for (;;) {
00151     size = size * 2 + extra;
00152     buf = mem::Realloc(buf, size);
00154     char *result = ::fgets(buf + len, size - len, f_); 
00155     if (len == 0 && result == NULL) {
00156       mem::Free(buf);
00157       return NULL;
00158     }
00159     len += strlen(buf + len);
00160     
00161     if (len < size - 1 || buf[len - 1] == '\r' || buf[len - 1] == '\n') {
00162       while (len && (buf[len-1] == '\r' || buf[len-1] == '\n')) {
00163         len--;
00164       }
00165       buf[len] = '\0';
00166       return buf;
00167     }
00168   }
00169 }
00170 
00171 success_t TextTokenizer::Open(const char *fname,
00172     const char *comment_chars_in, const char *ident_extra_in,
00173     int features_in) {
00174   next_.Copy("");
00175   cur_.Copy("");
00176   next_type_ = END;
00177   cur_type_ = END;
00178   comment_start_ = comment_chars_in;
00179   features_ = features_in;
00180   ident_extra_ = ident_extra_in;
00181   line_ = 1;
00182   
00183   f_ = fopen(fname, "r");
00184   
00185   if (unlikely(f_ == NULL)) {
00186     return SUCCESS_FAIL;
00187   } else {
00188     Gobble();
00189     return SUCCESS_PASS;
00190   }
00191 }
00192 
00193 char TextTokenizer::NextChar_() {
00194   int c = GetChar_();
00195   
00196   if (c != EOF && unlikely(strchr(comment_start_, c) != NULL)) {
00197     do {
00198       c = GetChar_();
00199     } while (likely(c != EOF) && likely(c != '\r') && likely(c != '\n'));
00200   }
00201   
00202   if (unlikely(c == EOF)) {
00203     c = 0;
00204   }
00205   
00206   return c;
00207 }
00208 
00209 char TextTokenizer::NextChar_(ArrayList<char> *token) {
00210   char c = NextChar_();
00211 
00212   token->PushBackCopy(c);
00213   
00214   return c;
00215 }
00216 
00217 char TextTokenizer::Skip_(ArrayList<char> *token) {
00218   int c;
00219   
00220   while (1) {
00221     c = NextChar_();
00222     if (!isspace(c)) {
00223       break;
00224     }
00225     
00226     if (c == '\r' || c == '\n') {
00227       if (c == '\r') {
00228         c = NextChar_();
00229         if (c != '\n') {
00230           Unget_(c);
00231         }
00232       }
00233       line_++;
00234       if ((features_ & WANT_NEWLINE)) {
00235         c = '\n';
00236         break;
00237       }
00238     }
00239   }
00240   
00241   token->PushBackCopy(char(c));
00242   
00243   return char(c);
00244 }
00245 
00246 void TextTokenizer::UndoNextChar_(ArrayList<char> *token) {
00247   char c;
00248   token->PopBackInit(&c);
00249   if (c != 0) { /* don't put EOF back on the stream */
00250     Unget_(c);
00251   }
00252 }
00253 
00254 void Sanitize(const String& src, String* dest) {
00255   dest->Init();
00256   
00257   for (index_t i = 0; i < src.length(); i++) {
00258     char c = src[i];
00259     
00260     if (isgraph(c) || c == ' ' || c == '\t') {
00261       *dest += c;
00262     } else if (isspace(c)) {
00263       *dest += "<whitespace>";
00264     } else {
00265       *dest += "<nonprint>";
00266     }
00267   }
00268 }
00269 
00270 void TextTokenizer::Error(const char *format, ...) {
00271   va_list vl;
00272   String cur_sanitized;
00273   String next_sanitized;
00274   
00275   Sanitize(cur_, &cur_sanitized);
00276   Sanitize(next_, &next_sanitized);
00277   
00278   // TODO: Use a warning propagation system
00279   fprintf(stderr, ".| %d: %s <-HERE-> %s\nX|  `-> ", line_,
00280       cur_sanitized.c_str(), next_sanitized.c_str());
00281   
00282   va_start(vl, format);
00283   vfprintf(stderr, format, vl);
00284   va_end(vl);
00285   
00286   fprintf(stderr, "\n");
00287 }
00288 
00289 void TextTokenizer::Error_(const char *msg, const ArrayList<char>& token) {
00290   next_type_ = INVALID;
00291   
00292   printf("size is %"LI"d, token[0] = %d\n", token.size(), token[0]);
00293   next_.Copy(token.begin(), token.size());
00294   Error("%s", msg);
00295   next_.Destruct();
00296 }
00297 
00298 void TextTokenizer::ScanNumber_(char c, ArrayList<char> *token) {
00299   bool dot = false;
00300   bool floating = false;
00301   
00302   while (1) {
00303     if (unlikely(c == '.')) {
00304       /* handle a period */
00305       if (unlikely(dot)) {
00306         Error_("Multiple decimal points in a float", *token);
00307         return;
00308       }
00309       dot = true;
00310       floating = true;
00311     } else if (likely(isdigit(c))) {
00312       /* keep on processing digits */
00313     } else if (unlikely(c == 'e' || c == 'E')) {
00314       /* exponent - read exponent and finish */
00315       c = NextChar_(token);
00316       if (c == '+' || c == '-') {
00317         c = NextChar_(token);
00318       }
00319       while (isdigit(c)) {
00320         c = NextChar_(token);
00321       }
00322       floating = true;
00323       break;
00324     } else {
00325       /* non numeric */
00326       break;
00327     }
00328     
00329     c = NextChar_(token);
00330   }
00331 
00332   if (c == 'f' || c == 'F') {
00333     // It's labelled a float.  Gobble and go.
00334     floating = true;
00335   } else if (isspace(c) || ispunct(c)) {
00336     UndoNextChar_(token);
00337   } else {
00338     Error_("Invalid character while parsing number", *token);
00339   }
00340   
00341   if (floating) {
00342     next_type_ = DOUBLE;
00343   } else {
00344     next_type_ = INTEGER;
00345   }
00346 }
00347 
00348 void TextTokenizer::ScanString_(char ending, ArrayList<char> *token) {
00349   int c;
00350   
00351   while (1) {
00352     c = NextChar_(token);
00353     
00354     if (c == 0) {
00355       Error_("Unterminated String", *token);
00356       UndoNextChar_(token);
00357       return;
00358     }
00359     
00360     if (c == ending) {
00361       next_type_ = STRING;
00362       return;
00363     }
00364   }
00365 }
00366 
00367 void TextTokenizer::Scan_(ArrayList<char> *token) {
00368   char c = Skip_(token);
00369   
00370   if (c == 0) {
00371     token->Clear();
00372     next_type_ = END;
00373     return;
00374   } else if (c == '.' || isdigit(c)) {
00375     ScanNumber_(c, token);
00376   } else if (isident_begin_(c)) {
00377     while (isident_rest_(NextChar_(token))) {}
00378     UndoNextChar_(token);
00379     next_type_ = IDENTIFIER;
00380   } else if (ispunct(c) || isspace(c)) {
00381     if (c == '"' || c == '\'') {
00382       ScanString_(c, token);
00383     } else if (c == '+' || c == '-') {
00384       c = NextChar_(token);
00385       if (c == '.' || isdigit(c)) {
00386         ScanNumber_(c, token);
00387       } else {
00388         UndoNextChar_(token);
00389       }
00390     } else {
00391       next_type_ = PUNCT;
00392     }
00393   } else {
00394     Error_("Unknown Character", *token);
00395   }
00396 }
00397 
00398 void TextTokenizer::Gobble() {
00399   cur_.Destruct();
00400   cur_.StealDestruct(&next_);
00401   cur_type_ = next_type_;
00402   
00403   ArrayList<char> token;
00404   token.Init();
00405   Scan_(&token);
00406   token.PushBackCopy('\0');
00407   next_.Steal(&token);
00408   DEBUG_ASSERT(next_.length() == index_t(strlen(next_.c_str())));
00409 }
00410 
00411 success_t TextWriter::Printf(const char *format, ...) {
00412   int rv;
00413   
00414   va_list vl;
00415   
00416   va_start(vl, format);
00417   rv = vfprintf(f_, format, vl);
00418   va_end(vl);
00419   
00420   return SUCCESS_FROM_C(rv);
00421 }
00422