00001 /*<std-header orig-src='regex' incl-file-exclusion='REGEX2_H'> 00002 00003 $Id: regex2.h,v 1.11 2010/07/01 00:08:17 nhall Exp $ 00004 00005 **\file src/common/regex2.h 00006 **\cond skip 00007 00008 */ 00009 00010 #ifndef REGEX2_H 00011 #define REGEX2_H 00012 00013 #include "w_defines.h" 00014 00015 /* -- do not edit anything above this line -- </std-header>*/ 00016 00017 /* 00018 Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved. 00019 This software is not subject to any license of the American Telephone 00020 and Telegraph Company or of the Regents of the University of California. 00021 00022 Permission is granted to anyone to use this software for any purpose on 00023 any computer system, and to alter it and redistribute it, subject 00024 to the following restrictions: 00025 00026 1. The author is not responsible for the consequences of use of this 00027 software, no matter how awful, even if they arise from flaws in it. 00028 00029 2. The origin of this software must not be misrepresented, either by 00030 explicit claim or by omission. Since few users ever read sources, 00031 credits must appear in the documentation. 00032 00033 3. Altered versions must be plainly marked as such, and must not be 00034 misrepresented as being the original software. Since few users 00035 ever read sources, credits must appear in the documentation. 00036 00037 4. This notice may not be removed or altered. 00038 00039 */ 00040 00041 /* 00042 NOTICE of alterations in Spencer's regex implementation : 00043 The following alterations were made to Henry Spencer's regular 00044 expressions implementation, in order to make it build in the 00045 Shore configuration scheme: 00046 00047 1) the generated .ih files are no longer generated. They are 00048 considered "sources". Likewise for regex.h. 00049 2) names were changed to w_regexex, w_regerror, etc by i 00050 #define statements in regex.h 00051 3) all the c sources were protoized and gcc warnings were 00052 fixed. 00053 4) This entire notice was put into the .c, .ih, and .h files 00054 */ 00055 /* 00056 * First, the stuff that ends up in the outside-world include file 00057 = typedef off_t regoff_t; 00058 = typedef struct { 00059 = int re_magic; 00060 = size_t re_nsub; // number of parenthesized subexpressions 00061 = const char *re_endp; // end pointer for REG_PEND 00062 = struct re_guts *re_g; // none of your business :-) 00063 = } regex_t; 00064 = typedef struct { 00065 = regoff_t rm_so; // start of match 00066 = regoff_t rm_eo; // end of match 00067 = } regmatch_t; 00068 */ 00069 /* 00070 * internals of regex_t 00071 */ 00072 #define MAGIC1 ((('r'^0200)<<8) | 'e') 00073 00074 /* 00075 * The internal representation is a *strip*, a sequence of 00076 * operators ending with an endmarker. (Some terminology etc. is a 00077 * historical relic of earlier versions which used multiple strips.) 00078 * Certain oddities in the representation are there to permit running 00079 * the machinery backwards; in particular, any deviation from sequential 00080 * flow must be marked at both its source and its destination. Some 00081 * fine points: 00082 * 00083 * - OPLUS_ and O_PLUS are *inside* the loop they create. 00084 * - OQUEST_ and O_QUEST are *outside* the bypass they create. 00085 * - OCH_ and O_CH are *outside* the multi-way branch they create, while 00086 * OOR1 and OOR2 are respectively the end and the beginning of one of 00087 * the branches. Note that there is an implicit OOR2 following OCH_ 00088 * and an implicit OOR1 preceding O_CH. 00089 * 00090 * In state representations, an operator's bit is on to signify a state 00091 * immediately *preceding* "execution" of that operator. 00092 */ 00093 typedef unsigned long sop; /* strip operator */ 00094 typedef long sopno; 00095 #define OPRMASK 0xf8000000 00096 #define OPDMASK 0x07ffffff 00097 #define OPSHIFT ((unsigned)27) 00098 #define OP(n) ((n)&OPRMASK) 00099 #define OPND(n) ((n)&OPDMASK) 00100 #define SOP(op, opnd) ((op)|(opnd)) 00101 /* operators meaning operand */ 00102 /* (back, fwd are offsets) */ 00103 #define OEND (1ul<<OPSHIFT) /* endmarker - */ 00104 #define OCHAR (2ul<<OPSHIFT) /* character unsigned char */ 00105 #define OBOL (3ul<<OPSHIFT) /* left anchor - */ 00106 #define OEOL (4ul<<OPSHIFT) /* right anchor - */ 00107 #define OANY (5ul<<OPSHIFT) /* . - */ 00108 #define OANYOF (6ul<<OPSHIFT) /* [...] set number */ 00109 #define OBACK_ (7ul<<OPSHIFT) /* begin \d paren number */ 00110 #define O_BACK (8ul<<OPSHIFT) /* end \d paren number */ 00111 #define OPLUS_ (9ul<<OPSHIFT) /* + prefix fwd to suffix */ 00112 #define O_PLUS (10ul<<OPSHIFT) /* + suffix back to prefix */ 00113 #define OQUEST_ (11ul<<OPSHIFT) /* ? prefix fwd to suffix */ 00114 #define O_QUEST (12ul<<OPSHIFT) /* ? suffix back to prefix */ 00115 #define OLPAREN (13ul<<OPSHIFT) /* ( fwd to ) */ 00116 #define ORPAREN (14ul<<OPSHIFT) /* ) back to ( */ 00117 #define OCH_ (15ul<<OPSHIFT) /* begin choice fwd to OOR2 */ 00118 #define OOR1 (16ul<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */ 00119 #define OOR2 (17ul<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */ 00120 #define O_CH (18ul<<OPSHIFT) /* end choice back to OOR1 */ 00121 #define OBOW (19ul<<OPSHIFT) /* begin word - */ 00122 #define OEOW (20ul<<OPSHIFT) /* end word - */ 00123 00124 /* 00125 * Structure for [] character-set representation. Character sets are 00126 * done as bit vectors, grouped 8 to a byte vector for compactness. 00127 * The individual set therefore has both a pointer to the byte vector 00128 * and a mask to pick out the relevant bit of each byte. A hash code 00129 * simplifies testing whether two sets could be identical. 00130 * 00131 * This will get trickier for multicharacter collating elements. As 00132 * preliminary hooks for dealing with such things, we also carry along 00133 * a string of multi-character elements, and decide the size of the 00134 * vectors at run time. 00135 */ 00136 typedef struct { 00137 uch *ptr; /* -> uch [csetsize] */ 00138 uch mask; /* bit within array */ 00139 uch hash; /* hash code */ 00140 size_t smultis; 00141 char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ 00142 } cset; 00143 /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ 00144 #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) 00145 #define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) 00146 #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) 00147 #define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ 00148 #define MCsub(p, cs, cp) mcsub(p, cs, cp) 00149 #define MCin(p, cs, cp) mcin(p, cs, cp) 00150 00151 /* stuff for character categories */ 00152 typedef unsigned char cat_t; 00153 00154 /* 00155 * main compiled-expression structure 00156 */ 00157 struct re_guts { 00158 int magic; 00159 # define MAGIC2 ((('R'^0200)<<8)|'E') 00160 sop *strip; /* malloced area for strip */ 00161 int csetsize; /* number of bits in a cset vector */ 00162 int ncsets; /* number of csets in use */ 00163 cset *sets; /* -> cset [ncsets] */ 00164 uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */ 00165 int cflags; /* copy of regcomp() cflags argument */ 00166 sopno nstates; /* = number of sops */ 00167 sopno firststate; /* the initial OEND (normally 0) */ 00168 sopno laststate; /* the final OEND */ 00169 int iflags; /* internal flags */ 00170 # define USEBOL 01 /* used ^ */ 00171 # define USEEOL 02 /* used $ */ 00172 # define BAD 04 /* something wrong */ 00173 int nbol; /* number of ^ used */ 00174 int neol; /* number of $ used */ 00175 int ncategories; /* how many character categories */ 00176 cat_t *categories; /* ->catspace[-CHAR_MIN] */ 00177 char *must; /* match must contain this string */ 00178 int mlen; /* length of must */ 00179 size_t nsub; /* copy of re_nsub */ 00180 int backrefs; /* does it use back references? */ 00181 sopno nplus; /* how deep does it nest +s? */ 00182 /* catspace must be last */ 00183 cat_t catspace[1]; /* actually [NC] */ 00184 }; 00185 00186 /* misc utilities */ 00187 #define REGEX_OUT (CHAR_MAX+1) /* a non-character value */ 00188 #define ISWORD(c) (isalnum(c) || (c) == '_') 00189 00190 /**\endcond skip */ 00191 00192 /*<std-footer incl-file-exclusion='REGEX2_H'> -- do not edit anything below this line -- */ 00193 00194 #endif /*</std-footer>*/