regex2.h

00001 /*<std-header orig-src='regex' incl-file-exclusion='REGEX2_H'>
00002 
00003  $Id: regex2.h,v 1.11 2010/07/01 00:08:17 nhall Exp $
00004 
00005 **\file src/common/regex2.h
00006 **\cond skip
00007 
00008 */
00009 
00010 #ifndef REGEX2_H
00011 #define REGEX2_H
00012 
00013 #include "w_defines.h"
00014 
00015 /*  -- do not edit anything above this line --   </std-header>*/
00016 
00017 /*
00018 Copyright 1992, 1993, 1994, 1997 Henry Spencer.  All rights reserved.
00019 This software is not subject to any license of the American Telephone
00020 and Telegraph Company or of the Regents of the University of California.
00021 
00022 Permission is granted to anyone to use this software for any purpose on
00023 any computer system, and to alter it and redistribute it, subject
00024 to the following restrictions:
00025 
00026 1. The author is not responsible for the consequences of use of this
00027    software, no matter how awful, even if they arise from flaws in it.
00028 
00029 2. The origin of this software must not be misrepresented, either by
00030    explicit claim or by omission.  Since few users ever read sources,
00031    credits must appear in the documentation.
00032 
00033 3. Altered versions must be plainly marked as such, and must not be
00034    misrepresented as being the original software.  Since few users
00035    ever read sources, credits must appear in the documentation.
00036 
00037 4. This notice may not be removed or altered.
00038 
00039 */
00040 
00041 /* 
00042   NOTICE of alterations in Spencer's regex implementation :
00043   The following alterations were made to Henry Spencer's regular 
00044   expressions implementation, in order to make it build in the
00045   Shore configuration scheme:
00046 
00047   1) the .ih files are no longer generated at build time. They are
00048     treated as "sources".  Likewise for regex.h.
00049   2) names were changed to w_regexec, w_regerror, etc. by
00050     #define statements in regex.h
00051   3) all the c sources were protoized and gcc warnings were 
00052     fixed.
00053   4) This entire notice was put into the .c, .ih, and .h files
00054 */
00055 /*
00056  * First, the stuff that ends up in the outside-world include file
00057  = typedef off_t regoff_t;
00058  = typedef struct {
00059  =     int re_magic;
00060  =     size_t re_nsub;        // number of parenthesized subexpressions
00061  =     const char *re_endp;    // end pointer for REG_PEND
00062  =     struct re_guts *re_g;    // none of your business :-)
00063  = } regex_t;
00064  = typedef struct {
00065  =     regoff_t rm_so;        // start of match
00066  =     regoff_t rm_eo;        // end of match
00067  = } regmatch_t;
00068  */
00069 /*
00070  * internals of regex_t
00071  */
00072 #define    MAGIC1    ((('r'^0200)<<8) | 'e')    /* magic number; presumably the valid value of regex_t's re_magic -- confirm in regcomp.c */
00073 
00074 /*
00075  * The internal representation is a *strip*, a sequence of
00076  * operators ending with an endmarker.  (Some terminology etc. is a
00077  * historical relic of earlier versions which used multiple strips.)
00078  * Certain oddities in the representation are there to permit running
00079  * the machinery backwards; in particular, any deviation from sequential
00080  * flow must be marked at both its source and its destination.  Some
00081  * fine points:
00082  *
00083  * - OPLUS_ and O_PLUS are *inside* the loop they create.
00084  * - OQUEST_ and O_QUEST are *outside* the bypass they create.
00085  * - OCH_ and O_CH are *outside* the multi-way branch they create, while
00086  *   OOR1 and OOR2 are respectively the end and the beginning of one of
00087  *   the branches.  Note that there is an implicit OOR2 following OCH_
00088  *   and an implicit OOR1 preceding O_CH.
00089  *
00090  * In state representations, an operator's bit is on to signify a state
00091  * immediately *preceding* "execution" of that operator.
00092  */
00093 typedef unsigned long sop;    /* strip operator: operator code in bits 27-31, operand in bits 0-26 */
00094 typedef long sopno;    /* signed type used for strip positions and state counts (see struct re_guts) */
00095 #define    OPRMASK    0xf8000000    /* selects the operator field (bits 27-31) */
00096 #define    OPDMASK    0x07ffffff    /* selects the operand field (bits 0-26) */
00097 #define    OPSHIFT    ((unsigned)27)    /* bit position where the operator field begins */
00098 #define    OP(n)    ((n)&OPRMASK)    /* operator of n, left in place so it compares directly with OEND etc. */
00099 #define    OPND(n)    ((n)&OPDMASK)    /* operand of n */
00100 #define    SOP(op, opnd)    ((op)|(opnd))    /* build a sop from an already-shifted operator and an operand */
00101 /* operators               meaning    operand            */
00102 /*                        (back, fwd are offsets)    */
00103 #define    OEND    (1ul<<OPSHIFT)    /* endmarker    -            */
00104 #define    OCHAR    (2ul<<OPSHIFT)    /* character    unsigned char        */
00105 #define    OBOL    (3ul<<OPSHIFT)    /* left anchor    -            */
00106 #define    OEOL    (4ul<<OPSHIFT)    /* right anchor    -            */
00107 #define    OANY    (5ul<<OPSHIFT)    /* .        -            */
00108 #define    OANYOF    (6ul<<OPSHIFT)    /* [...]    set number        */
00109 #define    OBACK_    (7ul<<OPSHIFT)    /* begin \d    paren number        */
00110 #define    O_BACK    (8ul<<OPSHIFT)    /* end \d    paren number        */
00111 #define    OPLUS_    (9ul<<OPSHIFT)    /* + prefix    fwd to suffix        */
00112 #define    O_PLUS    (10ul<<OPSHIFT)    /* + suffix    back to prefix        */
00113 #define    OQUEST_    (11ul<<OPSHIFT)    /* ? prefix    fwd to suffix        */
00114 #define    O_QUEST    (12ul<<OPSHIFT)    /* ? suffix    back to prefix        */
00115 #define    OLPAREN    (13ul<<OPSHIFT)    /* (        fwd to )        */
00116 #define    ORPAREN    (14ul<<OPSHIFT)    /* )        back to (        */
00117 #define    OCH_    (15ul<<OPSHIFT)    /* begin choice    fwd to OOR2        */
00118 #define    OOR1    (16ul<<OPSHIFT)    /* | pt. 1    back to OOR1 or OCH_    */
00119 #define    OOR2    (17ul<<OPSHIFT)    /* | pt. 2    fwd to OOR2 or O_CH    */
00120 #define    O_CH    (18ul<<OPSHIFT)    /* end choice    back to OOR1        */
00121 #define    OBOW    (19ul<<OPSHIFT)    /* begin word    -            */
00122 #define    OEOW    (20ul<<OPSHIFT)    /* end word    -            */
00123 
00124 /*
00125  * Structure for [] character-set representation.  Character sets are
00126  * done as bit vectors, grouped 8 to a byte vector for compactness.
00127  * The individual set therefore has both a pointer to the byte vector
00128  * and a mask to pick out the relevant bit of each byte.  A hash code
00129  * simplifies testing whether two sets could be identical.
00130  *
00131  * This will get trickier for multicharacter collating elements.  As
00132  * preliminary hooks for dealing with such things, we also carry along
00133  * a string of multi-character elements, and decide the size of the
00134  * vectors at run time.
00135  */
00136 typedef struct {    /* one [] character set: a single bit position within a byte vector */
00137     uch *ptr;        /* -> uch [csetsize] */
00138     uch mask;        /* bit within array */
00139     uch hash;        /* hash code; CHadd/CHsub add/subtract each character's value */
00140     size_t smultis;    /* size of the multis array below */
00141     char *multis;        /* -> char[smultis]  ab\0cd\0ef\0\0 */
00142 } cset;
00143 /* note that CHadd and CHsub are unsafe (cs and c are each expanded more than once), and CHIN doesn't yield 0/1 (it yields 0 or the set's mask bit) */
00144 #define    CHadd(cs, c)    ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))    /* put c in the set */
00145 #define    CHsub(cs, c)    ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))    /* take c out of the set */
00146 #define    CHIN(cs, c)    ((cs)->ptr[(uch)(c)] & (cs)->mask)    /* nonzero iff c is in the set */
00147 #define    MCadd(p, cs, cp)    mcadd(p, cs, cp)    /* regcomp() internal fns */
00148 #define    MCsub(p, cs, cp)    mcsub(p, cs, cp)
00149 #define    MCin(p, cs, cp)    mcin(p, cs, cp)
00150 
00151 /* stuff for character categories */
00152 typedef unsigned char cat_t;    /* element type of re_guts.categories and re_guts.catspace */
00153 
00154 /*
00155  * main compiled-expression structure
00156  */
00157 struct re_guts {
00158     int magic;        /* presumably MAGIC2 for a valid structure -- confirm in regcomp.c */
00159 #        define    MAGIC2    ((('R'^0200)<<8)|'E')
00160     sop *strip;        /* malloced area for strip */
00161     int csetsize;        /* number of bits in a cset vector */
00162     int ncsets;        /* number of csets in use */
00163     cset *sets;        /* -> cset [ncsets] */
00164     uch *setbits;        /* -> uch[csetsize][ncsets/CHAR_BIT] */
00165     int cflags;        /* copy of regcomp() cflags argument */
00166     sopno nstates;        /* = number of sops */
00167     sopno firststate;    /* the initial OEND (normally 0) */
00168     sopno laststate;    /* the final OEND */
00169     int iflags;        /* internal flags: bitwise OR of USEBOL, USEEOL, BAD */
00170 #        define    USEBOL    01    /* used ^ */
00171 #        define    USEEOL    02    /* used $ */
00172 #        define    BAD    04    /* something wrong */
00173     int nbol;        /* number of ^ used */
00174     int neol;        /* number of $ used */
00175     int ncategories;    /* how many character categories */
00176     cat_t *categories;    /* ->catspace[-CHAR_MIN], i.e. categories[CHAR_MIN] is catspace[0] */
00177     char *must;        /* match must contain this string */
00178     int mlen;        /* length of must */
00179     size_t nsub;        /* copy of re_nsub */
00180     int backrefs;        /* does it use back references? */
00181     sopno nplus;        /* how deep does it nest +s? */
00182     /* catspace must be last */
00183     cat_t catspace[1];    /* actually [NC]; declared with one element, so the allocation must supply the rest (NC is defined elsewhere) */
00184 };
00185 
00186 /* misc utilities */
00187 #define    REGEX_OUT    (CHAR_MAX+1)    /* a non-character value: one past the largest char */
00188 #define    ISWORD(c)    (isalnum(c) || (c) == '_')    /* word character test. NOTE(review): c is expanded twice, and isalnum() is undefined for negative values other than EOF -- confirm callers never pass a negative char */
00189 
00190 /**\endcond skip */
00191 
00192 /*<std-footer incl-file-exclusion='REGEX2_H'>  -- do not edit anything below this line -- */
00193 
00194 #endif          /*</std-footer>*/

Generated on Wed Jul 7 17:22:32 2010 for Shore Storage Manager by  doxygen 1.4.7