rx/rxsuper.h

   1 /* classes: h_files */
   2
   3 #ifndef RXSUPERH
   4 #define RXSUPERH
   5
   6 /*      Copyright (C) 1995, 1996 Tom Lord
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU Library General Public License as published by
  10  * the Free Software Foundation; either version 2, or (at your option)
  11  * any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU Library General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Library General Public License
  19  * along with this software; see the file COPYING.  If not, write to
  20  * the Free Software Foundation, 59 Temple Place - Suite 330,
  21  * Boston, MA 02111-1307, USA.
  22  */
  23
  24 /*  lord        Sun May  7 12:40:17 1995        */
  25
  26 \f
  27
  28 #include "rxnfa.h"
  29
  30 \f
  31
  32 /* This begins the description of the superstate NFA.
  33  *
  34  * The superstate NFA corresponds to the NFA in these ways:
  35  *
  36  * Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
  37  *
  38  * Superstate edges correspond to NFA paths.
  39  *
  40  * The superstate has no epsilon transitions;
  41  * every edge has a character label, and a (possibly empty) side
  42  * effect label.   The side effect label corresponds to a list of
  43  * side effects that occur in the NFA.  These parts are referred
  44  * to as:   superedge_character(EDGE) and superedge_sides(EDGE).
  45  *
  46  * For a superstate edge EDGE starting in some superstate SUPER,
  47  * the following is true (in pseudo-notation :-):
  48  *
  49  *       exists DEST in nfa_states s.t.
  50  *         exists nfaEDGE in nfa_edges s.t.
  51  *                 origin (nfaEDGE) == DEST
  52  *              && origin (nfaEDGE) is a member of nfa_states(SUPER)
  53  *              && exists PF in possible_futures(dest(nfaEDGE)) s.t.
  54  *                      sides_of_possible_future (PF) == superedge_sides (EDGE)
  55  *
  56  * also:
  57  *
  58  *      let SUPER2 := superedge_destination(EDGE)
  59  *          nfa_states(SUPER2)
  60  *           == union of all nfa state sets S s.t.
  61  *                          exists PF in possible_futures(dest(nfaEDGE)) s.t.
  62  *                             sides_of_possible_future (PF) == superedge_sides (EDGE)
  63  *                          && S == dests_of_possible_future (PF) }
  64  *
  65  * Or in english, every superstate is a set of nfa states.  A given
  66  * character and a superstate implies many transitions in the NFA --
  67  * those that begin with an edge labeled with that character from a
  68  * state in the set corresponding to the superstate.
  69  *
  70  * The destinations of those transitions each have a set of possible
  71  * futures.  A possible future is a list of side effects and a set of
  72  * destination NFA states.  Two sets of possible futures can be
  73  * `merged' by combining all pairs of possible futures that have the
  74  * same side effects.  A pair is combined by creating a new future
  75  * with the same side effect but the union of the two destination sets.
  76  * In this way, all the possible futures suggested by a superstate
  77  * and a character can be merged into a set of possible futures where
  78  * no two elements of the set have the same set of side effects.
  79  *
  80  * The destination of a possible future, being a set of NFA states,
  81  * corresponds to a supernfa state.  So, the merged set of possible
  82  * futures we just created can serve as a set of edges in the
  83  * supernfa.
  84  *
  85  * The representation of the superstate nfa and the nfa is critical.
  86  * The nfa has to be compact, but has to facilitate the rapid
  87  * computation of missing superstates.  The superstate nfa has to
  88  * be fast to interpret, lazilly constructed, and bounded in space.
  89  *
  90  * To facilitate interpretation, the superstate data structures are
  91  * peppered with `instruction frames'.  There is an instruction set
  92  * defined below which matchers using the supernfa must be able to
  93  * interpret.
  94  *
  95  * We'd like to make it possible but not mandatory to use code
  96  * addresses to represent instructions (c.f. gcc's computed goto).
  97  * Therefore, we define an enumerated type of opcodes, and when
  98  * writing one of these instructions into a data structure, use
  99  * the opcode as an index into a table of instruction values.
 100  *
 101  * Below are the opcodes that occur in the superstate nfa.
 102  *
 103  * The descriptions of the opcodes refer to data structures that are
 104  * described further below.
 105  */
 106
 107 enum rx_opcode
 108 {
 109   /*
 110    * BACKTRACK_POINT is invoked when a character transition in
 111    * a superstate leads to more than one edge.  In that case,
 112    * the edges have to be explored independently using a backtracking
 113    * strategy.
 114    *
 115    * A BACKTRACK_POINT instruction is stored in a superstate's
 116    * transition table for some character when it is known that that
 117    * character crosses more than one edge.  On encountering this
 118    * instruction, the matcher saves enough state to backtrack to this
 119    * point later in the match.
 120    */
 121   rx_backtrack_point = 0,       /* data is (struct transition_class *) */
 122
 123   /*
 124    * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
 125    * There is one occurence of this instruction per rx_distinct_future.
 126    * This instruction is skipped if a rx_distinct_future has no side effects.
 127    */
 128   rx_do_side_effects = rx_backtrack_point + 1,
 129
 130   /* data is (struct rx_distinct_future *) */
 131
 132   /*
 133    * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
 134    * destination superstate has been reclaimed (or was never built).
 135    * It recomputes the destination superstate.
 136    * RX_CACHE_MISS is also stored in a superstate transition table before
 137    * any of its edges have been built.
 138    */
 139   rx_cache_miss = rx_do_side_effects + 1,
 140   /* data is (struct rx_distinct_future *) */
 141
 142   /*
 143    * RX_NEXT_CHAR is called to consume the next character and take the
 144    * corresponding transition.  This is the only instruction that uses
 145    * the DATA field of the instruction frame instead of DATA_2.
 146    * The comments about rx_inx explain this further.
 147    */
 148   rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
 149
 150   /* RX_BACKTRACK indicates that a transition fails.  Don't
 151    * confuse this with rx_backtrack_point.
 152    */
 153   rx_backtrack = rx_next_char + 1, /* no data */
 154
 155   /*
 156    * RX_ERROR_INX is stored only in places that should never be executed.
 157    */
 158   rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
 159
 160   rx_num_instructions = rx_error_inx + 1
 161 };
 162
 163 /* The heart of the matcher is a `word-code-interpreter'
 164  * (like a byte-code interpreter, except that instructions
 165  * are a full word wide).
 166  *
 167  * Instructions are not stored in a vector of code, instead,
 168  * they are scattered throughout the data structures built
 169  * by the regexp compiler and the matcher.  One word-code instruction,
 170  * together with the arguments to that instruction, constitute
 171  * an instruction frame (struct rx_inx).
 172  *
 173  * This structure type is padded by hand to a power of 2 because
 174  * in one of the dominant cases, we dispatch by indexing a table
 175  * of instruction frames.  If that indexing can be accomplished
 176  * by just a shift of the index, we're happy.
 177  *
 178  * Instructions take at most one argument, but there are two
 179  * slots in an instruction frame that might hold that argument.
 180  * These are called data and data_2.  The data slot is only
 181  * used for one instruction (RX_NEXT_CHAR).  For all other
 182  * instructions, data should be set to 0.
 183  *
 184  * RX_NEXT_CHAR is the most important instruction by far.
 185  * By reserving the data field for its exclusive use,
 186  * instruction dispatch is sped up in that case.  There is
 187  * no need to fetch both the instruction and the data,
 188  * only the data is needed.  In other words, a `cycle' begins
 189  * by fetching the field data.  If that is non-0, then it must
 190  * be the destination state of a next_char transition, so
 191  * make that value the current state, advance the match position
 192  * by one character, and start a new cycle.  On the other hand,
 193  * if data is 0, fetch the instruction and do a more complicated
 194  * dispatch on that.
 195  */
 196
 197 struct rx_inx
 198 {
 199   void * data;
 200   void * data_2;
 201   void * inx;
 202   void * fnord;
 203 };
 204
 205 #ifndef RX_TAIL_ARRAY
 206 #define RX_TAIL_ARRAY  1
 207 #endif
 208
 209 /* A superstate corresponds to a set of nfa states.  Those sets are
 210  * represented by STRUCT RX_SUPERSET.  The constructors
 211  * guarantee that only one (shared) structure is created for a given set.
 212  */
 213 struct rx_superset
 214 {
 215   int refs;                     /* This is a reference counted structure. */
 216
 217   /* We keep these sets in a cache because (in an unpredictable way),
 218    * the same set is often created again and again.
 219    *
 220    * When an NFA is destroyed, some of the supersets for that NFA
 221    * may still exist.  This can lead to false cache hits -- an apparent cache
 222    * hit on a superset that properly belongs to an already free NFA.
 223    *
 224    * When a cache hit appears to occur, we will have in hand the
 225    * nfa for which it may have happened.  Every nfa is given
 226    * its own sequence number.  The cache is validated
 227    * by comparing the nfa sequence number to this field:
 228    */
 229   int id;
 230
 231   struct rx_nfa_state * car;    /* May or may not be a valid addr. */
 232   struct rx_superset * cdr;
 233
 234   /* If the corresponding superstate exists: */
 235   struct rx_superstate * superstate;
 236
 237   /* That is_final field of the constiuent nfa states which has the greatest magnitude. */
 238   int is_final;
 239
 240   /* The OR of the corresponding fields of the constiuent nfa states. */
 241   int has_cset_edges;
 242
 243
 244   /* There is another bookkeeping problem.  It is expensive to
 245    * compute the starting nfa state set for an nfa.  So, once computed,
 246    * it is cached in the `struct rx'.
 247    *
 248    * But, the state set can be flushed from the superstate cache.
 249    * When that happens, the cached value in the `struct rx' has
 250    * to be flushed.
 251    */
 252   struct rx * starts_for;
 253
 254   /* This is used to link into a hash bucket so these objects can
 255    * be `hash-consed'.
 256    */
 257   struct rx_hash_item hash_item;
 258 };
 259
 260 #define rx_protect_superset(RX,CON) (++(CON)->refs)
 261
 262 /* The terminology may be confusing (rename this structure?).
 263  * Every character occurs in at most one rx_super_edge per super-state.
 264  * But, that structure might have more than one option, indicating a point
 265  * of non-determinism.
 266  *
 267  * In other words, this structure holds a list of superstate edges
 268  * sharing a common starting state and character label.  The edges
 269  * are in the field OPTIONS.  All superstate edges sharing the same
 270  * starting state and character are in this list.
 271  */
 272 struct rx_super_edge
 273 {
 274   struct rx_super_edge *next;
 275   struct rx_inx rx_backtrack_frame;
 276   int cset_size;
 277   rx_Bitset cset;
 278   struct rx_distinct_future *options;
 279 };
 280
 281 /* A superstate is a set of nfa states (RX_SUPERSET) along
 282  * with a transition table.  Superstates are built on demand and reclaimed
 283  * without warning.  To protect a superstate from this ghastly fate,
 284  * use LOCK_SUPERSTATE.
 285  */
 286 struct rx_superstate
 287 {
 288   int rx_id;                    /* c.f. the id field of rx_superset */
 289   int locks;                    /* protection from reclamation */
 290
 291   /* Within a superstate cache, all the superstates are kept in a big
 292    * queue.  The tail of the queue is the state most likely to be
 293    * reclaimed.  The *recyclable fields hold the queue position of
 294    * this state.
 295    */
 296   struct rx_superstate * next_recyclable;
 297   struct rx_superstate * prev_recyclable;
 298
 299   /* The supernfa edges that exist in the cache and that have
 300    * this state as their destination are kept in this list:
 301    */
 302   struct rx_distinct_future * transition_refs;
 303
 304   /* The list of nfa states corresponding to this superstate: */
 305   struct rx_superset * contents;
 306
 307   /* The list of edges in the cache beginning from this state. */
 308   struct rx_super_edge * edges;
 309
 310   /* A tail of the recyclable queue is marked as semifree.  A semifree
 311    * state has no incoming next_char transitions -- any transition
 312    * into a semifree state causes a complex dispatch with the side
 313    * effect of rescuing the state from its semifree state into a
 314    * fully free state at the head of the queue.
 315    *
 316    * An alternative to this might be to make next_char more expensive,
 317    * and to move a state to the head of the recyclable queue whenever
 318    * it is entered.  That way, popular states would never be recycled.
 319    *
 320    * But unilaterally making next_char more expensive actually loses.
 321    * So, incoming transitions are only made expensive for states near
 322    * the tail of the recyclable queue.  The more cache contention
 323    * there is, the more frequently a state will have to prove itself
 324    * and be moved back to the front of the queue.  If there is less
 325    * contention, then popular states just aggregate in the front of
 326    * the queue and stay there.
 327    */
 328   int is_semifree;
 329
 330
 331   /* This keeps track of the size of the transition table for this
 332    * state.  There is a half-hearted attempt to support variable sized
 333    * superstates.
 334    */
 335   int trans_size;
 336
 337   /* Indexed by characters... */
 338   struct rx_inx transitions[RX_TAIL_ARRAY];
 339 };
 340
 341
 342 /* A list of distinct futures define the edges that leave from a
 343  * given superstate on a given character.  c.f. rx_super_edge.
 344  */
 345 struct rx_distinct_future
 346 {
 347   struct rx_distinct_future * next_same_super_edge[2];
 348   struct rx_distinct_future * next_same_dest;
 349   struct rx_distinct_future * prev_same_dest;
 350   struct rx_superstate * present;       /* source state */
 351   struct rx_superstate * future;        /* destination state */
 352   struct rx_super_edge * edge;
 353
 354
 355   /* The future_frame holds the instruction that should be executed
 356    * after all the side effects are done, when it is time to complete
 357    * the transition to the next state.
 358    *
 359    * Normally this is a next_char instruction, but it may be a
 360    * cache_miss instruction as well, depending on whether or not
 361    * the superstate is in the cache and semifree.
 362    *
 363    * If this is the only future for a given superstate/char, and
 364    * if there are no side effects to be performed, this frame is
 365    * not used (directly) at all.  Instead, its contents are copied
 366    * into the transition table of the starting state of this dist. future
 367    * (a sort of goto elimination).
 368    */
 369   struct rx_inx future_frame;
 370
 371   struct rx_inx side_effects_frame;
 372   struct rx_se_list * effects;
 373 };
 374
 375 #define rx_lock_superstate(R,S)  ((S)->locks++)
 376 #define rx_unlock_superstate(R,S) (--(S)->locks)
 377 \f
 378 struct rx_cache;
 379
 380 #ifdef __STDC__
 381 typedef void (*rx_morecore_fn)(struct rx_cache *);
 382 #else
 383 typedef void (*rx_morecore_fn)();
 384 #endif
 385
 386 struct rx_cache
 387 {
 388   struct rx_hash_rules superset_hash_rules;
 389
 390   struct rx_superstate * lru_superstate;
 391   struct rx_superstate * semifree_superstate;
 392
 393   struct rx_superset * empty_superset;
 394
 395   int superstates;
 396   int semifree_superstates;
 397   int hits;
 398   int misses;
 399
 400   int bytes_allowed;
 401   int bytes_used;
 402
 403   int local_cset_size;
 404   void ** instruction_table;
 405
 406   struct rx_hash superset_table;
 407 };
 408
 409 #ifndef RX_DEFAULT_DFA_CACHE_SIZE
 410 /* This is an upper bound on the number of bytes that may (normally)
 411  * be allocated for DFA states.  If this threshold would be exceeded,
 412  * Rx tries to flush some DFA states from the cache.
 413  *
 414  * This value is used whenever the rx_default_cache is used (for example,
 415  * with the Posix entry points).
 416  */
 417 #define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
 418 #endif
 419
 420 extern struct rx_cache * rx_default_cache;
 421
 422 \f
 423 #ifdef __STDC__
 424 extern char * rx_cache_malloc (struct rx_cache * cache, int size);
 425 extern void rx_cache_free (struct rx_cache * cache, int size, char * mem);
 426 extern void rx_release_superset (struct rx *rx,
 427                                  struct rx_superset *set);
 428 extern struct rx_superset * rx_superset_cons (struct rx * rx,
 429                                               struct rx_nfa_state *car, struct rx_superset *cdr);
 430 extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ;
 431 extern struct rx_superstate * rx_superstate (struct rx *rx,
 432                                              struct rx_superset *set);
 433 extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ;
 434
 435 #else /* STDC */
 436 extern char * rx_cache_malloc ();
 437 extern void rx_cache_free ();
 438 extern void rx_release_superset ();
 439 extern struct rx_superset * rx_superset_cons ();
 440 extern struct rx_superset * rx_superstate_eclosure_union ();
 441 extern struct rx_superstate * rx_superstate ();
 442 extern struct rx_inx * rx_handle_cache_miss ();
 443
 444 #endif /* STDC */
 445
 446 #endif  /* RXSUPERH */