/* ,file-id archive://[lord]/454/rx/super.h/1998-05-18
 */
/* classes: h_files */

#ifndef SUPERH
#define SUPERH

/*	Copyright (C) 1997 Tom Lord
 * 
 * This program is provided to you under the terms of the Liberty Software
 * License.  You are NOT permitted to redistribute, modify, or use it
 * except in very specific ways described by that license.
 *
 * This software comes with NO WARRANTY.
 * 
 * You should have received a copy of the Liberty Software License
 * along with this software; see the file =LICENSE.  If not, write to
 * the Tom Lord, 1810 Francisco St. #2, Berkeley CA, 94703, USA.  
 */


/* Forward declarations: the structure definitions below refer to one
 * another by pointer before every full definition has been seen.
 */
struct rx_superset;
struct rx_super_edge;
struct rx_superstate;


/* One way to describe regexps that are regular expressions is as 
 * deterministic finite automata.   Because of the mathematical
 * relation of the DFAs we construct in Rx to underlying NFAs,
 * we call these DFAs "superstate automata".  Each DFA state is
 * logically a composition of NFA states: a "superstate".
 *
 * "super.h" declares functions and data types for lazily constructing
 * DFAs from NFAs (as built in "nfa.h").  A cache is kept of DFA states.
 * States are constructed and reconstructed as needed.
 *
 * The interface to _NFAs_ is conceptually quite simple and simple
 * in practice as well. NFAs can be thought of as a graph of labeled 
 * nodes and labeled edges and the programming interface in "nfa.h" 
 * reflects that graph structure in a simple, obvious way.
 *
 * The interface to DFAs, on the other hand, is simple in concept, but
 * subtle and detailed in practice: the programming interface is 
 * built for performance, not simplicity.
 *
 * Every DFA state in our state graph is represented as a transition
 * table (with as many entries as there are characters in the character
 * set) together with some extra information, such as a state-label 
 * synthesized from the state labels of constituent NFA states.
 *
 * The format and handling of the transition table is again a little
 * subtle.  Each possible transition is represented by an "instruction frame"
 * containing a (virtual) machine instruction and operands.  
 *
 * Matchers that use the DFA interface start by constructing a starting 
 * state (initial transition table) and then following transitions for
 * successive characters, interpreting the instructions in each
 * instruction frame encountered.
 *
 * The code in "dfa.c" provides examples of using this interface.
 *
 */



/* One cell of a list of NFA states.  The "members" field of a
 * superstate points at such a list, and cells are hash-consed through
 * "hash_item".  The names "car" and "cdr" follow the usual list-cell
 * convention (presumably: this cell's NFA state and the rest of the
 * list -- confirm against the code that builds these lists).
 */
struct rx_superset
{
  /* "car" is not necessarily a valid pointer.  Compare the "id" field
   * to the current struct rx_nfa (presumably this is how a stale cell
   * is detected -- confirm against the implementation).
   */
  struct rx_nfa_state * car;
  struct rx_superset * cdr;
  unsigned long state_label;	/* A combination of the member state labels (usually by MAX). */
  int has_cset_edges;		/* Non-zero means: not a dead-end state. */

  /* Cache management data:
   */
  int refs;			/* A reference count (see rx_protect_superset). */
  int id;			/* This must match the id of a struct rx_nfa. */
  struct rx_nfa * starts_for;	/* Set if this is a start state. */
  struct rx_superstate * superstate; /* Set if it is in the cache. */
  struct hashtab_item hash_item; /* Used in hash-consing these lists. */
};

/* Take one more reference on the superset CON by bumping its "refs"
 * count.  The value of the expression is the count after the bump.
 * RX is accepted for symmetry with the other calls but is not used.
 */
#define rx_protect_superset(RX,CON) ((CON)->refs += 1)

/* An edge of the superstate (DFA) graph, running from the state
 * "present" to the state "future".  The link fields thread the edge
 * onto the edge lists kept by superstates: the "outgoing_edges" list
 * is linked by "next_same_present", and the "incoming_edges" queue is
 * linked by "next_same_dest" / "prev_same_dest".
 */
struct rx_super_edge
{
  bitset cset;				/* presumably the characters that select this edge -- confirm */
  struct rx_superstate * present;	/* source state */
  struct rx_superstate * future;	/* destination state */
  struct rx_super_edge * next_same_present;	/* link for a list of outgoing edges */
  struct rx_super_edge * next_same_dest;	/* forward link for a queue of incoming edges */
  struct rx_super_edge * prev_same_dest;	/* backward link for a queue of incoming edges */
};

/* Superstate lock counting.  The "locks" field of a superstate protects
 * it from reclamation.  R is accepted by both macros but is not used.
 *
 * rx_lock_superstate adds one lock; the value of the expression is the
 * count as it was BEFORE the increment.
 *
 * rx_unlock_superstate drops one lock; the value of the expression is
 * the count AFTER the decrement.
 */
#define rx_lock_superstate(R,S)  ((S)->locks++)
#define rx_unlock_superstate(R,S) ((S)->locks -= 1)

/* The counter from which superstate sequence numbers (the "seq" field
 * of struct rx_superstate) are taken.  Programs are free to modify it.
 */
extern int rx_superstate_counter;


/* The heart of the matcher is a `word-code-interpreter' 
 * (like a byte-code interpreter, except that instructions
 * are a full word wide).
 *
 * Instructions are not stored in a vector of code, instead,
 * they are scattered throughout the data structures built
 * by the regexp compiler and the matcher.  One word-code instruction,
 * together with the arguments to that instruction, constitute
 * an instruction frame (struct rx_inx).
 *
 * This structure type is padded by hand to a power of 2 because
 * in some critical code, we dispatch by indexing a table
 * of instruction frames.  If that indexing can be accomplished
 * by just a shift of the index, we're happy.
 *
 * Instructions take at most one argument, but there are two
 * slots in an instruction frame that might hold that argument.
 * These are called data and data_2.  The data slot is only
 * used for one instruction (RX_NEXT_CHAR).  For all other 
 * instructions, data should be set to 0.  This is used to 
 * speed up dispatching: the field "data" is fetched and tested
 * before the field "inx".  If it is non-0, there is no need to fetch
 * the field "inx" at all -- the instruction is certainly RX_NEXT_CHAR
 * and we already have the parameter to that instruction in a register.
 *
 */

struct rx_inx 
{
  void * data;
  void * data_2;
  void * inx;
  void * fnord;
};


/* Opcodes of the matcher's word-code instructions.  Values start at 1
 * and run consecutively; rx_num_instructions is one greater than the
 * last opcode value.
 */
enum rx_opcode
{
  /* rx_cache_miss
   *   Stored in rx_distinct_futures whose destination superstate has
   *   been reclaimed (or was never built); executing it recomputes the
   *   destination superstate.  It is also stored in a superstate
   *   transition table before any of that superstate's edges have been
   *   built.
   *   Operand: documented as (struct rx_distinct_future *); that type
   *   is not declared in this header.
   */
  rx_cache_miss = 1,

  /* rx_next_char
   *   Consumes the next character and takes the corresponding
   *   transition.  This is the only instruction whose operand lives in
   *   the "data" slot of the instruction frame rather than "data_2";
   *   the comments on struct rx_inx explain why.
   *   Operand: the superstate to move to (the original note names the
   *   type "struct superstate"; presumably struct rx_superstate --
   *   confirm).
   */
  rx_next_char,

  /* rx_backtrack
   *   Indicates that a transition fails.  Not to be confused with
   *   rx_backtrack_point.
   *   Operand: none.
   */
  rx_backtrack,

  rx_num_instructions
};


/* A DFA state ("superstate"): graph links, cache bookkeeping, and a
 * transition table holding one instruction frame per transition.  The
 * table is the trailing "transitions" array, whose real length is
 * given by "trans_size", so a superstate occupies a variable amount of
 * storage.
 */
struct rx_superstate
{
  struct rx_superset * members;		 /* A list of NFA states. */
  struct rx_super_edge * outgoing_edges; /* A list, linked by "next_same_present". */
  struct rx_super_edge * incoming_edges; /* A queue, linked by "next/prev_same_dest". */

  /* Superstate Cache Management
   *
   * Upon creation, superstates are given a reference count of one
   * and added to the back of a queue of live superstates.  When a
   * cache hit yields an already existing superstate, it is moved to
   * the back of that queue again.
   *
   */
  int locks;			/* Protection from reclamation (a reference count). */

  /* That queue is threaded through these variables:
   */
  struct rx_superstate * next_recyclable;
  struct rx_superstate * prev_recyclable;

  /* There is a (soft) upper bound on the amount of memory Rx
   * will use for caching superstates.  While that upper bound
   * is exceeded, Rx tries to free some superstates which have
   * a lock count of 0.
   *
   * At a rate related to the rate at which superstates are
   * being freed, Rx tries to "semi-free" additional superstates
   * with a lock count of 0.
   *
   * To "semi-free" a superstate is to fill in its transition table
   * with cache-miss instructions, but to leave the graph structure
   * of the superstate intact.  At the same time, the superstate is
   * removed from the queue of live superstates, and moved to a
   * queue of semi-free superstates (also threaded through
   * the fields next/prev_recyclable).
   *
   * When Rx really frees a superstate, completely destroying its state,
   * it always takes that superstate from the queue of semi-free
   * superstates.  Thus, semi-freeing a superstate is phase I of a
   * two-phase procedure to completely free the superstate.
   *
   * When a state becomes semi-free, there are two possibilities for
   * what will happen next.  One possibility is that Rx will reclaim the
   * state's storage.  The other is that the state will be referenced by
   * a DFA transition, which in turn will result in a cache miss.  The
   * cache miss handler will revert the semi-free state to a live state,
   * which is less expensive [we presume!] than rebuilding the state
   * from scratch.
   */
  int is_semifree;


  /* At run-time, a matcher follows transitions from one superstate to
   * the next.  At times, a destination state may be missing from the
   * superstate cache or from a particular instruction frame.  In those
   * cases, a destination _superset_ (set of NFA states) is computed,
   * and that is used as a key to search the cache of superstates.  If a
   * cache entry is found (and valid) then the superstate is missing
   * only from the instruction frame, which is filled in, and the
   * transition resumes normally.  If no cache entry is found, a new
   * superstate is constructed for that superset.
   *
   * The following field is used when validating cache entries in the
   * superstate cache keyed on supersets:
   */
  int rx_id;			/* Must match the field `id' of the superset. */
  
  /* Superstates of a given NFA are numbered sequentially.
   * The sequence number counter is the global variable
   * rx_superstate_counter, which programs are free to modify.
   *
   * Sequence numbers are useful for applications which manipulate
   * the DFA graph in ways other than simply following transition
   * tables.
   *
   */
  int seq;

  /* The transition table.  "transitions" is declared with one element
   * but its actual size is variable.
   */
  int trans_size;		/* The number of transitions. */
  struct rx_inx transitions[1];	/* (actual size variable) */
};



/* automatically generated __STDC__ prototypes */
/* Entry points for supersets, superstates and cache-miss handling.
 * Only the declarations are visible in this header; consult the
 * implementation for the exact contract of each routine.
 */
extern void
rx_refresh_this_superstate (struct rx_superstate *superstate);

extern void
rx_release_superset (struct rx_nfa *rx,
		     struct rx_superset *set);

extern struct rx_superset *
rx_superset_cons (struct rx_nfa *rx,
		  struct rx_nfa_state *car,
		  struct rx_superset *cdr);

extern struct rx_superset *
rx_superstate_eclosure_union (struct rx_nfa *rx,
			      struct rx_superset *set,
			      struct rx_nfa_state_set *ecl);

extern struct rx_superstate *
rx_superstate (struct rx_nfa *rx,
	       struct rx_superset *set);

extern struct rx_inx *
rx_handle_cache_miss (struct rx_nfa *rx,
		      struct rx_superstate *super,
		      unsigned char chr,
		      void *data);

extern struct rx_superstate *
rx_next_superstate (struct rx_nfa *rx,
		    struct rx_superstate *superstate,
		    int x);
#endif  /* SUPERH */
