# -*- mode: perl; perl-indent-level: 2 -*-
#
# Compile and evaluate regular expressions in Perl without
# using built-in regular expressions.
#
# Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com)
# This program is in the PUBLIC DOMAIN.
#

#
# Note to readers:
#  The code is in three parts.
#  1. A parser for regexes
#  2. A compiler that trnaslates regexes into machines
#  3. An executer that executes a machine on a specified input
# 
# The parser is really turgid and hard to understand, because it isn't
# the point of the package.  I recommend that you skip to part 2 or 3
# immediately, then work backwards.  Parts 2 and 3 are much clearer.

package Regex;

# Regexps are handled in three phases.  First, they're parsed from a string
# form into an internal parse tree form, thus:
#
#  ABC => [ CONCAT => [ A, B, C ] ]
#  A* => [ STAR => A ]
#  A+ => [ PLUS => A ]
#  A|B|C => [ ALTERN => [ A, B, C ] ]
#  literal character x => [ LITERAL => x ]
#
# `parse' does this.

sub parse {
  @t = split(//, $_[1]);
  parse_altern(@t);
}

sub parse_altern {
  my @alterns;
  my @terms;
  my $c;
  while (defined($c = shift @_)) {
    next if $c eq '';
    push @seen, $c;
    if ($c eq '(') {
      my $next_term = &parse_altern;
      push @terms, $next_term;
    } elsif ($c eq ')') {
      push @alterns, &joinup(CONCAT, @terms) if @terms;
      return &joinup(ALTERN, @alterns);
    } elsif ($c eq '|') {
      push @alterns, &joinup(CONCAT, @terms) if @terms;
      @terms = ();
    } elsif ($c eq '*' || $c eq '+') {
      if (@terms) {
	$terms[-1] = [ ($c eq '*' ? STAR : PLUS) => $terms[-1] ];
      } else {
	$PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n";
	return undef;
      }
    } elsif ($c eq '\\') {
      push @terms, [ LITERAL => (shift) ];
    } else {
      push @terms, [ LITERAL => $c ];
    }
  }				# While there are tokens...
  push @alterns, &joinup(CONCAT, @terms) if @terms;
  return joinup(ALTERN, @alterns) if @alterns;
  return undef;
}

sub joinup {
  my $tag = shift;
  if (@_ == 1) {
    $_[0];
  } else {
    [ $tag => [ @_ ] ];
  }
}

package NFA;

################################################################
#
# Compile parsed regexp into representation of NFA
#
################################################################

$S = 'aa00';
$STARTSYMBOL = 0;
$ENDSYMBOL = 1;

sub new {
  compile(@_);
}

sub compile {
  my $pack = shift;
  my $rx = shift;
  my ($operator, $operands) = @$rx;

  # A literal has no suboperands to compile.
  # So invoke the special atom-compiler and return that result instead.
  if ($operator eq LITERAL) {
    return $pack->literal($operands);
  }

  my $startsym = "S" . &gensym();
  my $endsym = "E" . &gensym();
  my $result = { Symbols => [ $startsym, $endsym ] };

  # Compile the sub-operands first.
  my @submachines;
  if ($operator eq STAR || $operator eq PLUS) {
    @submachines = ($pack->compile($operands));
  } else {
    foreach $operand (@$operands) {
      push @submachines, $pack->compile($operand);
    }
  }

  if ($operator eq CONCAT) {
    return $submachines[0] if @submachines == 1;
    &putin($result, @submachines);
    my $i;
    for ($i = 0; $i < @submachines - 1; $i++) {
      my $tail = $submachines[$i]  {Symbols}[$ENDSYMBOL];
      my $head = $submachines[$i+1]{Symbols}[$STARTSYMBOL];
      $result->{$tail} = { '' => $head };
    }
    $result->{$startsym} = { '' => $submachines[0] {Symbols}[$STARTSYMBOL] };
    $result->{$submachines[-1]{Symbols}[$ENDSYMBOL]} = { '' => $endsym };
  } elsif ($operator eq STAR) {
    my $sm = $submachines[0];
    &putin($result, $sm);
    my ($s, $e) = @{$sm->{Symbols}};
    $result->{$e} = { '' => [$s, $endsym] };
    $result->{$startsym} = { '' => [$s, $endsym] };
  } elsif ($operator eq PLUS) {
    my $sm = $submachines[0];
    &putin($result, $sm);
    my ($s, $e) = @{$sm->{Symbols}};
    $result->{$e} = { '' => [$s, $endsym] };
    $result->{$startsym} = { '' => $s };
  } elsif ($operator eq ALTERN) {
    return $submachines[0] if @submachines == 1;
    &putin($result, @submachines);
    my @startsyms = map { $_->{Symbols}[$STARTSYMBOL] } @submachines;
    my @endsyms = map { $_->{Symbols}[$ENDSYMBOL] } @submachines;
    $result->{$startsym} = { '' => \@startsyms };
    foreach $es (@endsyms) {
      $result->{$es} = { '' => $endsym };
    }
  } else {
    warn "Bizarre oprerator `$operator' encountered.\n";
  }
  bless $result => $pack;
}

sub start_state {
  $_[0]{Symbols}[$STARTSYMBOL];
}

sub is_end_state {
  my $self = shift;
  my $state = shift;
  $state eq $self->{Symbols}[$ENDSYMBOL];
}

sub transition_table {
  my $self = shift;
  my $state = shift;

  $self->{$state} || {};
}

sub literal {
  my $pack = shift;
  my $what = shift;
  my $startsym = "S" . &gensym();
  my $endsym = "E" . &gensym();
  
  bless
  { Symbols   => [ $startsym, $endsym ],
    $startsym => { $what => $endsym } },
      => $pack;
}

# Given a list of machines, M1 ... Mn, put M2... Mn into M1.
sub putin {
  my $master = shift;
  foreach $m (@_) {
    foreach $state (keys %$m) {
      next if $state eq 'Symbols';
      if (exists $master->{$state}) {
	print STDERR "Warning: State name conflict for `$state'.\n";
      }
      $master->{$state} = $m->{$state};
    }
  }
  $master;
}

sub gensym {
  $S++;
}

################################################################
# 
# Execute NFA on a given string
#
################################################################

package NFA_Exec;


sub match {
  my $pack = shift;
  my $nfa = shift;
  my $string = shift;
  my $machine = $pack->init($nfa, $string);
  $machine->run();
  $machine->final_state();
}

sub new {
  &init(@_);
}

# 
# Create a new execution of the specified NFS, and feed it
# the specified string as its input.
#
sub init {
  my $pack = shift;
  my $nfa = shift;
  my $string = shift;
  my $self = {};
 
  $self->{nfa} = $nfa;
  $self->{input} = $string;
  $self->{pos} = 0;
  $self->{states} = [ $self->{nfa}->start_state ];

  bless $self => $pack;

  $self->epsilon_transit();

  $self;
}

#
# Run an execution to the end of the input 
#
sub run {
  my $self = shift;
  until ($self->end_of_input() || $self->states() == 0) {
    $self->step;
  }
}

#
# Is this execution object at the end of its input? 
#
sub end_of_input {
  my $self = shift;
  $self->{pos} >= length($self->{input});
}

#
# Advance an execution by one step.
#
sub step {
  my $self = shift;
  my $next_symbol = substr($self->{input}, $self->{pos}, 1);
  if ($next_symbol eq '') {
    # error 
  } else {
    $self->transit($next_symbol);
    $self->epsilon_transit();
  }
  $self->{pos}++;
}

# 
# Perform e-transitions in an execution 
#
sub epsilon_transit {
  my $self = shift;
  my @newstates = $self->states;
  my @result = @newstates;
  my %seen = map {($_ => 1)} @newstates;

  for (;;) {
    my $s;
    my @nextstates;
    foreach $s (@newstates) {
      my $nextstate = $self->{nfa}->transition_table($s)->{''};
      next unless defined $nextstate;
      push @nextstates, ref $nextstate ? @$nextstate : $nextstate;
    }
    @newstates = grep {! $seen{$_}++} @nextstates;
    last unless @newstates;
    push @result, @newstates;
  }

  $self->{states} = \@result;
}

# 
# Perform a transition
# 
sub transit {
  my $self = shift;
  my $symbol = shift;
  
  $self->{states} = $self->transition_table->{$symbol};
}

#
# Current states
#
sub states {
  my $self = shift;
  @{$self->{states}};
}


#
# Should we accept?
#
sub final_state {
  my $self = shift;
  my $s;
  foreach $s ($self->states) {
    return 1 if $self->{nfa}->is_end_state($s);
  }
  0;
}

#
# Get current transition table
# This is interesting because we have to merge the transition
# tables for several states.
sub transition_table {
  my $self = shift;
  my $s;
  my %ttab;
  foreach $s ($self->states) {
    my $sub_ttab = $self->{nfa}->transition_table($s);
    my ($symbol, $next_state);
    while (($symbol, $next_state) = each %$sub_ttab) {
      push @{$ttab{$symbol}}, ref $next_state ? @$next_state : $next_state;
    }
  }
  \%ttab;
}

1;
