package DiffXML;

use warnings;
use strict;

require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(diff_xml);

use Fcntl qw(O_RDONLY);
use XML::Parser::Expat;

my $TYPE_CHAR = 1;
my $TYPE_START = 2;
my $TYPE_END = 3;
my $TYPE_ATTR = 4;


#
# Take 2 xml files (xml1, xml2) and a patterns to ignore.
#
# If xml1 or xml2 is invalid xml it should return a sentence describing
# the first error it finds including where in the file it is.
#
# If xml1 is identical to xml2 it should return an empty string.
#
# If xml1 is different to xml2 is should return a sentence describing the 
# first difference it finds including where in the file it is.
#
# It should allow attributes to be reordered but their values should be
# the same including space. If an attribute is missing or added then that
# counts as a difference. Elements should be ordered identically. Text
# within elements will have leading and trailing space ignored but non-
# whitespace must match identically.
#
# Every part of the xml file is given a name and users can pass regular
# expressions which are matched against these names to determine if an
# element and all its sub elements should be ignored.
#
# In the naming system elements and attributes are separated by colons ':'.
# Attribute names are begun with a at symbol '@' and contained text is 
# named '#value'.
#
# So for example:
# dreme:runtime:@cpu
#
# Is the name of the cpu attribute of the runtime element.
#
# Also for example:
# dreme:model:seed:#value
#
# Is the name of the text contained within the seed element.
#
#


sub diff_xml {
  my ($xmlfilename1, $xmlfilename2, @patterns) = @_;
  my ($fh1, $fh2);

  # make regular expressions from the patterns to ignore
  my @ignore_regex = ();
  for my $pattern (@patterns) {
    push(@ignore_regex, qr/$pattern/);
  }

  sysopen($fh1, $xmlfilename1, O_RDONLY) or die("Failed to open $xmlfilename1");
  sysopen($fh2, $xmlfilename2, O_RDONLY) or die("Failed to open $xmlfilename2");

  my ($s1, $p1) = &make_parser(@ignore_regex);
  my ($s2, $p2) = &make_parser(@ignore_regex);

  my $message = '';
  while (!$message && &has_more($s1) && &has_more($s2)) {
    $message = &load_more_events($s1, $p1, $fh1, $s2, $p2, $fh2);
    last if $message;
    my ($e1, $e2);
    while (&has_event($s1) && &has_event($s2)) {
      $e1 = &take_event($s1);
      $e2 = &take_event($s2);
      if ($e1->{type} != $e2->{type} || $e1->{name} ne $e2->{name}) {
        $message = "A difference was detected between the reference " . 
            "and tested xml files.\n" .
            "The reference xml at line " . $e1->{line} . ", column " . 
            ($e1->{column} + 1) . " has a " . &type_str($e1->{type}) . 
            " with the name " . $e1->{name} . ".\n" .
            "But the tested xml at line " . $e2->{line} . ", column ". 
            ($e2->{column} + 1) . " has a " . &type_str($e2->{type}) . 
            " with the name " . $e2->{name} . ".\n";
        last;
      }
      # type and name match so check data if applicable
      if ($e1->{type} == $TYPE_ATTR) {
        if ($e1->{data} ne $e2->{data}) {
          $message = "A difference was detected between the reference " . 
              "and tested xml files.\n" .
              "The element attribute named " . $e1->{name} . 
              " is not consistent.\n" .
              "The reference xml at line " . $e1->{line} . ", column " . 
              ($e1->{column} + 1) . " has the value \"" . $e1->{data} . "\".\n".
              "But the tested xml at line " . $e2->{line} . ", column ". 
              ($e2->{column} + 1) . " has the value \"" . $e2->{data} . "\".\n";
          last;
        }
      } elsif ($e1->{type} == $TYPE_CHAR) {
        # this gets messy as we now need to compare across lists of events
        # that may not be equal in length
        &replace_event($s1, $e1);
        &replace_event($s2, $e2);
        # do the work in another method to make things cleaner
        $message = &diff_xml_chars($s1, $p1, $fh1, $s2, $p2, $fh2);
      }
    }
  }

  if (!$message && (&has_more($s1) || &has_more($s2))) {
    # I don't think this state can occur because the end tag
    # would not match
    $message = "Unmatched events\n";
  }

  close($fh1);
  close($fh2);

  return $message;
}

sub diff_xml_chars {
  my ($s1, $p1, $fh1, $s2, $p2, $fh2) = @_;
  my $message = '';
  my ($line1, $col1, $line2, $col2);
  my $trailing_space_match = 1;
  while (!$message && (&has_chars($s1) || &has_chars($s2))) {
    $message = &load_more_events($s1, $p1, $fh1, $s2, $p2, $fh2);
    last if $message;
    if (&has_chars($s1) && &has_chars($s2)) {
      my $e1 = &take_event($s1);
      my $e2 = &take_event($s2);
      if ($trailing_space_match) {
        $line1 = $e1->{line};
        $col1 = $e1->{column};
        $line2 = $e2->{line};
        $col2 = $e2->{column};
      }
      my $len1 = length($e1->{data});
      my $len2 = length($e2->{data});
      my $min = ($len1 < $len2 ? $len1 : $len2);
      for (my $i = 0; $i < $min; $i++) {
        my $c1 = substr($e1->{data}, $i, 1);
        my $c2 = substr($e2->{data}, $i, 1);
        # If either of the characters is not a space then they must match
        # and there must not have been any mismatches in preceeding whitespace
        if (!(&is_spaces($c1) && &is_spaces($c2)) && ($c1 ne $c2 || !$trailing_space_match)) {
          $message = "A difference was detected between the reference " . 
              "and tested xml files.\n" .
              "The element text content named " . $e1->{name} . 
              " is not consistent.\n" .
              "The reference xml at line " . $line1 . ", column " . 
              ($col1 + 1) . " is not consistent with " .
              "the tested xml at line " . $line2 . ", column ". 
              ($col2 + 1) . ".\n";
          last;
        } elsif ($c1 eq $c2 && $trailing_space_match) {
          # update last match location
          if (&is_nl($c1)) {
            $col1 = 0;
            $col2 = 0;
            $line1++;
            $line2++;
          } else {
            $col1++;
            $col2++;
          }
        } else {
          $trailing_space_match = 0;
        }
      }
      # push back left over characters as events
      if ($len1 < $len2) {
        my $event = &make_event($TYPE_CHAR, $e2->{name}, 
          substr($e2->{data}, $min), $line2, $col2);
        &replace_event($s2, $event);
      } elsif ($len2 < $len1) {
        my $event = &make_event($TYPE_CHAR, $e1->{name}, 
          substr($e1->{data}, $min), $line1, $col1);
        &replace_event($s1, $event);
      }
    } else {
      my $event;
      # at most only one of these will have an event
      $event = &take_event($s1) if (&has_chars($s1));
      $event = &take_event($s2) if (&has_chars($s2));
      if ($event && !&is_spaces($event->{data})) {
        $message = "A difference was detected between the reference " . 
            "and tested xml files.\n" .
            "The element text content named " . $event->{name} . 
            " is not consistent.\n" .
            "The reference xml at line " . $line1 . ", column " . 
            ($col1 + 1) . " is not consistent with " .
            "the tested xml at line " . $line2 . ", column ". 
            ($col2 + 1) . ".\n";
        last;
      }
    }
  }
  return $message;
}

sub type_str {
  my ($val) = @_;
  if ($val == $TYPE_CHAR) {
    return "element text content";
  } elsif ($val == $TYPE_START) {
    return "element start";
  } elsif ($val == $TYPE_END) {
    return "element end";
  } elsif ($val == $TYPE_ATTR) {
    return "element attribute";
  }
}

sub is_spaces {
  my ($c) = @_;
  return $c =~ m/^[ \t\n]*$/;
}

sub is_blank {
  my ($c) = @_;
  return $c =~ m/^[ \t]$/;
}

sub is_nl {
  my ($c) = @_;
  return $c =~ m/^\n$/;
}

sub make_event {
  my ($type, $name, $data, $line, $column) = @_;
  return {
      type => $type, 
      name => $name, 
      data => $data, 
      line => $line,
      column => $column
    };
}

sub queue_event {
  my ($state, $type, $name, $data, $line, $column) = @_;
  push(@{$state->{events}},
    &make_event($type, $name, $data, $line, $column)
  );
}

sub take_event {
  my ($state) = @_;
  my $event = shift(@{$state->{events}});
  return $event;
}

sub replace_event {
  my ($state, $event) = @_;
  unshift(@{$state->{events}}, $event);
}

sub has_event {
  my ($state) = @_;
  return scalar(@{$state->{events}}) > 0;
}

sub has_more {
  my ($state) = @_;
  return !$state->{finished} || scalar(@{$state->{events}}) > 0
}

sub has_chars {
  my ($state) = @_;
  if (scalar(@{$state->{events}}) > 0) {
    my $event = @{$state->{events}}[0];
    return $event->{type} == $TYPE_CHAR;
  } else {
    return !$state->{finished};
  }
}

sub should_ignore {
  my ($name, @ignore_re_list) = @_;
  for my $ignore_re (@ignore_re_list) {
    return 1 if ($name =~ m/$ignore_re/);
  }
  return 0;
}

sub full_name {
  my @elements = @_;
  return '' unless @elements;
  my $full_name = $elements[0]->{name};
  for (my $i = 1; $i < scalar(@elements); $i++) {
    my $element = $elements[$i];
    $full_name .= ':' . $element->{name};
  }
  return $full_name;
}

sub load_more_events {
  my ($s1, $p1, $fh1, $s2, $p2, $fh2) = @_;
  &parse_file_part($s1, $p1, $fh1);
  return &xml_parse_error($s1, "reference") if $s1->{error};
  &parse_file_part($s2, $p2, $fh2);
  return &xml_parse_error($s2, "reference") if $s2->{error};
  return '';
}

sub xml_parse_error {
  my ($state, $file_desc) = @_;
  my $message = "The " . $file_desc . " xml was malformed ".
      "so parsing it gave the error \"" . $state->{error} . 
      "\" at the line " . $state->{line} . ", column " . $state->{column} . 
      ".\n";
  return $message;
}

sub parse_file_part {
  my ($state, $parser, $fh) = @_;
  my $buffer;

  return if ($state->{finished});

  eval {
    # read until the state contains some data to compare
    while (scalar(@{$state->{events}}) == 0 && !$state->{error}) {
      my $count = read($fh, $buffer, 100);
      die("Error reading file") unless defined($count);
      if ($count == 0) {
        $state->{finished} = 1;
        $parser->parse_done();
        last;
      }
      $parser->parse_more($buffer);
    }
  };
  if ($@) {
    my $message = $@;
    if ($message =~ m/(.*) at line (\d+), column (\d+), byte \d+ at /) {
      $state->{error} = $1;
      $state->{line} = $2;
      $state->{column} = $3;
    } else {
      die($@);
    }
  }
}

sub make_parser {
  my $state = {
    ignore => \@_,
    finished => 0,
    text => 0,
    error => "",
    line => 1,
    column => 0,
    elements => [], # stack of {
                    #   name => ELEM_NAME, index => INDEX, 
                    #   position => INDEX_OF_SAME, children => COUNT,  
                    #   counts => {sub_element_name => COUNT}
                    #   ignore => [0|1]
                    # }
    events => []  # queue of {type => TYPE, name => FULL_NAME, data => TEXT_OR_UNDEF}
  };
  my $parser = new XML::Parser::ExpatNB; # non blocking parser
  $parser->setHandlers(Start => sub {handle_start($state, @_);});
  $parser->setHandlers(End => sub {handle_end($state, @_);});
  $parser->setHandlers(Char => sub {handle_char($state, @_);});

  return ($state, $parser);
}

sub handle_start {
  my ($state, $expat, $element, @attr_pairs) = @_;
  # don't bother if we've seen any errors
  return if $state->{error};

  if ($state->{text}) {# TODO add position information
    $state->{error} = "character data and elements intermixed";
    $state->{line} = $expat->current_line;
    $state->{column} = $expat->current_column;
    return;
  }

  # update elements
  my $index = 1;
  my $position = 1;
  my $ignore = 0;
  if (scalar(@{$state->{elements}}) > 0) {
    my $parent = @{$state->{elements}}[-1];
    $parent->{children} += 1;
    $index = $parent->{children};
    $position = $parent->{counts}->{$element};
    $position = 0 unless defined($position);
    $position += 1;
    $parent->{counts}->{$element} = $position;
    $ignore = $parent->{ignore};
  }
  my $elem = {
    name => $element, 
    index => $index, 
    position => $position, 
    children => 0, 
    counts => {},
    ignore => $ignore
  };
  push(@{$state->{elements}}, $elem);

  # before we push any events, check if we're ignoring some parent element
  return if $ignore;

  my $name = &full_name(@{$state->{elements}});

  $ignore = &should_ignore($name, @{$state->{ignore}});

  $elem->{ignore} = $ignore;

  return if $ignore;

  # push element start event
  &queue_event($state, $TYPE_START, $name, undef, $expat->current_line, 
    $expat->current_column);

  # now get the attributes, sort them and check each one to see if it should 
  # be ignored before adding an attribute event for it.
  
  my %attributes = @attr_pairs;

  my @attr_names = sort(keys(%attributes));
  
  for my $attr_name (@attr_names) {
    my $attr_full_name = $name . '@' . $attr_name;
    if (! &should_ignore($attr_full_name, @{$state->{ignore}})) {
      my $value = '';
      # check that we should send the attribute value as well
      # as we give the option of just ignoring the value but
      # not the existance of the attribute.
      if (! &should_ignore($attr_full_name . '#value', @{$state->{ignore}})) {
        $value = $attributes{$attr_name};
      }
      &queue_event($state, $TYPE_ATTR, $attr_full_name, $value,
        $expat->current_line, $expat->current_column);
    }
  }

}

sub handle_end {
  my ($state, $expat, $element) = @_;
  # don't bother if we've seen any errors
  return if $state->{error};

  $state->{text} = 0;

  my $current = pop(@{$state->{elements}});
  return if ($current->{ignore});

  # push event
  my $name = &full_name(@{$state->{elements}}, $current);
  &queue_event($state, $TYPE_END, $name, undef, $expat->current_line, 
    $expat->current_column);
}

sub handle_char {
  my ($state, $expat, $string) = @_;
  # don't bother if we've seen any errors
  return if $state->{error};

  my $line = $expat->current_line;
  my $column = $expat->current_column;

  if (!$state->{text}) {
    # note that this assumes that we're not using binary mode
    return if &is_spaces($string);
    for (my $i = 0; $i < length($string); $i++) {
      my $c = substr($string, $i, 1);
      if (&is_blank($c)) {
        $column++;
      } elsif (&is_nl($c)) {
        $line++;
        $column = 0;
      } else {
        $string = substr($string, $i);
        last;
      }
    }
  }
  $state->{text} = 1;

  my $current = @{$state->{elements}}[-1];

  # this isn't done first because we need to be
  # sure the text isn't just spaces, tabs and newlines.
  if ($current->{children} > 0) {
    $state->{error} = "character data and elements intermixed";
    $state->{line} = $line;
    $state->{column} = $column;
    return;
  }

  return if ($current->{ignore});

  my $name = &full_name(@{$state->{elements}}) . '#value';

  return if (&should_ignore($name, @{$state->{ignore}}));

  # push new event
  &queue_event($state, $TYPE_CHAR, $name, $string, $line, $column);
}


