#!/usr/bin/perl -w

use Getopt::Long;
use Pod::Usage;
#use POSIX;
#use GD;
use strict;

my $ssFile='alirna.ps';
my $columnWidth=120;
my $formatT=0;
my $man=0;
my $ruler=0;
my $resnum=0;

GetOptions ('-s:s' => \$ssFile,
	    '-c:i' => \$columnWidth,
	    '-t' => \$formatT,
	    '-r' => \$ruler,
	    '-n' => \$resnum,
	    "help"=> \$man,
	    "man" => \$man,
	    "h"=> sub {pod2usage(-verbose => 1)},
	   ) || pod2usage(-verbose => 0);

pod2usage(-verbose => 2) if $man;

if (!-e $ssFile) {
    pod2usage(-msg => "RNAalifold secondary structure file $ssFile not found (use option -s)",
	      -verbose => 0);
  exit(1);
}

my $aln=readClustal();

my @ss=();
for my $i (1..length($aln->[0]->{seq})) {
  push @ss,'.';
}

my $consStruc='';

open(ALIRNA,"<$ssFile");
my $pairsFlag=0;
my $sequenceFlag=0;
while (<ALIRNA>) {
  $pairsFlag=1 if (/\/pairs/);
  if ($pairsFlag and /\[(\d+) (\d+)\]/) {
    $ss[$1-1]='(';
    $ss[$2-1]=')';
  }
  $pairsFlag=0 if ($pairsFlag and /def/);
}

$consStruc=join('',@ss);

for my $row (@$aln) {
  $row->{seq}=uc($row->{seq});
  if ($formatT) {
    $row->{seq}=~s/U/T/g;
  } else {
    $row->{seq}=~s/T/U/g;
  }
}


my $consSeq=consensusSeq($aln);

#print "$consSeq\n$consStruc\n";

plotAln($aln,$consSeq,$consStruc);



######################################################################
#
# consensusSeq(\@aln alnref)
#
# Returns consensus sequence of alignment
#
######################################################################

sub consensusSeq{

  my @aln=@{$_[0]};
  my $out='';

  for my $i (0..length($aln[0]->{seq})-1) {

    my %countHash=('A'=>0,'C'=>0,'G'=>0,'T'=>0,'U'=>0,'-'=>0);
    #print %countHash,"\n";
    for my $j (0..$#aln) {
      my $c=substr($aln[$j]->{seq},$i,1);
      $countHash{$c}++;
    }
    #print %countHash,"\n";
    my $maxCount=0;
    my $maxChar='';

    for my $c ('A','C','G','T','U','-') {
      if ($countHash{$c}>=$maxCount) {
	$maxChar=$c;
	$maxCount=$countHash{$c};
      }
    }
    $out.=$maxChar;
  }
  return $out;
}


######################################################################
#
# readClustal(filehandle)
#
# Reads Clustal W formatted alignment file and returns it in list of
# hash references with keys "name" and "seq" for the name and the sequence,
# resp.
#
######################################################################

sub readClustal{
  #  my $fh=shift;
  my @out=();
  my (%order, $order, %alignments);
  while (<>) {
    next if ( /^\s+$/ );
    my ($seqname, $aln_line) = ('', '');
    if ( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) {
      # clustal 1.4 format
      ($seqname,$aln_line) = ("$1/$2-$3",$4);
    } elsif ( /^(\S+)\s+([A-Z\-]+)\s*\d*$/ ) {
	  ($seqname,$aln_line) = ($1,$2);
	} else {
	  next;
  }
	  if ( !exists $order{$seqname} ) {
	    $order{$seqname} = $order++;
	  }
    $alignments{$seqname} .= $aln_line;
  }

  foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) {
    if ( $name =~ /(\S+):(\d+)-(\d+)/ ) {
      (my $sname,my $start, my $end) = ($1,$2,$3);
    } else {
      (my $sname, my $start) = ($name,1);
      my $str  = $alignments{$name};
      $str =~ s/[^A-Za-z]//g;
      my $end = length($str);
    }
    my $seq=$alignments{$name};
    push @out, {name=>$name,seq=>$seq};
  }
  return [@out];
}

######################################################################
#
# getPairs(\@aln alnref, $ss string)
#
# Evalutates the pairing of an alignment according to a given
# consensus secondary structure
#
# Returns list of all base pairs which is a hash with the following
# keys:
#
#  open ... column in the alignment of the first base in the pair "("
#  close ... column in the alignment of the second base in the pair ")"
#  all ... list of all basepairs in the different sequences in the alignment
#  pairing ... list of all different pairing basepairs
#  nonpairing ... list of all incompatible basepairs
#
######################################################################

sub getPairs{

  my @inputAln=@{$_[0]};
  my $ss=$_[1];

  # return nothing if there are no pairs
  if (!($ss=~tr/(/(/)) {
    return ();
  }

  my @aln=();
  foreach my $row (@inputAln) {
    my $seq=$row->{seq};
    $seq=uc($seq);
    $seq=~s/T/U/g;
    my @tmp=split(//,$seq);
    push @aln,\@tmp;
  }
  my @ss=split(//,$ss);

  my @pairs=();
  my @stack=();

  foreach my $column (0..$#ss) {

    my $currChar=$ss[$column];

    if ($currChar eq '(') {
      push @stack,$column;
    }

    if ($currChar eq ')') {
      my $openedCol=pop @stack;
      push @pairs,{open=>$openedCol,close=>$column};
    }
  }

  @pairs=sort {$a->{open} <=> $b->{open}} @pairs;

  foreach my $i (0..$#pairs) {
    #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n";

    my @all=();
    my @pairing=();
    my @nonpairing=();

    for my $j (0..$#aln) {
      my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}];
      push @all,$currPair;
    }

    for my $pair (@all) {
      if (($pair eq 'AU') or
	  ($pair eq 'UA') or
	  ($pair eq 'GC') or
	  ($pair eq 'CG') or
	  ($pair eq 'UG') or
	  ($pair eq 'GU')) {

	push @pairing, $pair;
      } elsif ($pair eq "--") {
	# do nothing
      } else {
	push @nonpairing,$pair;
      }
    }

    undef my %saw;
    my @uniquePairing = grep(!$saw{$_}++, @pairing);

    $pairs[$i]->{all}=[@all];
    $pairs[$i]->{pairing}=[@uniquePairing];
    $pairs[$i]->{nonpairing}=[@nonpairing];
  }

  return @pairs;

}


######################################################################
#
# plotAln(\@aln ref-to-alignment, $consensus string, $ss string)
#
# Creates a image of the alignment with various annotations
#
# \@aln ... alignment in list of hash format
# $consensus ... consensus sequence
# $ss ... consensus secondary structure in dot bracket notation
#
# Returns png as string.
#
######################################################################

sub plotAln{

  my @aln=@{$_[0]};
  my $consensus=$_[1];
  my $ss=$_[2];

  my $ps='';

  # Get important measures
  (my $fontWidth, my $fontHeight) = (6,6.5);

  my $length=length($aln[0]->{seq});
  my $maxName=0;
  foreach my $row (@aln) {
    $maxName=length($row->{name}) if (length($row->{name})>$maxName);
  }
  $maxName = 5 if $maxName<5 && $ruler;

  # Custom Sizes
  my $lineStep=$fontHeight+2;	# distance between lines
  my $blockStep=3.5*$fontHeight; # distance between blocks
  my $consStep=$fontHeight*0.5;	# distance between alignment and conservation curve
  my $ssStep=2;
  my $nameStep=3*$fontWidth;	# distance between names and sequences
  my $maxConsBar=2.5*$fontHeight; # Height of conservation curve
  my $startY=2;			# "y origin"
  my $namesX=$fontWidth;	# "x origin"
  my $seqsX=$namesX+$maxName*$fontWidth+$nameStep; # x-coord. where sequences start

  # Calculate width and height

  my $tmpColumns=$columnWidth;

  $tmpColumns=$length if ($length<$columnWidth);

  my $imageWidth = int(0.9+$namesX+($maxName+$tmpColumns)*$fontWidth+$nameStep+$fontWidth);
  $imageWidth += int($fontWidth + length("$length")*$fontWidth) if $resnum;
  my $numBlock = int(1+($length-1)/$columnWidth); 
  my $imageHeight = int($startY+$numBlock*((@aln+1+$ruler)*$lineStep+$blockStep+$consStep+$ssStep));


  my $white="1 1 1";
  my $black="0 0 0" ;
  my $grey="1 1 1";

  my $red="0.0 1";
  my $ocre="0.16 1";
  my $green="0.32 1";
  my $turq="0.48 1";
  my $blue="0.65 1";
  my $violet="0.81 1";


  my $red1="0.0 0.6";
  my $ocre1="0.16 0.6";
  my $green1="0.32 0.6";
  my $turq1="0.48 0.6";
  my $blue1="0.65 0.6";
  my $violet1="0.81 0.6";

  my $red2="0.0 0.2";
  my $ocre2="0.16 0.2";
  my $green2="0.32 0.2";
  my $turq2="0.48 0.2";
  my $blue2="0.65 0.2";
  my $violet2="0.81 0.2";


  my @colorMatrix=([$red,$red1,$red2],
		   [$ocre,$ocre1,$ocre2],
		   [$green,$green1,$green2],
		   [$turq,$turq1,$turq2],
		   [$blue,$blue1,$blue2],
		   [$violet,$violet1,$violet2]);

  my @pairs=getPairs(\@aln,$ss);

  foreach my $pair (@pairs) {

    foreach my $column ($pair->{open},$pair->{close}) {
      my $block = int(0.999+($column+1)/$columnWidth);
      my $effCol=$column-($block-1)*$columnWidth;
      my $x=$seqsX+$effCol*$fontWidth;

      foreach my $row (0..$#aln) {

	my $pairing=@{$pair->{pairing}};
	my $nonpairing=@{$pair->{nonpairing}};

	my $color;

	if ($nonpairing <=2) {
	  $color=$colorMatrix[$pairing-1][$nonpairing];

	  if (($pair->{all}->[$row] eq 'AU') or
	      ($pair->{all}->[$row] eq 'UA') or
	      ($pair->{all}->[$row] eq 'GC') or
	      ($pair->{all}->[$row] eq 'CG') or
	      ($pair->{all}->[$row] eq 'GU') or
	      ($pair->{all}->[$row] eq 'UG')) {

	    my $y=$startY+($block-1)*($lineStep*(@aln+1+$ruler)+$blockStep+$consStep)+$ssStep*($block)+($row+1)*$lineStep;

	    my $xtmp=$x+$fontWidth;
	    my $ytmp1=$y-1;
	    my $ytmp2=$y+$fontHeight+1;
	    $ps.="$x $ytmp1 $xtmp $ytmp2 $color box\n";
	  }
	}
      }
    }
  }
  # Calculate conservation scores as (M+1)/(N+1), where M is the
  # number of matches to the consensus and N is the number of
  # sequences in the alignment

  my @conservation=();		# each column one score

  for my $column (0..$length-1) {

    my $consChar=substr($consensus,$column,1);

    # if consensus is gap, score=0
    if ($consChar eq "-" or $consChar eq "_") {
      push @conservation, 0;
      next;
    }

    my $match=0;
    for my $row (0..$#aln) {
      my $currChar=substr($aln[$row]->{seq},$column,1);
      $match++ if ($currChar eq $consChar);
    }

    my $score=($match-1)/(@aln-1);
    push @conservation,$score;
  }

  # Draw the alignments in chunks
  my $currY=$startY;
  my $currPos=0;

  while ($currPos<$length) {

    # secondary structure in first line
    #$out->string($font,$seqsX,$currY,substr($ss,$currPos,$columnWidth),$black);
    my $tmpSeq=substr($ss,$currPos,$columnWidth);
    $tmpSeq=~s/\(/\\\(/g;
    $tmpSeq=~s/\)/\\\)/g;
    $ps .= "0 setgray\n";
    $ps.="($tmpSeq) $seqsX $currY string\n";
    $currY+=$lineStep+$ssStep;

    # sequences labeled only with the organism-specifier
    foreach my $row (@aln) {
      #$out->string($font,$namesX,$currY,$row->{name},$black);
      $ps.="($row->{name}) $namesX $currY string\n";
      #$out->string($font,$seqsX,$currY,substr($row->{seq},$currPos,$columnWidth),$black);
      my $tmpSeq=substr($row->{seq},$currPos,$columnWidth);
      $ps.="($tmpSeq) $seqsX $currY string\n";
      if ($resnum) {
	my $numX=$seqsX+(length($tmpSeq)+1)*$fontWidth; # x-coord. where residue nums start
	$tmpSeq = substr($row->{seq},0, $currPos+$columnWidth);
	my $p = length($tmpSeq) - $tmpSeq =~ tr/_-//; # remove gaps
	$ps .= sprintf("(%" . length($length) . "d) $numX $currY string\n", $p);
      }
      $currY+=$lineStep;
    }
    if ($ruler) {
      $ps .= "(ruler) $namesX $currY string\n";
      my $p = 10 * (int(1.5+$currPos/10));
      my $r = "." x ($p-$currPos-length("$p"));
      $r .= $p; $p+=10;
      my $maxp = $currPos+$columnWidth;
      $maxp = $length if $maxp>$length;
      for (;$p<=$maxp; $p+=10) {
	$r .= sprintf("%10d", $p);
      }
      $r =~ tr/ /./;
      $r .= "." x ($maxp-$p+10) if $p>$maxp;
      $ps .= "($r) $seqsX $currY string\n";
      $currY+=$lineStep;
    }

    # conservation curve
    $currY+=$consStep;
    $ps .= "0.6 setgray\n";
    for my $col ($currPos..$currPos+$columnWidth-1) {
      my $score=shift @conservation;
      last if (!defined $score);
      my $barHeight=$maxConsBar*$score;
      $barHeight=1 if ($barHeight==0);
      my $x=$seqsX+($col-($columnWidth*int($currPos/$columnWidth)))*$fontWidth;

      my $ytmp1=$currY+$maxConsBar-$barHeight;
      my $xtmp2=$x+$fontWidth;
      my $ytmp2=$currY+$maxConsBar;
      $ps .= "$x $ytmp1 $xtmp2 $ytmp2 box2\n";
    }
    $currY+=$blockStep;
    $currPos+=$columnWidth;
  }

  my $BB="2 2 $imageWidth $imageHeight";

  while (<DATA>) {
    s/!BB!/$BB/;
    s/!HEIGHT!/$imageHeight/;
    print;
  }
  print $ps;
  print "showpage\n";
}

=head1 NAME

coloraln.pl - colorize an alignment with consensus structure

=head1 SYNOPSIS

  coloraln.pl [-r] [-n] [-c columns] [-s structure.ps] file.aln

=head1 DESCRIPTION

colorrna.pl reads an alignment in CLUSTAL format, and a consensus
secondary structure (which it extracts from a postscript secondary
structure plot). It produces a postscript figure of the alignment, in
which compensatory mutation supporting the consensus structure are
marked by color. The color scheme is the same employed by RNAalifold
and alidot: Red marks pairs with no sequence variation; ochre, green,
turquoise, blue, and violet mark pairs with 2,3,4,5,6 different tpyes
of pairs, respectively.

=head1 OPTIONS

=over 4

=item B<-s> I<file>

read I<file> to extract the consensus structure (default: C<alirna.ps>)

=item B<-c> I<width>

break alignments into blocks of at most I<width> columns, (default: 120)

=item B<-t>

suppress conversion of C<T> to C<U>, i.e. do not convert DNA to
RNA, (default: convert to C<U>)

=item B<-r>

add a "ruler" below the alignment showing sequence positions

=item B<-n>

write sequence position at the end of each sequence line

=back

=cut


__DATA__
%!PS-Adobe-3.0 EPSF-3.0
%%BoundingBox: !BB!
%%EndComments

% draws Vienna RNA like colored boxes
/box { % x1 y1 x2 y2 hue saturation
  gsave
  dup 0.3 mul 1 exch sub sethsbcolor
  exch 3 index sub exch 2 index sub rectfill
  grestore
} def

% draws a box in current color
/box2 { % x1 y1 x2 y2
  exch 3 index sub exch 2 index sub rectfill
} def

/string { % (Text) x y
  6 add
  moveto
  show
} def

0 !HEIGHT! translate
1 -1 scale
/Courier findfont
[10 0 0 -10 0 0] makefont setfont

%100 100 106 110 0.5 0.5 1 box
%(A) 100 100 string
%(This is a string) 200 200 string
