#!/usr/bin/perl -w
# -*-Perl-*-
# Last changed Time-stamp: <2006-08-15 22:04:27 ivo>
# after predicting a conensus structure using RNAalifold,
# refold the indivual sequences, using the consensus structure as constraint

# read a clustal fromat alignment, plus either the standart output of
# RNAalifold or a dot plot produced by "RNAalifold -p"

# usage refold.pl [-t thresh] clustal.aln alidot.ps
# or    refold.pl clustal.aln clustal.alifold

use Getopt::Long;
#use POSIX;
#use GD;
use strict;

my $thresh = 0.9;
my $help = 0;

GetOptions ('-t=f' => \$thresh,
	    "help"=>\$help,
	    "h"=>\$help,
	    );

if ($help){
  print "\  refold.pl [-t threshold] myseqs.aln alidot.ps | RNAfold -C\n";
  print "or refold.pl myseqs.aln myseqs.alifold | RNAfold -C\n";
  print " -t ... use only pairs with p>thresh (default: 0.9)\n\n";
  exit(1);
}

my $aln = readClustal();
my $len = length($aln->[0]->{seq});

$_ = <>;
#print STDERR "$_\n";
my $constraint = (/^%\!PS/) ? read_dot() : read_alifold();

#print STDERR "$constraint\n";

foreach my $a (@$aln) {
    print "> $a->{name}\n";

    # remove gaps
    my $seq = $a->{seq};
    my $cons = $constraint;
    my @pt = make_pair_table($cons);
    my %pair = ("AU" => 5,
		"GC" => 1, 
		"CG" => 2, 
		"UA" => 6,
		"GU" => 3, 
		"GT" => 3, 
		"TG" => 4, 
		"UG" => 4,
		"AT" => 5, 
		"TA" => 6);


    for my $p (0..length($seq)-1) {
	# open non-compatible pairs as well as pairs to a gap position
	my $c = substr($seq,$p,1);
	if ($c eq '-') {
	    substr($cons,$p,1) = 'x'; # mark for removal
	    substr($cons,$pt[$p],1) = '.' 
		if $pt[$p]>0  && substr($cons,$pt[$p],1) ne 'x';   # open pair
	}
	elsif ($pt[$p]>$p) {
	    substr($cons,$p,1) = substr($cons,$pt[$p],1) = '.'
		unless exists $pair{$c . substr($seq,$pt[$p],1)};
	}
    }
#    print STDERR length($seq), length($cons), "\n";
    $cons =~ s/x//g;
    $seq  =~ s/-//g;
#    print STDERR length($seq), length($cons), "\n";
    print "$seq\n$cons\n";
}


sub make_pair_table {
   #indices start at 0 in this version!
   my $structure = shift;
   my (@olist, @table);
   my ($hx,$i) = (0,0);

   foreach my $c (split(//,$structure)) {
       if ($c eq '.') {
	  $table[$i]= -1;
     } elsif ($c eq '(') {
	  $olist[$hx++]=$i;
     } elsif ($c eq ')') {
	 my $j = $olist[--$hx];
	 die ("unbalanced brackets in make_pair_table") if ($hx<0);
	 $table[$i]=$j;
	 $table[$j]=$i;
     }
      $i++;
   }
   die ("too few closed brackets in make_pair_table") if ($hx!=0);
   return @table;
}

sub read_alifold {
    while (<>) {
	return $1 if /^([(.)]+)/;
    }
}

sub read_dot {
    my $cons = '.' x $len;
    while(<>) {
	next if /^%/;
	next unless /lbox$/;
	my @F = split;
	next if $F[5] < $thresh;
	substr($cons,$F[3]-1,1) = '(';
	substr($cons,$F[4]-1,1) = ')';
    }
    return $cons;
}

######################################################################
#
# readClustal(filehandle)
#
# Reads Clustal W formatted alignment file and returns it in list of
# hash references with keys "name" and "seq" for the name and the sequence,
# resp.
# 
# Fixme: the code below assumes all uppercase sequences 
######################################################################

sub readClustal{
#  my $fh=shift;
  my @out=();
  my (%order, $order, %alignments);
  while(<>) {
	last if eof;
	next if ( /^\s+$/ );
	my ($seqname, $aln_line) = ('', '');
	if( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) {
	  # clustal 1.4 format
	  ($seqname,$aln_line) = ("$1/$2-$3",$4);
	} elsif( /^(\S+)\s+([A-Z\-]+)\s*$/ ) {
	  ($seqname,$aln_line) = ($1,$2);
	} else {
	  next;
  }
	if( !exists $order{$seqname} ) {
	  $order{$seqname} = $order++;
	}
	$alignments{$seqname} .= $aln_line;
  }

  foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) {
	if( $name =~ /(\S+):(\d+)-(\d+)/ ) {
	  (my $sname,my $start, my $end) = ($1,$2,$3);
	} else {
	  (my $sname, my $start) = ($name,1);
	  my $str  = $alignments{$name};
	  $str =~ s/[^A-Za-z]//g;
	  my $end = length($str);
	}
	my $seq=$alignments{$name};
	push @out, {name=>$name,seq=>$seq};
  }
  return [@out];
}

######################################################################
#
# getPairs(\@aln alnref, $ss string)
#
# Evalutates the pairing of an alignment according to a given
# consensus secondary structure
#
# Returns list of all base pairs which is a hash with the following
# keys:
#
#  open ... column in the alignment of the first base in the pair "("
#  close ... column in the alignment of the second base in the pair ")"
#  all ... list of all basepairs in the different sequences in the alignment
#  pairing ... list of all different pairing basepairs
#  nonpairing ... list of all incompatible basepairs
#
######################################################################

sub getPairs{

  my @inputAln=@{$_[0]};
  my $ss=$_[1];

  # return nothing if there are no pairs
  if (!($ss=~tr/(/(/)){
	return ();
  }

  my @aln=();
  foreach my $row (@inputAln){
	my $seq=$row->{seq};
	$seq=uc($seq);
	$seq=~s/T/U/g;
	my @tmp=split(//,$seq);
	push @aln,\@tmp;
  }
  my @ss=split(//,$ss);

  my @pairs=();
  my @stack=();

  foreach my $column (0..$#ss){

	my $currChar=$ss[$column];

	if ($currChar eq '('){
	  push @stack,$column;
	}

	if ($currChar eq ')'){
	  my $openedCol=pop @stack;
	  push @pairs,{open=>$openedCol,close=>$column};
	}
  }

  @pairs=sort {$a->{open} <=> $b->{open}} @pairs;

  foreach my $i (0..$#pairs){
	#print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n";

	my @all=();
	my @pairing=();
	my @nonpairing=();

	for my $j (0..$#aln){
	  my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}];
	  push @all,$currPair;
	}

	for my $pair (@all){
	  if (($pair eq 'AU') or
		  ($pair eq 'UA') or
		  ($pair eq 'GC') or
		  ($pair eq 'CG') or
		  ($pair eq 'UG') or
		  ($pair eq 'GU')){

		push @pairing, $pair;
	  } elsif ($pair eq "--"){
		# do nothing
	  } else {
		push @nonpairing,$pair;
	  }
	}

	undef my %saw;
    my @uniquePairing = grep(!$saw{$_}++, @pairing);

	$pairs[$i]->{all}=[@all];
	$pairs[$i]->{pairing}=[@uniquePairing];
	$pairs[$i]->{nonpairing}=[@nonpairing];
  }

  return @pairs;

}


=head1 NAME

refold.pl - refold using consensus structure as constraint

=head1 SYNOPSIS

  refold.pl [-t thresh] file.aln alidot.ps
  refold.pl file.aln file.alifold

=head1 DESCRIPTION

refold.pl reads an alignment in CLUSTAL format, and a consensus
secondary structure, either in the form of the standard output of
C<RNAalifold>, or in the form of a dot plot as produced by
C<RNAalifold -p>. For each sequence in the alignment it writes the
name, sequence, and constraint structure to stdout in a format
suitable for piping into C<RNAfold -C>.

The constraint string is produced by removing from the consensus
structure all gaps, as well as all pairs not compatible with the
particular sequence. If the structure is read from a dot plot file,
only pairs with probability > then some threshold (default 0.9) are
used.

=head1 OPTIONS

=over 4

=item B<-t> I<thresh>

use only pairs with p>thresh as constraint. Only applicable when
reading consensus structure from a dot plot.

=back

=head1 EXAMPLE

  RNAalifold test.aln > test.alifold
  refold.pl test.aln test.alifold > test.cfold
  RNAfold -C < test.cfold

=cut
