############################################################################
# demotic_fasta package   
#    Parses fasta or ssearch output, stores extracted information in convenient vars.
#    SRE, Wed Jun 25 13:41:41 2003
#
#  SVN $Id$
############################################################################

package demotic_fasta;

# parse(\*STDIN) would parse ssearch output
# coming in via stdin.
#
sub parse (*) {
    my $fh = shift;
    my $parsing_header  = 1;
    my $parsing_hitlist = 0;
    my $parsing_alilist = 0;
    my $target;
    my $alilinecount = 0;

    # Initialize everything... so we can call the parser
    # repeatedly, one call per ssearch output.
    #
    # This section also documents what's available in
    # variables within the package.
    # 
    # Arrays are indexed [0..nhits-1] or [0..nali-1]
    #
    $queryname      = "";	# Name of the query sequence
    $querydesc      = "";	# Description line for the query (or "")
    $querylen       = 0;	# Length of the query in residues
    $db             = "";	# Name of the target database file
    $db_nseq        = 0;	# Number of sequences in the target database
    $db_nletters    = "";	# Number of residues in the target database
                                # (stored as a string so we can have large #'s)

				# The top hit list (still in rank order)
    $nhits          = 0;	# Number of entries in the hit list
    @hit_target     = ();	# Target sequence name (by rank)
    %target_desc    = ();	# Target sequence description (by targ name)
    %target_len     = ();	# Length of target sequence
    @hit_score      = ();	# Raw score (by rank)
    @hit_bitscore   = ();	# Bit score (by rank)
    @hit_Eval       = ();	# E-value (by rank)

				# The alignment output (in order it came in)
				# all indexed by # of alignment [0..nali-1]
    $nali           = 0;	# Number of alignments
    @ali_target     = ();	# Target sequence name
    @ali_score      = ();	# Smith/Waterman raw score of alignment
    @ali_bitscore   = ();	# bit score
    @ali_evalue     = ();	# E-value
    @ali_nident     = ();	# Number of identical residues
    @ali_alen       = ();	# Length of alignment (overlap)
    @ali_identity   = ();	# Percent identity
#    @ali_npos       = (); # Number of positives (similar positions)
#    @ali_positive   = (); # Percent of positions matched or similar
    @ali_qstart     = ();	# Start position on query
    @ali_qend       = ();	# End position on query
    @ali_tstart     = ();	# Start position on target
    @ali_tend       = ();	# End position on target
    @ali_qali       = (); # Aligned string from query
    @ali_tali       = (); # Aligned string from target (subject)
    
    if (defined($save_querycount) && $save_querycount > 1) { # on subsequent queries, we had to use the >>> start line to detect
	# the end of the prev query; we socked the necessary info away in tmp vars.
	$querycount = $save_querycount;
	$queryname  = $save_queryname;
	$querydesc  = $save_querydesc;
	$querylen   = $save_querylen;
    }

    # Now, loop over the lines of our input, and parse 'em.
    #
    while (<$fh>) {
	if ($parsing_header) {
	    if (/^The best scores are:/) { # start of hit list
		$parsing_header  = 0;
		$parsing_hitlist = 1;
		next;
	    } elsif (/^!! No sequences/) { # no hit list: no hits; just return
		return 1;	# return "success"
	    } elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # allows blank query. \S\S is "nt" or "aa"
		$querycount = $1;
		$queryname  = $2;
		$querydesc  = $3;
		$querylen   = $4;
		if ($queryname eq "") { 
		    $queryname = "unnamed_query";
		}
	    } elsif (/^\s+(\d+)\s+residues in\s+(\d+)\s+sequences\s*$/) {
		$db_nletters = $1;
		$db_nseq     = $2;
	    }
	} 
	elsif ($parsing_hitlist) {
	    if (/^\s*$/) {	# blank line marks end of hit list, start of alignments
		$parsing_hitlist = 0;
		$parsing_alilist = 1;
		next;
	    } elsif (/^(\S+)\s*(.*\S?)\s*\(\s*(\d+)\)\s+(\d+)\s+(\S+)\s+(\S+)\s*$/) {
		$hit_target[$nhits]    = $1;
		$target_desc{$1}       = $2;
	        $target_len{$1}        = $3;
		$hit_score[$nhits]     = $4;
		$hit_bitscore[$nhits]  = $5;
		$hit_Eval[$nhits]      = $6;
		$nhits++;
	    }
	}
	elsif ($parsing_alilist) {
	    if (/^>>(\S+)\s*(.*)\s+\((\d+) \S\S\)\s*$/) {  # the \S\S is either nt or aa
		$target = $1;
		$target_desc{$target} = $2;
		if ($3 != $target_len{$target}) { die "can't happen.", "1)", $3, "2)", $target_len{$target}; }
	    } 
	    elsif (/^ s-w opt:\s+(\d+)\s+Z-score:\s*(\S+)\s+bits:\s+(\S+)\s+E\(\d*\):\s+(\S+)\s*$/) {  # SSEARCH
		$nali++;
		$ali_target[$nali-1]   = $target;
		$ali_score[$nali-1]    = $1;
		$ali_bitscore[$nali-1] = $3;
		$ali_evalue[$nali-1]   = $4;
	    } 
	    elsif (/^ initn:\s*\d+\s*init1:\s*\d+\s*opt:\s*(\d+)\s*Z-score:\s*(\S+)\s*bits:\s*(\S+)\s*E\(\d*\):\s*(\S+)\s*$/) { # FASTA
		$nali++;
		$ali_target[$nali-1]   = $target;
		$ali_score[$nali-1]    = $1;
		$ali_bitscore[$nali-1] = $3;
		$ali_evalue[$nali-1]   = $4;
	    }		
	    elsif (/^Smith-Waterman score:\s+(\d+);\s+(\S+)% identity .* in (\d+) \S\S overlap \((\d+)-(\d+):(\d+)-(\d+)\)\s*/) {
		$ali_identity[$nali-1]   = $2;
		$ali_alen[$nali-1]       = $3;
		$ali_qstart[$nali-1]     = $4;
		$ali_qend[$nali-1]       = $5;
		$ali_tstart[$nali-1]     = $6;
		$ali_tend[$nali-1]       = $7;
#		$ali_nident[$nali-1]     = $1;
#		$ali_npos[$nali-1]       = $4;
#		$ali_positive[$nali-1]   = $5;
		$alilinecount            = 0;
	    } 
	    elsif (/^\S+\s+(\S+)\s*$/) { # only ali lines are right-flushed
		if ($alilinecount % 2 == 0) {
		    $ali_qali[$nali-1]  .= $1; 
		} else {
		    $ali_qali[$nali-1]  .= $1; 
		}
		$alilinecount++;
	    }

	    elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # next query is starting. \S\S is "nt" or "aa"
		$save_querycount = $1;
		$save_queryname  = $2;
		$save_querydesc  = $3;
		$save_querylen   = $4;
		if ($save_queryname eq "") { $save_queryname = "unnamed_query"; }
		return 1;	# normal return. We've finished output for a query, and stored some info about the next one.
	    }
	}
    } # this closes the loop over lines in the input stream: at EOF.

    if ($parsing_alilist) { return 1; } else { return 0; }  # at EOF: normal return if we were in the alignment section.
}


sub exblxout {
    my $ofh     = shift;
    my $i;
    
    for ($i = 0; $i <= $nali-1; $i++) {
	printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n",
	$ali_evalue[$i],
	$ali_identity[$i],
	$ali_tstart[$i],
	$ali_tend[$i],
	$ali_target[$i],
	$ali_qstart[$i],
	$ali_qend[$i],
	$queryname;
    }
}

sub tblout {
    my $ofh     = shift;
    my $i;
    
    for ($i = 0; $i <= $nali-1; $i++) {
	printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n",
	$ali_evalue[$i],
	$ali_identity[$i],
	$ali_qstart[$i],
	$ali_qend[$i],
	$queryname,
	$ali_tstart[$i],
	$ali_tend[$i],
	$ali_target[$i],
	$target_desc{$ali_target[$i]};
    }
}    


sub gffout {
    my $ofh     = shift;
    my $source  = shift;
    my $feature = shift;
    my $i;
    my $strand;
    
    for ($i = 0; $i <= $nali-1; $i++) {
	if ($ali_qstart[$i] > $ali_qend[$i]) { $strand = "-"; }
	else { $strand = "+"; } 

	printf $ofh "%s\t%s\t%s\t%d\t%d\t%.1f\t%s\t.\tgene \"%s\"\n",
	$ali_target[$i],
	$source,
	$feature,
	$ali_tstart[$i],
	$ali_tend[$i],
	$ali_bitscore[$i],
	$strand,
	$queryname;
    }
}

sub profmarkout {
    my $ofh = shift;
    my $i;

    for ($i = 0; $i < $nhits; $i++) {
	printf $ofh "%g\t%.1f\t%s\t%s\n", $hit_Eval[$i], $hit_bitscore[$i], $hit_target[$i], $queryname;
    }
}


1;
__END__