#!/usr/local/bin/perl5

# Program: gc-stats.pl
# Purpose: parses output of gene-check script into summary statistics
# Test Protocal: http://genecats.soe.ucsc.edu/qa/test-protocols/gene-check.protocol.html
# Summary results: http://genecats.soe.ucsc.edu/qa/test-results/gene-check.results.html
# Author: Jennifer Jackson
# Int Date: 2005-11-14
# Rev Date: 2005-11-XX


$documentation = "See test protocol documentation
        http://genecats.soe.ucsc.edu/qa/test-protocols/gene-check.protocol.html";

if ($#ARGV != 2) { die
"\nUSAGE: $0 <genePreds_table> <gc-details> <gc-summary>
        Parses output of gene-check script into summary statistics
        Output <gc-stats> and <gc-report> are generated by program 
        STDERR is sent to screen
        <genePreds_table> file name format: pred_table.db.server.YYMMDD
	<gc-details>      file name format: pred_table.db.server.YYMMDD.gc-details
        <gc-summary>      file name format: pred_table.db.server.YYMMDD.gc-summary
        <gc-stats>        file name format: pred_table.db.server.YYMMDD.gc-stats
        <gc-report>       file name format: pred_table.db.server.YYMMDD.gc-report
	\nWhen complete, do the following to save your results ......	
	1)Save the <gc-report> file in report directory:
        /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results 
        2)Add the html contents of <gc-stats> into results file:
	/usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html
        3)Keep working files until QA complete in work directory, then delete:
        /cluster/store9/qa/gene-check
        $documentation\n\n";}

# State the program has started
print STDERR "PROCESSING: $0 program started, verifing input files\n";

# get input file names     
$in_preds   = $ARGV[0];
$in_detail  = $ARGV[1];
$in_summ    = $ARGV[2];

# attempt to open and die if cannot be found/read
open(PREDS,"$in_preds")   || die "ERROR:  Cannot open file $in_preds\n";
open(DETAIL,"$in_detail") || die "ERROR:  Cannot open file $in_detail\n";
open(SUMM,"$in_summ")     || die "ERROR:  Cannot open file $in_summ\n";

# verify file names of infiles
# die if database & server are not the same between all three
# warn if dates are different

# check genePreds_table input file name format
(@name_pred) = split(/\./,$in_preds);
if ($#name_pred != 3) { 
  die "ERROR:  $in_preds file name not formatted correctly
        <genePreds_table> file name format: pred_table.db.server.YYMMDD 
	$documentation\n"; 
}
$pred_table  = $name_pred[0];
$pred_db     = $name_pred[1];
$pred_server = $name_pred[2]; 
$pred_date   = $name_pred[3]; 

# check gc-details input file name format
(@name_detail) = split(/\./,$in_detail);
if ($#name_detail != 4) {
  die "ERROR:  $in_detail file name not formatted correctly
        <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details 
	$documentation\n"; 
}
$detail_table  = $name_detail[0];
$detail_file   = $name_detail[4];
if ($detail_file ne "gc-details") {
  die "ERROR:  $in_detail file name not formatted correctly or in wrong order
        USAGE:  $0 <genePreds_table> <gc-details> <gc-summary>
        <gc-details> file name format: pred_table.db.server.YYMMDD.gc-details
        $documentation\n";
}

# check gc-summary input file name format
(@name_summ) = split(/\./,$in_summ);
if ($#name_summ != 4) {
  die "ERROR:  $in_summ file name not formatted correctly
       	<gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary 
	$documentation\n"; 
}
$summ_table  = $name_summ[0];
$summ_file   = $name_summ[4];
if ($summ_file ne "gc-summary") {
  die "ERROR:  $in_summ file name not formatted correctly or in wrong order
	USAGE:  $0 <genePreds_table> <gc-details> <gc-summary>
        <gc-summary> file name format: pred_table.db.server.YYMMDD.gc-summary
        $documentation\n";
}

# State that files are readable
print STDERR "PROCESSING: input files identified and readable\n"; 

# check that pred_table name is same on all files
if (("$pred_table" eq "$detail_table") && ("$detail_table" eq "$summ_table")) {
   print STDERR "PROCESSING: genePreds_table same between all input files
PROCESSING: finished checking format; statistics being generated\n";
}  else {
   die "ERROR:  All files must be from same genePreds_table
        <genePreds_table> file name format: pred_table.db.server.YYMMDD
        <gc-details>      file name format: pred_table.db.server.YYMMDD.gc-details
        <gc-summary>      file name format: pred_table.db.server.YYMMDD.gc-summary
	$documentation\n";
}

# open outfiles for writing results
$stat_file = "$pred_table.$pred_db.$pred_server.$pred_date.gc-stats";
$report_file  = "$pred_table.$pred_db.$pred_server.$pred_date.gc-report";
open STAT, ">$stat_file";
open REPORT,  ">$report_file";
print STAT   "$0 gc-stats file for $in_preds\n\n";
print STAT   "Paste the html formated results below into file:\n"; 
print STAT   "/usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.results.html\n\n";
print REPORT "$0 gc-report file for $in_preds\n\n";
print REPORT "Save file at /usr/local/apache/htdocs-genecats/qa/test-results/gc-stats.reports\n\n";


# in preds, count up ids & preds per id
$preds_id_all  = 0;
$preds_id_uniq = 0;
$preds_id_max  = 1;
$preds_id_name = "none";
while(<PREDS>) {
 chomp;
 $preds_id_all++; 
 (@content_preds) = split(/\t/);

 if (exists $preds_count_id{$content_preds[0]}) {
    $preds_count_id{$content_preds[0]}++;
    if ($preds_count_id{$content_preds[0]} > $preds_id_max) {
	$preds_id_max  = $preds_count_id{$content_preds[0]};
	$preds_id_name = $content_preds[0];
    }
 } else {
    $preds_count_id{$content_preds[0]} = 1;
    $preds_id_uniq++;
 }

}

print REPORT "$in_preds contains gene prediction table from database
	$preds_id_uniq unique sequences have $preds_id_all gene predictions
	the sequence $preds_id_name has the most gene predictions: $preds_id_max predictions\n\n";

# in summary, count up ids & uniq preds & errors per uniq preds
$summ_id_uniq = 0;
$summ_id_max = 1;
$summ_id_name = "none";
$summ_longid_all = 0;
$summ_longid_causes_max = 1;
$summ_longid_causes_name = "none";
while(<SUMM>) {
 chomp;
 $summ_longid_all++;
# print STDERR "longid $summ_longid_all \n";
 unless ($summ_longid_all <= 2) {
 ($summ_id,$chr,$chrs,$chre,$strand,$stat,$frame,$start,$stop,$orfStop,$cdsGap,$utrGap,$cdsSplice,$utrSplice,$causes) = split(/\t/);
 $summ_longid = "$summ_id.$chr.$chrs.$chre";

 (@array_causes) = split(/\,/,$causes);

 $summ_causes_longid{$summ_longid} = ($#array_causes + 1);

    if ($summ_causes_longid{$summ_longid} > $summ_longid_causes_max) {
        $summ_longid_causes_max  = $summ_causes_longid{$summ_longid};
        $summ_longid_causes_name = $summ_longid;
    }
 if (exists $summ_count_id{$summ_id}) {
    $summ_count_id{$summ_id}++;
   if ($summ_count_id{$summ_id} > $summ_id_max) {
       $summ_id_max  = $summ_count_id{$summ_id};
       $summ_id_name = $summ_id;
   }
 } else {
    $summ_count_id{$summ_id} = 1;
    $summ_id_uniq++;
 }
 }
}

$prob_preds_all = ($summ_longid_all - 2);

print REPORT "$in_summ contains sequences with 1 or more gene-check exceptions
	$summ_id_uniq unique sequences have $prob_preds_all predictions with exceptions
	the sequence $summ_id_name has the most predictions with exceptions: $summ_id_max predictions
	the prediction $summ_longid_causes_name has the most exception types: $summ_longid_causes_max types\n\n";

# compare total predictions with those that have exceptions by id
# %summ_count_id contains the ids with # predictions with exceptions
# %preds_count_id contains the ids with # all predictions
$all_pass = 0;
$all_fail = 0;
$some_pass_fail = 0;

foreach $common_id (keys %preds_count_id) {

 if (exists $summ_count_id{$common_id}) {

  if ($summ_count_id{$common_id} == $preds_count_id{$common_id}) { $all_fail++; }
  if ($summ_count_id{$common_id} <  $preds_count_id{$common_id}) { $some_pass_fail++; }
 
 } else {
   $all_pass++;
 }
}



# in details, count up uniq ids and types errors per id
# counting variables for total errors of each type and
# number of unique IDs associated have each exception in gc-stat
# total number of each type of error in file in gc-report
$detail_id_uniq   = 0;
$detail_lines_all = 0;
$detail_gap_all   = 0;
$detail_gap_id    = 0;
$detail_badCdsSplice_all = 0;
$detail_badCdsSplice_id = 0;
$detail_badUtrSplice_all = 0;
$detail_badUtrSplice_id = 0;
$detail_noCDS_all = 0;
$detail_noCDS_id = 0;
$detail_noStart_all = 0;
$detail_noStart_id = 0;
$detail_noStop_all = 0;
$detail_noStop_id = 0;
$detail_inFrameStop_all = 0;
$detail_inFrameStop_id = 0;
$detail_frameDiscontig_all = 0;
$detail_frameDiscontig_id = 0;
$detail_frameMismatch_all = 0;
$detail_frameMismatch_id = 0;
$detail_badFrame_all = 0;
$detail_badFrame_id = 0;
$detail_other_all = 0;
$detail_other_id = 0;


# parse file and run through error type fields

while(<DETAIL>) {
 chomp;
  $detail_lines_all++;
  unless ($detail_lines_all < 2) {
   ($detail_id,$type,$info,$exception,$coords) = split(/\t/);
  }
  unless (exists $detail_id_uniq_count{$detail_id}) {
    $detail_id_uniq_count{$detail_id} = "1";
    $detail_id_uniq++;
  } 

 if ($type eq "gap") { 
   $detail_gap_all++;
   unless (exists $detail_gap_uniq_count{$detail_id}) {
    $detail_gap_uniq_count{$detail_id} = "1";  
    $detail_gap_id++;
   }
 }

 elsif ($type eq "badCdsSplice") {
   $detail_badCdsSplice_all++;
   unless (exists $detail_badCdsSplice_uniq_count{$detail_id}) {
    $detail_badCdsSplice_uniq_count{$detail_id} = "1";
    $detail_badCdsSplice_id++;
   }
 }

 elsif ($type eq "badUtrSplice") {
   $detail_badUtrSplice_all++;
   unless (exists $detail_badUtrSplice_uniq_count{$detail_id}) {
    $detail_badUtrSplice_uniq_count{$detail_id} = "1";
    $detail_badUtrSplice_id++;
   }
 }

 elsif ($type eq "noCDS") {
   $detail_noCDS_all++;
   unless (exists $detail_noCDS_uniq_count{$detail_id}) {
    $detail_noCDS_uniq_count{$detail_id} = "1";
    $detail_noCDS_id++;
   }
 }

 elsif ($type eq "noStart") {
   $detail_noStart_all++;
   unless (exists $detail_noStart_uniq_count{$detail_id}) {
    $detail_noStart_uniq_count{$detail_id} = "1";
    $detail_noStart_id++;
   }
 }

 elsif ($type eq "noStop") {
   $detail_noStop_all++;
   unless (exists $detail_noStop_uniq_count{$detail_id}) {
    $detail_noStop_uniq_count{$detail_id} = "1";
    $detail_noStop_id++;
   }
 }

 elsif ($type eq "inFrameStop") {
   $detail_inFrameStop_all++;
   unless (exists $detail_inFrameStop_uniq_count{$detail_id}) {
    $detail_inFrameStop_uniq_count{$detail_id} = "1";
    $detail_inFrameStop_id++;
   }
 }

 elsif ($type eq "frameDiscontig") {
   $detail_frameDiscontig_all++;
   unless (exists $detail_frameDiscontig_uniq_count{$detail_id}) {
    $detail_frameDiscontig_uniq_count{$detail_id} = "1";
    $detail_frameDiscontig_id++;
   }
 }

 elsif ($type eq "frameMismatch") {
   $detail_frameMismatch_all++;
   unless (exists $detail_frameMismatch_uniq_count{$detail_id}) {
    $detail_frameMismatch_uniq_count{$detail_id} = "1";
    $detail_frameMismatch_id++;
   }
 }

 elsif ($type eq "badFrame") {
   $detail_badFrame_all++;
   unless (exists $detail_badFrame_uniq_count{$detail_id}) {
    $detail_badFrame_uniq_count{$detail_id} = "1";
    $detail_badFrame_id++;
   }
 }

 else {
   $detail_other_all++;
   unless (exists $detail_other_uniq_count{$detail_id}) {
    $detail_other_uniq_count{$detail_id} = "1";
    $detail_other_id++;
   }
 }


}
# summarize seqs w/ exceptions into percentages of total seqs w/ preds
$detail_errors_all = ($detail_lines_all - 2);
$detail_gap_pct = (($detail_gap_id/$preds_id_uniq) * 100);
$detail_badCdsSplice_pct = (($detail_badCdsSplice_id/$preds_id_uniq) * 100);
$detail_badUtrSplice_pct = (($detail_badUtrSplice_id/$preds_id_uniq) * 100);
$detail_noCDS_pct = (($detail_noCDS_id/$preds_id_uniq) * 100);
$detail_noStart_pct = (($detail_noStart_id/$preds_id_uniq) * 100);
$detail_noStop_pct = (($detail_noStop_id/$preds_id_uniq) * 100);
$detail_inFrameStop_pct = (($detail_inFrameStop_id/$preds_id_uniq) * 100);
$detail_frameDiscontig_pct = (($detail_frameDiscontig_id/$preds_id_uniq) * 100);
$detail_frameMismatch_pct = (($detail_frameMismatch_id/$preds_id_uniq) * 100);
$detail_badFrame_pct = (($detail_badFrame_id/$preds_id_uniq) * 100);
$detail_other_pct = (($detail_other_id/$preds_id_uniq) * 100);

# summarize preds w/ exceptions into percentage of total preds
$prob_preds_all_pct = (($prob_preds_all/$preds_id_all) * 100);

# summarize seqs w/ % w/o exceptions into percentage of total unique seqs w/ preds
$all_pass_pct = (($all_pass/$preds_id_uniq) * 100);
$all_fail_pct = (($all_fail/$preds_id_uniq) * 100);
$some_pass_fail_pct = (($some_pass_fail/$preds_id_uniq) * 100);

print REPORT "$in_detail contains details for exceptions, including coordinates
	one exception per line; sequences can appear on multiple lines
	use exception key or sequence name to grep details from file
	$detail_id_uniq unique sequences with prediction exceptions
	$detail_lines_all total gene prediction exceptions\n
Gene-check exception summary statistics based on <gc-details> file content
exception key: <total> <unique by sequence> <pct of unique sequences in genePreds file> 
----------------------------------------------------------------------\n";

printf REPORT "gap:\t\t$detail_gap_all\t$detail_gap_id\t%3.1f %\n", "$detail_gap_pct";
printf REPORT "badCdsSplice:\t$detail_badCdsSplice_all\t$detail_badCdsSplice_id\t%3.1f %\n", "$detail_badCdsSplice_pct";
printf REPORT "badUtrSplice:\t$detail_badUtrSplice_all\t$detail_badUtrSplice_id\t%3.1f %\n", "$detail_badUtrSplice_pct";
printf REPORT "noCDS:\t\t$detail_noCDS_all\t$detail_noCDS_id\t%3.1f %\n", "$detail_noCDS_pct";
printf REPORT "noStart:\t$detail_noStart_all\t$detail_noStart_id\t%3.1f %\n", "$detail_noStart_pct";
printf REPORT "noStop:\t\t$detail_noStop_all\t$detail_noStop_id\t%3.1f %\n", "$detail_noStop_pct";
printf REPORT "inFrameStop:\t$detail_inFrameStop_all\t$detail_inFrameStop_id\t%3.1f %\n", "$detail_inFrameStop_pct";
printf REPORT "frameDiscontig:\t$detail_frameDiscontig_all\t$detail_frameDiscontig_id\t%3.1f %\n", "$detail_frameDiscontig_pct";
printf REPORT "frameMismatch:\t$detail_frameMismatch_all\t$detail_frameMismatch_id\t%3.1f %\n", "$detail_frameMismatch_pct";
printf REPORT "badFrame:\t$detail_badFrame_all\t$detail_badFrame_id\t%3.1f %\n", "$detail_badFrame_pct";
printf REPORT "other:\t\t$detail_other_all\t$detail_other_id\t%3.1f %\n", "$detail_other_pct";

print REPORT "\n\nGlobal summary statistics for $pred_table gene prediction track
-------------------------------------------------------------------\n";
printf REPORT "Pct predictions with one or more gene-check exceptions: %3.1f %\n", "$prob_preds_all_pct";
printf REPORT "Pct unique sequences with all predictions having no exceptions: %3.1f %\n", "$all_pass_pct";
printf REPORT "Pct unique sequences with all predictions having 1 or more exceptions: %3.1f %\n", "$all_fail_pct"; 
printf REPORT "Pct unique sequences with mixed predictions (some have exceptions, some do not): %3.1f %\n", "$some_pass_fail_pct";

print REPORT "\n\n\nEND OF REPORT\n\n";

print STDERR "PROCESSING: outfile <gc-report> finalized\n";

# <gc-stats> output file variables
# 1  db.genePred	$pred_db.$pred_table
# 2  date		$pred_date
# 3  server		$pred_server
# 4  uniqSeqs		$preds_id_uniq
# 5  allPreds		$preds_id_all
# 6  probPreds		$prob_preds_all,$prob_preds_all_pct
# 7  allPass		$all_pass,$all_pass_pct
# 8  allFail		$all_fail,$all_fail_pct
# 8  somePassFail	$some_pass_fail,$some_pass_fail_pct
# 10 gap		$detail_gap_id,$detail_gap_pct
# 11 badCdsSplice	$detail_badCdsSplice_id,$detail_badCdsSplice_pct
# 12 badUtrSplice	$detail_badUtrSplice_id,$detail_badUtrSplice_pct
# 13 noCDS		$detail_noCDS_id,$detail_noCDS_pct
# 14 noStart		$detail_noStart_id,$detail_noStart_pct
# 15 noStop		$detail_noStop_id,$detail_noStop_pct
# 16 inFrameStop	$detail_inFrameStop_id,$detail_inFrameStop_pct
# 17 frameDiscontig	$detail_frameDiscontig_id,$detail_frameDiscontig_pct
# 18 frameMismatch 	$detail_frameMismatch_id,$detail_frameMismatch_pct
# 19 badFrame		$detail_badFrame_id,$detail_badFrame_pct
# 20 other		$detail_other_id,$detail_other_pct


# write out html format output to <gc-stats>


print  STAT "\n\n";
print  STAT "<tr>\n";
print  STAT "<td>$pred_db.$pred_table</td>\n";
print  STAT "<td>$pred_date</td>\n";
print  STAT "<td>$pred_server</td>\n";
print  STAT "<td>$preds_id_uniq</td>\n";
print  STAT "<td>$preds_id_all</td>\n";
printf STAT "<td>$prob_preds_all(%3.1f%)</td>\n", "$prob_preds_all_pct";
printf STAT "<td>$all_pass(%3.1f%)</td>\n", "$all_pass_pct";
printf STAT "<td>$all_fail(%3.1f%)</td>\n", "$all_fail_pct";
printf STAT "<td>$some_pass_fail(%3.1f%)</td>\n", "$some_pass_fail_pct";
printf STAT "<td>$detail_gap_id(%3.1f%)</td>\n", "$detail_gap_pct";
printf STAT "<td>$detail_badCdsSplice_id(%3.1f%)</td>\n", "$detail_badCdsSplice_pct";
printf STAT "<td>$detail_badUtrSplice_id(%3.1f%)</td>\n", "$detail_badUtrSplice_pct";
printf STAT "<td>$detail_noCDS_id(%3.1f%)</td>\n", "$detail_noCDS_pct";
printf STAT "<td>$detail_noStart_id(%3.1f%)</td>\n", "$detail_noStart_pct";
printf STAT "<td>$detail_noStop_id(%3.1f%)</td>\n", "$detail_noStop_pct";
printf STAT "<td>$detail_inFrameStop_id(%3.1f%)</td>\n", "$detail_inFrameStop_pct";
printf STAT "<td>$detail_frameDiscontig_id(%3.1f%)</td>\n", "$detail_frameDiscontig_pct";
printf STAT "<td>$detail_frameMismatch_id(%3.1f%)</td>\n", "$detail_frameMismatch_pct";
printf STAT "<td>$detail_badFrame_id(%3.1f%)</td>\n", "$detail_badFrame_pct";
printf STAT "<td>$detail_other_id(%3.1f%)</td>\n", "$detail_other_pct";
print  STAT "</tr>\n";
print  STAT "\n\n\nEND OF REPORT\n\n";




print STDERR "PROCESSING: outfile <gc-stats> finalized\n";
print STDERR "PROCESSING: $0 program finished witout errors!\n";


