#!/usr/bin/env perl

use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use AsmHub;
use File::Basename;

# Exactly four positional arguments are required.  With any other count,
# the usage summary goes to STDERR and the script exits with status 255.
my $argc = scalar(@ARGV);

if ($argc != 4) {
  print STDERR
    "usage: asmHubEnsGene.pl asmId asmId.names.tab bbi/asmId ensVersion\n",
    "where asmId is the assembly identifier,\n",
    "and   asmId.names.tab is naming file for this assembly,\n",
    "and bbi/asmId is the path prefix to .ensGene.bb.\n",
    "the ensVersion is from trackData/ensGene/version.txt\n";
  exit 255;
}

# commify(number) - return the argument with "," inserted between each
# group of three digits, e.g. 1234567 becomes "1,234,567".
# Technique from Perl Cookbook Recipe 2.17: the string is reversed so the
# digit groups can be matched left to right, then reversed back.
sub commify($) {
    my ($number) = @_;
    my $reversed = reverse $number;
    # A comma follows every three digits that have another digit after
    # them, except where the remaining digits run up to a "." (in the
    # reversed string those are the digits after the decimal point).
    $reversed =~ s{ (\d{3}) (?=\d) (?!\d*\.) }{$1,}gx;
    my $withCommas = reverse $reversed;
    return $withCommas;
}

# Typical invocation (the HTML page is written to stdout):
#   $scriptDir/asmHubEnsGene.pl $asmId $buildDir/html/$asmId.names.tab \
#       $buildDir/bbi/$asmId "${ensVersion}" > $buildDir/html/$asmId.ensGene.html

# The four positional arguments, in the order given in the usage message.
my $asmId = shift;
my $namesFile = shift;
my $bbiPrefix = shift;
my $ensVersion = shift;
my $ensGeneBbi = "$bbiPrefix.ensGene.bb";

# Derive the ensGene work directory from the bbi prefix by replacing
# everything from "/bbi/" onward with "/trackData/ensGene".
my $runDir = $bbiPrefix;
$runDir =~ s#/bbi/.*#/trackData/ensGene#;

# Optional summary file (presumably featureBits output - confirm): its first
# whitespace-delimited token is kept in $fbBases.  $fbBases stays "" when
# the file is missing, empty, unreadable, or holds only whitespace.
my $fbResults = "${runDir}/fb.$asmId.ensGene.txt";
my $fbBases = "";
if ( -s $fbResults ) {
  # Read the file directly instead of `cat` through the shell, so paths
  # containing spaces or shell metacharacters are handled correctly.
  if (open(my $fbFh, '<', $fbResults)) {
    local $/ = undef;	# slurp the whole file, as `cat` did
    my $fbText = <$fbFh>;
    close($fbFh);
    if (defined($fbText)) {
      # split(' ', ...) skips leading whitespace; a '\s+' pattern would
      # return an empty first field for such input.
      my ($firstToken) = split(' ', $fbText);
      $fbBases = $firstToken if defined($firstToken);
    }
  } else {
    printf STDERR "WARNING: can not read '%s': %s\n", $fbResults, $!;
  }
}

# The bigBed file is mandatory: without it there is nothing to describe.
if ( ! -s $ensGeneBbi ) {
  printf STDERR "ERROR: can not find ensGene bbi file:\n\t'%s'\n", $ensGeneBbi;
  exit 255;
}

# HTML emphasis tags placed around the organism name in the page text.
my $em = "<em>";
my $noEm = "</em>";

# Tab-separated columns 5, 9 and 10 of the names file supply the organism,
# the assembly date and the NCBI assembly id.  The file is read once in Perl
# rather than by three separate `grep | cut` shell pipelines, so unusual
# characters in the path are safe.  Only the first line that is neither
# blank nor a "#" comment is used.
my $assemblyDate = "";
my $ncbiAssemblyId = "";	# not referenced elsewhere in the visible code
my $organism = "";
if (open(my $namesFh, '<', $namesFile)) {
  while (my $line = <$namesFh>) {
    next if $line =~ m/^#/;
    next if $line !~ m/\S/;
    chomp $line;
    my @col = split(/\t/, $line, -1);	# -1 keeps trailing empty columns
    $organism = $col[4] if defined($col[4]);
    $assemblyDate = $col[8] if defined($col[8]);
    $ncbiAssemblyId = $col[9] if defined($col[9]);
    last;
  }
  close($namesFh);
} else {
  printf STDERR "WARNING: can not read names file '%s': %s\n", $namesFile, $!;
}

# Build the one-line summary from the "itemCount:" and "basesCovered:" lines
# of the bigBedInfo output, e.g. "Gene count: N; Bases covered: M".
# List-form pipe open runs bigBedInfo without a shell; the filtering and
# rewording formerly done by egrep/xargs/sed is done here in Perl.
my @bbInfoWords = ();
if (open(my $bbInfoFh, '-|', 'bigBedInfo', $ensGeneBbi)) {
  while (my $line = <$bbInfoFh>) {
    next unless $line =~ m/itemCount:|basesCovered:/;
    push @bbInfoWords, split(' ', $line);
  }
  # close() on a pipe returns false when the command exits non-zero
  close($bbInfoFh)
    or printf STDERR "WARNING: bigBedInfo failed on '%s'\n", $ensGeneBbi;
} else {
  printf STDERR "WARNING: can not run bigBedInfo on '%s': %s\n", $ensGeneBbi, $!;
}
my $geneCount = join(' ', @bbInfoWords);
$geneCount =~ s/itemCount/Gene count/;
$geneCount =~ s/ basesCovered/; Bases covered/;

# Append the exon-only base count when one was found.
if (defined($fbBases) && length($fbBases)) {
  $geneCount .= sprintf(" (%s bases in exons only)", commify($fbBases));
}

# Emit the track description page on stdout.  The heredoc interpolates
# $ensVersion, $assemblyDate, $em, $organism, $noEm, $asmId and $geneCount.
print <<"_EOF_";
<h2>Description</h2>
<p>
This track shows the Ensembl gene, $ensVersion, annotations on
the $assemblyDate $em${organism}$noEm/$asmId genome assembly.<br>
<br>
These gene predictions were generated by
<a href="http://www.ensembl.org/index.html" target="_blank">Ensembl</a>.<br>
<br>
$geneCount
</p>

<h2>Methods</h2>

<p>
For a description of the methods used in Ensembl gene predictions, please
refer to
<a href="https://academic.oup.com/nar/article/30/1/38/1332872/The-Ensembl-genome-database-project"
target="_blank">Hubbard <em>et al</em>. (2002)</a>,
also listed in the References section below.
</p>

<h2>Credits</h2>

<p>
We would like to thank Ensembl for providing this annotation.  For more information, please see:
<a href="http://www.ensembl.org/info/genome/genebuild/genome_annotation.html" target=_blank>Ensembl&#39;s genome annotation page.</a>
</p>

<h2>References</h2>

<p>
Hubbard T, Barker D, Birney E, Cameron G, Chen Y, Clark L, Cox T, Cuff J,
Curwen V, Down T <em>et al</em>.
<a href="https://academic.oup.com/nar/article/30/1/38/1332872/The-Ensembl-genome-database-project"
target="_blank">The Ensembl genome database project</a>.
<em>Nucleic Acids Res</em>. 2002 Jan 1;30(1):38-41.
PMID: <a href="https://www.ncbi.nlm.nih.gov/pubmed/11752248" target="_blank">11752248</a>; PMC: <a
href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC99161/" target="_blank">PMC99161</a>
</p>
_EOF_

