# for emacs: -*- mode: sh; -*-

# This file describes browser build for the bosTau8
# Bos taurus -- UMD 3.1.1 (Dec. 2009)
# Center for Bioinformatics and Computational Biology, University of Maryland
# UMD 3.1.1


#############################################################################
# fetch sequence from new style download directory (DONE - 2014-09-05 - Steve)
    # NCBI has redesigned their FTP download site, new type of address
    #      and naming scheme
    mkdir -p /hive/data/genomes/bosTau8/genbank
    cd /hive/data/genomes/bosTau8/genbank

    rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCA_000003055.4_Bos_taurus_UMD_3.1.1 ./

    # measure what we have here:
    faSize GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/*.fna.gz GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/*.fna.gz
    # 2670028162 bases (20740269 N's 2649287893 real 2649287893 upper 0 lower)
    #  in 3178 sequences in 31 files
    # Total size: mean 840159.9 sd 9115217.1 min 224 (GJ060330.1)
    #  max 158337067 (GK000001.2) median 1356

    # note that these totals do not include chrM since the GenBank ftp directory
    # did not include a non-nuclear directory

#############################################################################
# fixup to UCSC naming scheme (DONE - 2014-09-05 - Steve)
    mkdir /hive/data/genomes/bosTau8/ucsc
    cd /hive/data/genomes/bosTau8/ucsc
    # evolving these scripts over time, could become source tree scripts

    cat << '_EOF_' > ucscCompositeAgp.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;

my $primary="../genbank/GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly";

open (FH, "<$primary/assembled_chromosomes/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close (FH);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";
    open (FH, "zcat $primary/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz";
    open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            $line =~ s/^$acc/chr${chrN}/;
            print UC $line;
        }
    }
    close (FH);
    close (UC);
    open (FH, "zcat $primary/assembled_chromosomes/FASTA/chr${chrN}.fna.gz|") or die "can not read chr${chrN}.fna.gz";
    open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            printf UC ">chr${chrN}\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $primary="../genbank/GCA_000003055.4_Bos_taurus_UMD_3.1.1/GCA_000003055.4_Bos_taurus_UMD_3.1.1_assembly_structure/Primary_Assembly";

my $agpFile =  "$primary/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "$primary/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">chrUn.agp") or die "can not write to chrUn.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\./v/;
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">chrUn.fa") or die "can not write to chrUn.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/ .*//;
        $line =~ s/\./v/;
        $line =~ s/>//;
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs

    chmod +x ucscCompositeAgp.pl unplaced.pl
    ./ucscCompositeAgp.pl
    ./unplaced.pl

# download the mitochondrial sequence

export mitoAcc=NC_006853.1

wget -O ${mitoAcc}.fa \
 "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc"

echo ">chrM" > chrM.fa
grep -v "^>" ${mitoAcc}.fa >> chrM.fa

export mSize=`faCount chrM.fa | grep total | awk '{print $2}'`

/bin/echo -e "chrM\t1\t$mSize\t1\tF\t$mitoAcc\t1\t$mSize\t+" > chrM.agp

    # verify this sequence is still the same as it was before
    #  these translations
    faSize *.fa

    # 2670044500 bases (20740269 N's 2649304231 real 2649304231 upper 0 lower)
    #  in 3179 sequences in 32 files
    # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1)
    #  max 158337067 (chr1) median 1356

    faSize chrM.fa
    # 16338 bases (0 N's 16338 real 16338 upper 0 lower)
    #  in 1 sequences in 1 files

#############################################################################
#  Initial database build (DONE - 2014-09-10 - Steve)

    cd /hive/data/genomes/bosTau8
    cat << '_EOF_' > bosTau8.config.ra
# Config parameters for makeGenomeDb.pl:
db bosTau8
clade mammal
genomeCladePriority 31
scientificName Bos taurus
commonName Cow
assemblyDate Dec. 2009
assemblyLabel University of Maryland Bos_taurus_UMD_3.1.1
assemblyShortLabel UMD_3.1.1
orderKey 2378
mitoAcc none
# mito included in assembly NC_006853.1
fastaFiles /hive/data/genomes/bosTau8/ucsc/chr*.fa
agpFiles /hive/data/genomes/bosTau8/ucsc/chr*.agp
# qualFiles none
dbDbSpeciesDir cow
photoCreditURL http://www.genome.gov/10005141
photoCreditName NHGRI Press Photos
ncbiGenomeId 82
ncbiAssemblyId 189361
ncbiAssemblyName Bos_taurus_UMD_3.1.1
ncbiBioProject 33843
genBankAccessionID GCA_000003055.4
taxId 9913
'_EOF_'
    # << happy emacs

    # verify sequence and AGP are OK:
    time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp bosTau8.config.ra > agp.log 2>&1
    # real    2m22.582s

    # then finish it off:
    time nice -n +19 makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db bosTau8.config.ra > db.log 2>&1

##########################################################################
# running repeat masker (DONE - 2014-09-10 - Steve)
    mkdir /hive/data/genomes/bosTau8/bed/repeatMasker
    cd /hive/data/genomes/bosTau8/bed/repeatMasker
    time  doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku bosTau8 > do.log 2>&1
    # real    599m0.131s

    cat faSize.rmsk.txt
    # 2670044500 bases (20740269 N's 2649304231 real 1360109237 upper
    #    1289194994 lower) in 3179 sequences in 1 files
    # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1)
    #    max 158337067 (chr1) median 1356
    # %48.28 masked total, %48.66 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    # CC   RELEASE 20140131;

    featureBits -countGaps bosTau8 rmsk
    # 1289639153 bases of 2670044500 (48.300%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE 2014-09-12 - Steve)

    mkdir /hive/data/genomes/bosTau8/bed/simpleRepeat
    cd /hive/data/genomes/bosTau8/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        bosTau8 > do.log 2>&1
    # real    5m50.579s

    cat fb.simpleRepeat
    # 33445177 bases of 2649307237 (1.262%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/bosTau8
    twoBitMask bosTau8.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed bosTau8.2bit
    #   you can safely ignore the warning about fields >= 13

    twoBitToFa bosTau8.2bit stdout | faSize stdin > faSize.bosTau8.2bit.txt
    cat faSize.bosTau8.2bit.txt
    # 2670044500 bases (20740269 N's 2649304231 real 1359475670 upper
    #    1289828561 lower) in 3179 sequences in 1 files
    # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1)
    #    max 158337067 (chr1) median 1356
    # %48.31 masked total, %48.69 masked real

    rm /gbdb/bosTau8/bosTau8.2bit
    ln -s `pwd`/bosTau8.2bit /gbdb/bosTau8/bosTau8.2bit

##########################################################################
## WINDOWMASKER (DONE - 2014-09-12 - Steve)

    mkdir /hive/data/genomes/bosTau8/bed/windowMasker
    cd /hive/data/genomes/bosTau8/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev bosTau8 > do.log 2>&1
    # real    165m31.022s

    # Masking statistics
    cat faSize.bosTau8.cleanWMSdust.txt
    # 2670044500 bases (20740269 N's 2649304231 real 1560587113 upper
    #    1088717118 lower) in 3179 sequences in 1 files
    # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1)
    #    max 158337067 (chr1) median 1356
    # %40.78 masked total, %41.09 masked real

    cat fb.bosTau8.rmsk.windowmaskerSdust.txt
    # 862982888 bases of 2670044500 (32.321%) in intersection

##########################################################################
# cpgIslands - (DONE - 2014-09-12 - Steve)
    mkdir /hive/data/genomes/bosTau8/bed/cpgIslands
    cd /hive/data/genomes/bosTau8/bed/cpgIslands
    time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku bosTau8 > do.log 2>&1
    # real    14m36.983s

    cat fb.bosTau8.cpgIslandExt.txt
    # 24127795 bases of 2649307237 (0.911%) in intersection

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2014-09-12 - Steve)
    mkdir /hive/data/genomes/bosTau8/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/bosTau8/bed/cpgIslandsUnmasked

    # run stepwise so the loading can be done in a different table
    time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/bosTau8/bosTau8.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku bosTau8 > do.log 2>&1
    # real    36m46.811s

    cat fb.bosTau8.cpgIslandExtUnmasked.txt
    # 32916929 bases of 2649307237 (1.242%) in intersection

#############################################################################
# cytoBandIdeo - (NEED more work 2014-09-12 - Steve)
    mkdir /hive/data/genomes/bosTau8/bed/cytoBand
    cd /hive/data/genomes/bosTau8/bed/cytoBand
    makeCytoBandIdeo.csh bosTau8

#########################################################################
# genscan - (DONE 2014-09-12 - Steve)
    mkdir /hive/data/genomes/bosTau8/bed/genscan
    cd /hive/data/genomes/bosTau8/bed/genscan
    time doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku bosTau8 > do.log 2>&1
    # real    65m53.319s

    cat fb.bosTau8.genscan.txt
    # 54695662 bases of 2649307237 (2.065%) in intersection

    cat fb.bosTau8.genscanSubopt.txt
    # 50492196 bases of 2649307237 (1.906%) in intersection

########################################################################
# Create kluster run files (DONE - 2014-10-10 - Steve)

    # numerator is bosTau8 gapless bases "real" as reported by:
    featureBits -noRandom -noHap bosTau8 gap
    # 20737263 bases of 2640185480 (0.785%) in intersection

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2640185480 / 2861349177 \) \* 1024
    ( 2640185480 / 2861349177 ) * 1024 = 944.851454

    # ==> use -repMatch=900 according to size scaled down from 1024 for human.
    #   and rounded down to nearest 50 (runup 899.666140 to 900 in this case)
    cd /hive/data/genomes/bosTau8
    blat bosTau8.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/bosTau8.11.ooc \
        -repMatch=900 &
    #   Wrote 33613 overused 11-mers to jkStuff/bosTau8.11.ooc
    #   check non-bridged gaps to see what the typical size is:
    hgsql -N \
        -e 'select * from gap where bridge="no" order by size;' bosTau8 \
        | sort -k7,7nr
    #   most non-bridged gaps have size = 5000, follow pig's example (most
    #   100, but use 5000)
    #   decide on a minimum gap for this break, use either 100 or 5000 will
    #   generate 13387 liftOver rows, but if use 6000, only got 11703 rows.
    #   so use 100 here to get more liftOver row.
    gapToLift -verbose=2 -minGap=100 bosTau7 jkStuff/nonBridged.lft \
        -bedFile=jkStuff/nonBridged.bed

########################################################################
# GENBANK AUTO UPDATE (DONE - 2014-10-10 - Steve)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
    # Bos taurus      19983   1559521 13243

    # edit etc/genbank.conf to add bosTau8 just after bosTau7

# bosTau8 (Cow)
bosTau8.serverGenome = /hive/data/genomes/bosTau8/bosTau8.2bit
bosTau8.clusterGenome = /hive/data/genomes/bosTau8/bosTau8.2bit
bosTau8.ooc = /hive/data/genomes/bosTau8/bosTau8.11.ooc
bosTau8.lift = no
bosTau8.perChromTables = no
bosTau8.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
bosTau8.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
bosTau8.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
bosTau8.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
bosTau8.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
bosTau8.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
bosTau8.downloadDir = bosTau8
bosTau8.refseq.mrna.native.load  = yes
bosTau8.refseq.mrna.xeno.load = yes
bosTau8.refseq.mrna.xeno.loadDesc  = yes
bosTau8.upstreamGeneTbl = refGene


    git add etc/genbank.conf
    git commit -m "Added bosTau8; refs #13852"
    git push
    # update /cluster/data/genbank/:
    make etc-update


# Edit src/lib/gbGenome.c to add new species.  Skipped

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time nice -n +19 ./bin/gbAlignStep -initial bosTau8
    # logFile: var/build/logs/2014.10.13-13:00:37.bosTau8.initalign.log
    #   real    362m8.057s
    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.bosTau8

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau8
    # logFile: var/dbload/hgwdev/logs/2014.10.14-09:00:05.bosTau8.dbload.log


    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add bosTau8 to:
    #   etc/align.dbs
    #   etc/hgwdev.dbs
    git add etc/align.dbs
    git add etc/hgwdev.dbs
    git commit -m "Added bosTau8 - Cow" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
#  SWAP hg38 LASTZ (DONE - 2014-10-15 - Steve)
    #   original alignment done at hg38.txt
    cd /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15

    cat fb.hg38.chainBosTau8Link.txt
    # 1401921010 bases of 3049335806 (45.975%) in intersection
    # Create link
    cd /hive/data/genomes/hg38/bed
    ln -s  lastzBosTau8.2014-10-15 lastz.bosTau8

    #   running the swap
    mkdir /hive/data/genomes/bosTau8/bed/blastz.hg38.swap
    cd /hive/data/genomes/bosTau8/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15/DEF \
        -swap  -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real     116m32.121s
    cat fb.bosTau8.chainHg38Link.txt
    #   1336307377 bases of 2649307237 (50.440%) in intersection
    cd /hive/data/genomes/bosTau8/bed
    ln -s blastz.hg38.swap lastz.hg38

#########################################################################
#  SWAP mm10 LASTZ (DONE - 2014-10-15 - Steve)
    #   original alignment done at mm10.txt
    cd /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15

    cat fb.mm10.chainBosTau8Link.txt
    # 698722925 bases of 2652783500 (26.339%) in intersection
    # Create link
    cd /hive/data/genomes/mm10/bed
    ln -s  lastzBosTau8.2014-10-15 lastz.bosTau8

    #   and the swap
    mkdir /hive/data/genomes/bosTau8/bed/blastz.mm10.swap
    cd /hive/data/genomes/bosTau8/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15/DEF \
        -swap -syntenicNet  \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real    58m4.272s
    cat fb.bosTau8.chainMm10Link.txt
    # 687270584 bases of 2649307237 (25.942%) in intersection
    # Create link
    cd /hive/data/genomes/bosTau8/bed
    ln -s blastz.mm10.swap lastz.mm10

#########################################################################
# LIFTOVER TO bosTau6 (DONE - 2014-10-21 - Steve)
    ssh hgwdev
    cd /hive/data/genomes/bosTau8
    ln -s jkStuff/bosTau8.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau8 bosTau6 > doLiftOverToBosTau6.log 2>&1
    # real    82m56.388s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau8/liftOver/
    md5sum bosTau8ToBosTau6.over.chain.gz >> md5sum.txt
    popd

#########################################################################
# LIFTOVER TO bosTau7 (DONE - 2014-10-21 - Steve)
    ssh hgwdev
    cd /hive/data/genomes/bosTau8
    ln -s jkStuff/bosTau8.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau8 bosTau7 > doLiftOverToBosTau7.log 2>&1
    # real    105m34.448s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau8/liftOver/
    md5sum bosTau8ToBosTau7.over.chain.gz >> md5sum.txt
    popd

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE 2014-10-21 - Steve)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=bosTau8 -keys all.joiner
    joinerCheck -database=bosTau8 -tableCoverage all.joiner
    joinerCheck -database=bosTau8 -times all.joiner

    cd /hive/data/genomes/bosTau8
    makeDownloads.pl bosTau8 > do.log 2>&1

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/bosTau8/pushQ
    cd /hive/data/genomes/bosTau8/pushQ
    makePushQSql.pl bosTau8 > bosTau8.pushQ.sql 2> stderr.out
    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/bosTau8/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/bosTau8/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/bosTau8/bbi/quality.bw
    # WARNING: bosTau8 does not have seq
    # WARNING: bosTau8 does not have extFile

    #   copy it to hgwbeta
    scp -p bosTau8.pushQ.sql qateam@hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < bosTau8.pushQ.sql
    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
# uscsToINSDC table/track (DONE - 2014-10-21 - Hiram)
    # fixup the translateNames.sh file from rn6 work to use the new
    # genbank structure  (copied over from rn6/bed/ucscToINSDC)
    ./translateNames.sh  NC_006853.1
    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed
    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 16
    # use the 16 in this sed
    sed -e "s/21/16/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab bosTau8 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords bosTau8
    # should cover %100 entirely:
    featureBits -countGaps bosTau8 ucscToINSDC
    # 2670044500 bases of 2670044500 (100.000%) in intersection

#########################################################################
# lastz Horse equCab2 (DONE - 2014-10-27 - Hiram)
    # a tuning run to determine lastz parameters, this process should move
    # out of this directory to a bosTau8/bed/lastz... directory:

    mkdir /hive/users/hiram/multiz/100way/bosTau8.equCab2
    cd /hive/users/hiram/multiz/100way/bosTau8.equCab2
    zcat /hive/data/genomes/equCab2/bed/genscan/genscan.pep.gz \
       > equCab2.genscan.pep
    zcat /hive/data/genomes/equCab2/bed/genscan/genscan.gp.gz \
       > equCab2.genscan.gp
    zcat /hive/data/genomes/bosTau8/bed/genscan/genscan.pep.gz \
       > bosTau8.genscan.pep
    zcat /hive/data/genomes/bosTau8/bed/genscan/genscan.gp.gz \
       > bosTau8.genscan.gp

    # determine a correspondence between genes in the two sequences:
    time (blat -prot -oneOff=1 bosTau8.genscan.pep equCab2.genscan.pep \
       -out=maf bosTau8.equCab2.oneOff.maf) > blat.log 2>&1
    #  real    282m40.628s
    ../mafScoreSizeScan.pl bosTau8.equCab2.oneOff.maf  > mafScoreSizeScan.list
    ave mafScoreSizeScan.list | grep "^Q3" | awk '{print $2}' \
       | sed -e 's/.000000//' > mafScoreSizeScan.Q3

    # with those regions identified, extract corresponding sequence to
    # use in the lastz tuning operation.  Filter all the scores for the
    # top quartile:
    Q3=`cat mafScoreSizeScan.Q3`
    awk '$1 > '$Q3' && $2 > 100 && $7 > 80 && $8 > 80' mafScoreSizeScan.list \
            > top.all.list
    cut -f3,4 top.all.list | sort > selected.topAll.list
    topCount=`cat selected.topAll.list | wc -l`
    echo "number of top scorers: $topCount"
    #  number of top scorers: 3240
    for N in 100 200 300 400
do
  cat top.all.list | randomLines -decomment stdin $N top.$N.list
  cut -f3,4 top.$N.list | sort > selected.top$N.list
  $TOP/selectedFasta.sh $target $query top$N
            ./tune.top$N.sh &
  /cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz_D \
     --inferonly=/hive/users/hiram/multiz/100way/create_scores_file.control \
        bosTau8.top${N}.fa equCab2.top${N}.fa \
          | /hive/users/hiram/multiz/100way/expand_scores_file.py \
             > bosTau8.equCab2.tuning.top${N}.txt 2>&1
done

    # scan the four result files to see the range in the matrix scores
    # hopefully not too wildly different:
    ../matrixSummary.pl
#  read 4 .txt files    bosTau8.equCab2
      A     C     G     T       averages        4 files bosTau8.equCab2
A    81  -119   -30  -133
C  -119   100   -95   -30
G   -30   -95   100  -119
T  -133   -30  -119    81
      A     C     G     T       ranges  4 files bosTau8.equCab2
A    12    13     4    11
C    13     0    31     4
G     4    31     0    13
T    11     4    13    12
      A     C     G     T       ranges percent  4 files bosTau8.equCab2
A  14.7  10.9  13.0   8.2
C  10.9   0.0  32.6  13.0
G  13.0  32.6  -0.0  10.9
T   8.2  13.0  10.9  14.7

     # looks good, using the results from the top400:

     mkdir /hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11
     cd /hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11

     cp -p /hive/users/hiram/multiz/100way/bosTau8.equCab2/bosTau8.equCab2.tuning.top400.txt \
        ./bosTau8.equCab2.tuning.Q.txt

     cat << '_EOF_' > DEF
# Horse pig vs Horse
# parameters obtained from a tuning run of lastz_D
# /hive/users/hiram/multiz/100way/bosTau8.equCab2/bosTau8.equCab2.tuning.top400.txt
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz

BLASTZ_T=2
BLASTZ_O=400
BLASTZ_E=30
BLASTZ_M=50
BLASTZ_X=880
BLASTZ_Y=3400
BLASTZ_Q=/hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11/bosTau8.equCab2.tuning.Q.txt
#       A     C     G     T
# A    88  -111   -31  -128
# C  -111   100   -75   -31
# G   -31   -75   100  -111
# T  -128   -31  -111    88

# TARGET: Cow bosTau8
SEQ1_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit
SEQ1_LEN=/hive/data/genomes/bosTau8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LIMIT=40
SEQ1_LAP=10000

# QUERY: Horse equCab2
SEQ2_DIR=/hive/data/genomes/equCab2/equCab2.2bit
SEQ2_LEN=/hive/data/genomes/equCab2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs

        time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # real    365m26.314s
    cat fb.bosTau8.chainEquCab2Link.txt
    #  1463493515 bases of 2649307237 (55.241%) in intersection

    time (doRecipBest.pl bosTau8 equCab2 -buildDir=`pwd` -workhorse=hgwdev) \
	> best.log 2>&1 &
    # real    59m58.918s

    #	and the swap
    mkdir /hive/data/genomes/equCab2/bed/blastz.bosTau8.swap
    cd /hive/data/genomes/equCab2/bed/blastz.bosTau8.swap
    time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/bosTau8/bed/lastzEquCab2.2014-12-11/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1
    #   real    139m36.654s

    cat fb.equCab2.chainBosTau8Link.txt
    #	1497036779 bases of 2428790173 (61.637%) in intersection

    time (doRecipBest.pl equCab2 bosTau8 -buildDir=`pwd` -workhorse=hgwdev) \
	> best.log 2>&1
    #   real    75m13.344s

#########################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# LASTZ mouse/bosTau8 sheep/oviAri3 - (DONE - 2015-01-20 - Hiram)
    mkdir /hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20
    cd /hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20

    cp -p \
/hive/users/hiram/multiz/100way/bosTau8.oviAri3/bosTau8.oviAri3.tuning.top400.txt \
     ./bosTau8.oviAri3.tuning.Q.txt

    cat << '_EOF_' > DEF
# Horse vs. Sheep
# parameters obtained from a tuning run of lastz_D
# /hive/users/hiram/multiz/100way/bosTau8.oviAri3/bosTau8.oviAri3.tuning.top400.txt

BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
BLASTZ_O=400
BLASTZ_E=30
BLASTZ_M=50
BLASTZ_X=1000
BLASTZ_Y=3400
BLASTZ_K=3000
BLASTZ_L=3000
BLASTZ_Q=/hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20/bosTau8.oviAri3.tuning.Q.txt
#       A     C     G     T
# A   100  -172   -55  -173
# C  -172    95  -212   -55
# G   -55  -212    95  -172
# T  -173   -55  -172   100

# TARGET: Horse bosTau8
SEQ1_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit
SEQ1_LEN=/hive/data/genomes/bosTau8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: Sheep oviAri3
SEQ2_DIR=/hive/data/genomes/oviAri3/oviAri3.2bit
SEQ2_LEN=/hive/data/genomes/oviAri3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=40
SEQ2_LAP=0

BASE=/hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
      -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > do.log 2>&1
    # real    1312m54.887s

    cat fb.bosTau8.chainOviAri3Link.txt
    # 2236790379 bases of 2649307237 (84.429%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` bosTau8 oviAri3) > rbest.log 2>&1 &
    # real    38m9.755s

    # and for the swap:
    mkdir /hive/data/genomes/oviAri3/bed/blastz.bosTau8.swap
    cd /hive/data/genomes/oviAri3/bed/blastz.bosTau8.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/bosTau8/bed/lastzOviAri3.2015-01-20/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    477m36.613s

    cat fb.oviAri3.chainBosTau8Link.txt
    # 2217800123 bases of 2534335866 (87.510%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` oviAri3 bosTau8) > rbest.log 2>&1
    # real    44m43.976s

#########################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-06-26 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau8/bed/geneid
    cd /hive/data/genomes/bosTau8/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/B.taurus/bosTau8/geneid_v1.4/bosTau8.geneid.prot
    wget --timestamping \
http://genome.crg.es/genepredictions/B.taurus/bosTau8/geneid_v1.4/bosTau8.geneid.gtf
    ldHgGene -gtf -genePredExt bosTau8 geneid bosTau8.geneid.gtf

    #  Read 33831 transcripts in 268467 lines in 1 files
    #     33831 groups 1612 seqs 1 sources 3 feature types
    #  33831 gene predictions

    featureBits -countGaps bosTau8 geneid
# 37012407 bases of 2670044500 (1.386%) in intersection

    featureBits -countGaps bosTau3 geneid
# 44623578 bases of 2902896901 (1.537%) in intersection

##########################################################################
# SGP GENE PREDICTIONS (DONE - 2015-07-20 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau8/bed/sgpGene
    cd /hive/data/genomes/bosTau8/bed/sgpGene

for F in 00README bosTau8.Hs-Mm.sgp2 bosTau8.Hs-Mm.sgp2.cds \
    bosTau8.Hs-Mm.sgp2.gff bosTau8.Hs-Mm.sgp2.gff3 bosTau8.Hs-Mm.sgp2.gtf \
    bosTau8.Hs-Mm.sgp2.prot
do
wget --timestamping \
  http://genome.crg.es/genepredictions/B.taurus/bosTau8/SGP2/hg38/${F}
done

    ldHgGene -gtf -genePredExt bosTau8 sgpGene bosTau8.Hs-Mm.sgp2.gtf
    # Read 35519 transcripts in 297667 lines in 1 files
    # 35519 groups 364 seqs 1 sources 3 feature types
    # 35519 gene predictions

    featureBits -countGaps bosTau8 sgpGene
# 36198709 bases of 2670044500 (1.356%) in intersection
    featureBits -countGaps bosTau3 sgpGene
# 40658543 bases of 2902896901 (1.401%) in intersection

##########################################################################
# LASTZ cow/bosTau8 killer whale/orcOrc1 - (DONE - 2016-06-01,02 - Hiram)
    mkdir /hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01
    cd /hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01

    printf '# cow vs. killer whale
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_M=254

# TARGET: cow bosTau8
SEQ1_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit
SEQ1_LEN=/hive/data/genomes/bosTau8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: killer whale orcOrc1
SEQ2_DIR=/hive/data/genomes/orcOrc1/orcOrc1.2bit
SEQ2_LEN=/hive/data/genomes/orcOrc1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # real    1439m45.464s

    cat fb.bosTau8.chainOrcOrc1Link.txt
    # 1670894199 bases of 2649307237 (63.069%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` bosTau8 orcOrc1) \
      > rbest.log 2>&1 &
    # real    590m0.045s

    # and for the swap:
    mkdir /hive/data/genomes/orcOrc1/bed/blastz.bosTau8.swap
    cd /hive/data/genomes/orcOrc1/bed/blastz.bosTau8.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/bosTau8/bed/lastzOrcOrc1.2016-06-01/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    411m32.032s

    cat fb.orcOrc1.chainBosTau8Link.txt
    # 1659691513 bases of 2249582125 (73.778%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` orcOrc1 bosTau8) \
       > rbest.log 2>&1
    # real    515m19.362s

############################################################################
# DBSNP B148 / SNP148 (DONE 8/1/16 jcasper)
    # RedMine #16846
    screen
    mkdir -p /hive/data/outside/dbSNP/148/cow_bosTau8
    cd /hive/data/outside/dbSNP/148/cow_bosTau8
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (cow_9913 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_
    # -- use the first num for build setting in config.ra
    # Note that there are two cow entries - 9913 and 30522.  30522, however,
    # has B128 as its most recent build.
    # The buildAssembly setting in config.ra is empty because dbSNP stopped including
    # that in file names.
    # Had to fetch
    # ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000003055.6.assembly.txt
    # manually and install it in the config file because NCBI's FTP server
    # appears to be having some problems with providing the right header
    # information (curl and wget on our server complain about it, anyway).
    cat > config.ra <<EOF
db bosTau8
build 148
buildAssembly
orgDir cow_9913
refAssemblyLabel Bos_taurus_UMD_3.1.1
ncbiAssemblyReportFile GCF_000003055.6.assembly.txt
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
    # Some trial and error is always required to get the config.ra just right.
    # First stop:
#*** 2 lines of b148_ContigInfo.bcp.gz contained contig names that
#*** could not be mapped to chrom.size via their GenBank contig mappings; see
#*** /hive/data/outside/dbSNP/148/cow_bosTau8/cantLiftUpSeqNames.txt .
#
#*** You must account for all 6337 contig_acc values in config.ra,
#*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
#*** Check the auto-generated suggested.lft to see if it covers all
#*** 6337 contigs; if it does, add 'liftUp suggested.lft' to config.ra.
#*** Then run again with -continue=loadDbSnp .

    cat cantLiftUpSeqNames.txt
#unplaced-scaffold (no chrUn_GJ058435v1 in chrom.sizes)  NW_003099180    na
#unplaced-scaffold (no chrUn_GJ058447v1 in chrom.sizes)  NW_003099192    na

# These GenBank contigs (GJ058435.1 and GJ058447.1) were added in the most
# recent version of the Bos_taurus_UMD_3.1.1 assembly - the one with accession
# number GCF_000003055.6.  Our bosTau8 is build from GCF_000003055.5.
# Handling this by dropping those two contigs from the build.

    tawk '{print $2}' cantLiftUpSeqNames.txt > ignoreDbSnpContigs.txt
    echo 'ignoreDbSnpContigsFile ignoreDbSnpContigs.txt' >> config.ra

    cat >> config.ra <<EOF
liftUp suggested.lft
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >> do.log 2>&1 &
    tail -f do.log

# *** All done!

    zcat snp148.bed.gz \
    | ~/kent/src/hg/utils/automation/categorizeSnps.pl
#Mult:     120755
#Common:   10469
#Flagged:  0
#leftover: 99100955
    # 10,469 SNPs out of ~100M that meet the threshold for Common... that's
    # pretty sparse.  Not sure it's worth pushing out as a separate subtrack.

##############################################################################
# LIFTOVER TO bosTau9 (DONE - 2018-11-07 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau8/bed/blat.bosTau9.2018-11-07
    cd /hive/data/genomes/bosTau8/bed/blat.bosTau9.2018-11-07
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/bosTau8/jkStuff/bosTau8.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau8 bosTau9) > do.log 2>&1 &
    # real    933m14.111s

    # verify the convert link on the test browser is now
    # active from bosTau8 to bosTau9

##############################################################################