# for emacs: -*- mode: sh; -*-

# This file describes browser build for the papAnu3

#########################################################################
# reuse photograph from papAnu2 previous versions
#    (DONE - 2017-06-20 - Hiram)

mkdir /hive/data/genomes/papAnu3
cd /hive/data/genomes/papAnu3
cp -p ../papAnu2/photoReference.txt .

cat photoReference.txt

photoCreditURL  http://www.oumedicine.com/pathology/general-program-info/faculty-staff/roman-f-wolf-dvm
photoCreditName Roman Wolf, University of Oklahoma Health Sciences Center

#########################################################################
#  Initial steps (DONE - 2017-06-20 - Hiram)

# To start this initialBuild.txt document, from a previous assembly document:

mkdir ~/kent/src/hg/makeDb/doc/papAnu3
cd ~/kent/src/hg/makeDb/doc/papAnu3

# best to use a most recent document since it has the latest features and
# procedures:
 sed -e 's/xenTro9/papAnu3/g; s/XenTro9/PapAnu3/g; s/DONE/TBD/g;' ../xenTro9/initialBuild.txt  > initialBuild.txt

mkdir /hive/data/genomes/papAnu3/refseq
cd /hive/data/genomes/papAnu3/refseq

rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Papio_anubis/all_assembly_versions/GCF_000264685.2_Panu_2.0/ ./

# sent 2600 bytes  received 4190503044 bytes  10183488.81 bytes/sec
# total size is 4189981785  speedup is 1.00

# check assembly size for later reference:

faSize G*_2.0_genomic.fna.gz

# 2948397226 bases (55130419 N's 2893266807 real 1783082117 upper
#	1110184690 lower) in 63251 sequences in 1 files
# Total size: mean 46614.2 sd 2497421.7 min 200 (NW_003880573.1)
#	max 220367699 (NC_018152.1) median 1416
# %37.65 masked total, %38.37 masked real


# this information is from the top of 
#    papAnu3/refseq/GCF_000264685.2_Panu_2.0_assembly_report.txt

# Assembly name:  Panu_2.0
# Organism name:  Papio anubis (olive baboon)
# Isolate:  1X1155
# Sex:  female
# Taxid:          9555
# BioSample:      SAMN02981400
# BioProject:     PRJNA169345
# Submitter:      Baylor College of Medicine
# Date:           2013-2-4
# Synonyms:       papAnu2       
# Assembly type:  haploid
# Release type:   
# Assembly level: Chromosome
# Genome representation: full
# WGS project:    AHZZ01
# Assembly method: CABOG v. 6.1; ATLAS-LINK v. 1.0; ATLAS-GAPFILL v. 2.0
# Genome coverage: 2.5x Sanger; 4.5x 454; 85x Illumina
# Sequencing technology: Sanger 3730; 454 FLX; Illumina
# RefSeq category: Representative Genome
# GenBank assembly accession: GCA_000264685.1
# RefSeq assembly accession: GCF_000264685.2
# RefSeq assembly and GenBank assemblies identical: no
#
## Assembly-Units:
## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
## GCA_000264695.1      GCF_000264695.1 Primary Assembly
##      GCF_000749495.1 non-nuclear

#############################################################################
# establish config.ra file (DONE - Hiram - 2017-06-20)
    # arguments here are: <db> <clade> <trackDbDir> <assembly_report.txt>
    cd /hive/data/genomes/papAnu3
    $HOME/kent/src/hg/utils/automation/prepConfig.pl papAnu3 vertebrate \
        baboon ./refseq/*_assembly_report.txt > papAnu3.config.ra

    # fixup common name to remain compatible with previous papAnu2 version
    # and therefore orderKey wasn't correct.  Also fixup shortLabel
    # to check orderKey:
    hgsql -e 'select name,orderKey,organism from dbDb order by orderKey;' \
        hgcentraltest | less

    # verify it looks sane
    cat papAnu3.config.ra
# config parameters for makeGenomeDb.pl:
db papAnu3
clade vertebrate
genomeCladePriority 70
scientificName Papio anubis
commonName Baboon
assemblyDate Feb. 2013
assemblyLabel Baylor College of Medicine
assemblyShortLabel Baylor Panu_2.0
orderKey 2049
# mitochondrial sequence included in refseq release
# mitoAcc NC_020006.2
mitoAcc none
fastaFiles /hive/data/genomes/papAnu3/ucsc/*.fa.gz
agpFiles /hive/data/genomes/papAnu3/ucsc/*.agp
# qualFiles none
dbDbSpeciesDir baboon
photoCreditURL  http://www.oumedicine.com/pathology/general-program-info/faculty-staff/roman-f-wolf-dvm
photoCreditName Roman Wolf, University of Oklahoma Health Sciences Center
ncbiGenomeId 394
ncbiAssemblyId 399268
ncbiAssemblyName Panu_2.0
ncbiBioProject 169345
ncbiBioSample SAMN02981400
genBankAccessionID GCF_000264685.2
taxId 9555

#############################################################################
# setup UCSC named files (DONE - 2017-06-20 - Hiram)

    mkdir /hive/data/genomes/papAnu3/ucsc
    cd /hive/data/genomes/papAnu3/ucsc

    # check for duplicate sequences:
    time faToTwoBit -noMask ../refseq/G*_2.0_genomic.fna.gz refseq.2bit
    #  real    1m15.720s

    twoBitDup refseq.2bit
    # no output is a good result, otherwise, would have to eliminate duplicates
    # the scripts creating the fasta here will be using this refseq.2bit file

    ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
       ../refseq/G*_2.0_genomic.fna.gz \
          ../refseq/G*_2.0_assembly_structure/Primary_Assembly
# NC_018152.1 chr1
# NC_018153.1 chr2
# NC_018154.1 chr3
# NC_018155.1 chr4
# NC_018156.1 chr5
# NC_018157.1 chr6
# NC_018158.1 chr7
# NC_018159.1 chr8
# NC_018160.1 chr9
# NC_018161.1 chr10
# NC_018162.1 chr11
# NC_018163.1 chr12
# NC_018164.1 chr13
# NC_018165.1 chr14
# NC_018166.1 chr15
# NC_018167.1 chr16
# NC_018168.1 chr17
# NC_018169.1 chr18
# NC_018170.1 chr19
# NC_018171.1 chr20
# NC_018172.1 chrX

    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
# processed 63229 sequences into chrUn.fa.gz
#   real    23m8.123s

# there are no unlocalized sequences
#    time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \
#       ../refseq/*_assembly_structure/Primary_Assembly

    # bash syntax here
    # the accession is the last word of the commented-out "# mitoAcc" line
    # in the config.ra file
    mitoAcc=$(grep "^# mitoAcc" ../papAnu3.config.ra | awk '{print $NF}')
    printf "# mitoAcc %s\n" "${mitoAcc}"
# mitoAcc NC_020006.2

    # chrM AGP: drop the comment lines from the non-nuclear component AGP
    # and rename the leading accession to chrM
    zcat ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \
      | grep -v "^#" \
      | sed -e "s/^${mitoAcc}/chrM/;" > chrM.agp

    # chrM fasta: a ">chrM" header line followed by the unmasked sequence
    # lines for the accession, taken from refseq.2bit
    {
      printf ">chrM\n"
      twoBitToFa -noMask "refseq.2bit:${mitoAcc}" stdout | grep -v "^>"
    } > chrM.fa
    gzip chrM.fa

    # verify fasta and AGPs agree
    time faToTwoBit *.fa.gz test.2bit
    # real    1m20.981s

    time cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4
    # All AGP and FASTA entries agree - both files are valid
    # real    0m44.106s

    # and no sequence lost from original:
    twoBitToFa test.2bit stdout | faSize stdin
# 2948397226 bases (55130419 N's 2893266807 real 2893266807 upper 0 lower)
#	in 63251 sequences in 1 files
# Total size: mean 46614.2 sd 2497421.7 min 200 (chrUn_NW_003880573v1)
#	max 220367699 (chr1) median 1416
# %0.00 masked total, %0.00 masked real

    # same numbers as above
# 2948397226 bases (55130419 N's 2893266807 real 1783082117 upper
#	1110184690 lower) in 63251 sequences in 1 files
# Total size: mean 46614.2 sd 2497421.7 min 200 (NW_003880573.1)
#	max 220367699 (NC_018152.1) median 1416

    # no longer need these temporary 2bit files
    rm refseq.2bit test.2bit

#############################################################################
#  Initial database build (DONE - 2017-06-20 - Hiram)

    cd /hive/data/genomes/papAnu3
    # verify sequence and AGP are OK:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp papAnu3.config.ra) > agp.log 2>&1
    # real    3m14.288s
    # verify there was no error in that step:
    tail agp.log
    #  *** All done!  (through the 'agp' step)

    # then finish it off:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db papAnu3.config.ra) > db.log 2>&1
    # real    24m55.338s

    # check in the trackDb files created in TemporaryTrackDbCheckout/
    #    and add papAnu3 to trackDb/makefile

    # temporary symlink until masked sequence is available
    cd /hive/data/genomes/papAnu3
    ln -s `pwd`/papAnu3.unmasked.2bit /gbdb/papAnu3/papAnu3.2bit

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2017-06-20 - Hiram)
    mkdir /hive/data/genomes/papAnu3/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/papAnu3/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/papAnu3/papAnu3.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku papAnu3) > do.log 2>&1
    # real    10m10.843s

    cat fb.papAnu3.cpgIslandExtUnmasked.txt
    # 39762268 bases of 2893270787 (1.374%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2017-06-20 - Hiram)
    mkdir /hive/data/genomes/papAnu3/bed/cytoBand
    cd /hive/data/genomes/papAnu3/bed/cytoBand
    makeCytoBandIdeo.csh papAnu3

#########################################################################
# ucscToINSDC and ucscToRefSeq table/track (DONE - 2017-06-20 - Hiram)
    # the sequence here is working for a 'refseq' assembly
    # beware: the chrM handling may need to be adjusted depending upon what
    # is available in the assembly

    mkdir /hive/data/genomes/papAnu3/bed/ucscToINSDC
    cd /hive/data/genomes/papAnu3/bed/ucscToINSDC

    # find accession for chrM
    grep chrM ../../papAnu3.agp
# chrM    1       16516   1       O       NC_020006.2     1       16516   +

    # find the genbank accession for NC_020006.2 at Entrez nucleotide
    # The NC_020006.2 name is the RefSeq name, the genbank name is: JX946196.2
    # the assembly_report does not have this JX name since the chrM sequence
    # is not in the genbank assembly:
    grep NC_020006.2 ../../refseq/GCF*_2.0_assembly_report.txt
# MT      assembled-molecule      MT      Mitochondrion   na      <>      NC_020006.2     non-nuclear     16516   na

    # if there is a chrM, use its INSDC name as a second argument:
    # this is a RefSeq assembly, use the chrM refSeq name:
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
      ../../refseq/GCF_*structure/Primary_Assembly NC_020006.2

    # this is actually ucscToRefSeq since this is a RefSeq assembly
    sort -k2 ucscToINSDC.txt > ucscToRefSeq.txt
    rm -f ucscToINSDC.txt
    awk '{printf "%s\t%s\n", $2, $1}' ucscToRefSeq.txt \
       | sort > refSeqToUcsc.txt

    # chrM processing needs special help, fixup with the sed
    # extract the refseq vs. genbank names from the assembly_report
    # columns 5 and 7 are the INSDC and RefSeq names
    # refseq.insdc.txt: two tab-separated columns, sorted.  The cut takes
    # columns 5 and 7 of the non-comment assembly_report lines, the awk swaps
    # them so the column 7 value comes first, and the sed substitutes
    # JX946196.2 for the first 'na' found on a line.
    grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \
      | awk '{printf "%s\t%s\n", $2, $1}' | sed -e 's/na/JX946196.2/' \
          | sort > refseq.insdc.txt

    # ucsc.coordinate.tab: name, 0, size from columns 1 and 2 of chrom.sizes,
    # sorted so it can be used as a join input
    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > ucsc.coordinate.tab
    # first join: column 1 of refseq.insdc.txt against column 2 of
    # ucscToRefSeq.txt; the tr turns the spaces join emits into tabs (the
    # brackets map to themselves).  After the sort on column 3 the second
    # join matches that column against column 1 of ucsc.coordinate.tab,
    # giving: name, 0, size, then the two columns of refseq.insdc.txt.
    # NOTE(review): column 3 of the first join is presumably the UCSC name
    # from ucscToRefSeq.txt - confirm against the ucscToINSDC.sh output.
    # Fields 1-4 keep the first refseq.insdc.txt column as the bed name.
    join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-4 \
           > ucscToRefSeq.bed

    # same pipeline, but fields 1-3,5 keep the second refseq.insdc.txt
    # column as the bed name instead
    join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \
           > ucscToINSDC.bed

    # verify chrM is correct:
    grep chrM *.bed
    # ucscToINSDC.bed:chrM    0       16516   JX946196.2
    # ucscToRefSeq.bed:chrM   0       16516   NC_020006.2

    # should be same line counts throughout:
    # (the note about a line missing due to a removed duplicate contig does
    # not apply to this assembly: every file below has 63251 lines)
    wc -l *
    #	63251 refSeqToUcsc.txt
    #	63251 refseq.insdc.txt
    #	63251 ucsc.coordinate.tab
    #	63251 ucscToINSDC.bed
    #	63251 ucscToRefSeq.bed
    #	63251 ucscToRefSeq.txt

    export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    # 20
    # use the $chrSize in this sed
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab papAnu3 ucscToINSDC stdin ucscToINSDC.bed
    # should be the same for ucscToRefSeq:
    export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    #  20
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab papAnu3 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    # checkTableCoords should be silent
    checkTableCoords papAnu3
    # each should cover %100 entirely:
    featureBits -countGaps papAnu3 ucscToINSDC
    # 2948397226 bases of 2948397226 (100.000%) in intersection

    featureBits -countGaps papAnu3 ucscToRefSeq
    # 2948397226 bases of 2948397226 (100.000%) in intersection

#########################################################################
# add chromAlias table (DONE - 2017-06-20 - Hiram)

    mkdir /hive/data/genomes/papAnu3/bed/chromAlias
    cd /hive/data/genomes/papAnu3/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' papAnu3 \
        | sort -k1,1 > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' papAnu3 \
        | sort -k1,1 > ucsc.genbank.tab

    awk '{printf "%s\t%s\n", $1,$2}' ../ensLift/ucscToEns.txt \
        | sort -k1,1 > ucsc.ensembl.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl

    hgLoadSqlTab papAnu3 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        papAnu3.chromAlias.tab

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2017-06-20 - Hiram)
    # the trackDb directory for this assembly is 'baboon' (the trackDbDir
    # given to prepConfig.pl and dbDbSpeciesDir in papAnu3.config.ra),
    # not the 'xenTro' directory left over from the template document
    cd ~/kent/src/hg/makeDb/trackDb/baboon/papAnu3

    # preview prefixes and suffixes:
    # the sed removes the first run of digits from each frag name so that
    # uniq -c counts the distinct prefix/suffix patterns
    hgsql -N -e "select frag from gold;" papAnu3 \
      | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c
#  198931 AHZZ.1
#       1 NC_.2

    # implies a rule: '[AN][CH][Z_][Z0-9]+(\.[0-9]+)?'

    # verify this rule will find them all and eliminate them all:
    hgsql -N -e "select frag from gold;" papAnu3 | wc -l
    # 198932

    hgsql -N -e "select frag from gold;" papAnu3 \
       | egrep -e '[AN][CH][Z_][Z0-9]+(\.[0-9]+)?' | wc -l
    # 198932

    hgsql -N -e "select frag from gold;" papAnu3 \
       | egrep -v -e '[AN][CH][Z_][Z0-9]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/baboon/papAnu3/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AN][CH][Z_][Z0-9]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

##########################################################################
# running repeat masker (DONE - 2017-06-20 - Hiram)
    mkdir /hive/data/genomes/papAnu3/bed/repeatMasker
    cd /hive/data/genomes/papAnu3/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku papAnu3) > do.log 2>&1 &
    # real    752m10.908s

    cat faSize.rmsk.txt
# 2948397226 bases (55130419 N's 2893266807 real 1378626320 upper
#	1514640487 lower) in 63251 sequences in 1 files
# Total size: mean 46614.2 sd 2497421.7 min 200 (chrUn_NW_003880573v1)
#	max 220367699 (chr1) median 1416
# %51.37 masked total, %52.35 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    # CC   RELEASE 20140131;                                            *

    time featureBits -countGaps papAnu3 rmsk
    # 1515828879 bases of 2948397226 (51.412%) in intersection
    # real    1m14.038s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the faSize count above
    #   separates out the N's from the bases, it doesn't show lower case N's

    # faster way to get the same result on high contig count assemblies:
    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' papAnu3 \
        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
    #	total 1515828879.000000
    #	real    0m39.569s

##########################################################################
# running simple repeat (DONE - 2017-06-20 - Hiram)

    mkdir /hive/data/genomes/papAnu3/bed/simpleRepeat
    cd /hive/data/genomes/papAnu3/bed/simpleRepeat
    # using trf409 5 here guessing smaller genome (human == 6)
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        -trf409 5 papAnu3) > do.log 2>&1 &
    # real    36m31.858s

    cat fb.simpleRepeat
    # 163817450 bases of 2893270787 (5.662%) in intersection

    # adding this trfMask to the other masking
    cd /hive/data/genomes/papAnu3

    # when using the Window Masker result:
#    twoBitMask bed/windowMasker/papAnu3.cleanWMSdust.2bit \
#       -add bed/simpleRepeat/trfMask.bed  papAnu3.2bit
    #   you can safely ignore the warning about fields >= 13

    # when using Rmsk results, add to rmsk after it is done:
    twoBitMask papAnu3.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed papAnu3.2bit
    #   you can safely ignore the warning about fields >= 13

    twoBitToFa papAnu3.2bit stdout | faSize stdin > faSize.papAnu3.2bit.txt
    cat faSize.papAnu3.2bit.txt
# 2948397226 bases (55130419 N's 2893266807 real 1377167920 upper
#	1516098887 lower) in 63251 sequences in 1 files
# Total size: mean 46614.2 sd 2497421.7 min 200 (chrUn_NW_003880573v1)
#	max 220367699 (chr1) median 1416
# %51.42 masked total, %52.40 masked real

    # reset the symlink
    rm /gbdb/papAnu3/papAnu3.2bit
    ln -s `pwd`/papAnu3.2bit /gbdb/papAnu3/papAnu3.2bit

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2017-06-21 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/papAnu3/bed/microsat
    cd /cluster/data/papAnu3/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed papAnu3 microsat microsat.bed
    # Read 28500 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2017-06-21 - Hiram)

    mkdir /hive/data/genomes/papAnu3/bed/windowMasker
    cd /hive/data/genomes/papAnu3/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev papAnu3) > do.log 2>&1
    # real    237m57.966s

    # Masking statistics
    cat faSize.papAnu3.cleanWMSdust.txt
# 2948397226 bases (55130419 N's 2893266807 real 1766616003 upper 1126650804 lower) in 63251 sequences in 1 files
# Total size: mean 46614.2 sd 2497421.7 min 200 (chrUn_NW_003880573v1) max 220367699 (chr1) median 1416
# %38.21 masked total, %38.94 masked real

    cat fb.papAnu3.rmsk.windowmaskerSdust.txt
    # 897063458 bases of 2948397226 (30.425%) in intersection

##########################################################################
# run up idKeys files for ncbiRefSeq (DONE - 2017-06-21 - Hiram)
    mkdir /hive/data/genomes/papAnu3/bed/idKeys
    cd /hive/data/genomes/papAnu3/bed/idKeys

    time (doIdKeys.pl -buildDir=`pwd`  papAnu3) > do.log 2>&1 &
    # real    70m22.608s

    cat papAnu3.keySignature.txt
    #  23214ba62fc59aeee20827fb5be54544

##########################################################################
# cpgIslands - (DONE - 2017-06-21 - Hiram)
    mkdir /hive/data/genomes/papAnu3/bed/cpgIslands
    cd /hive/data/genomes/papAnu3/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku papAnu3) > do.log 2>&1 &
    # real    17m26.948s

    cat fb.papAnu3.cpgIslandExt.txt
    # 21276357 bases of 2893270787 (0.735%) in intersection

##############################################################################
# genscan - (DONE - 2017-06-21 - Hiram)
    mkdir /hive/data/genomes/papAnu3/bed/genscan
    cd /hive/data/genomes/papAnu3/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku papAnu3) > do.log 2>&1 &
    # real    197m24.943s

    cat fb.papAnu3.genscan.txt
    # 52318577 bases of 2893270787 (1.808%) in intersection

    cat fb.papAnu3.genscanSubopt.txt
    # 51120501 bases of 2893270787 (1.767%) in intersection

#############################################################################
# augustus gene track (DONE - 2017-06-21 - Hiram)

    mkdir /hive/data/genomes/papAnu3/bed/augustus
    cd /hive/data/genomes/papAnu3/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
     -species=human -dbHost=hgwdev -workhorse=hgwdev papAnu3) > do.log 2>&1 &
    # real    119m13.121s

    cat fb.papAnu3.augustusGene.txt
    # 47314970 bases of 2893270787 (1.635%) in intersection

#############################################################################
# lastz/chain/net swap human/hg38 (DONE - 2017-06-21 - Hiram)
    # original alignment
    cd /hive/data/genomes/hg38/bed/lastzPapAnu3.2017-06-21

    cat fb.hg38.chainPapAnu3Link.txt
    # 2632683317 bases of 3049335806 (86.336%) in intersection

    # and for the swap:
    mkdir /hive/data/genomes/papAnu3/bed/blastz.hg38.swap
    cd /hive/data/genomes/papAnu3/bed/blastz.hg38.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/hg38/bed/lastzPapAnu3.2017-06-21/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    232m5.391s

    cat fb.papAnu3.chainHg38Link.txt
    # 2501550280 bases of 2893270787 (86.461%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` papAnu3 hg38) \
      > rbest.log 2>&1
    # real    446m1.402s

#########################################################################
# lastz/chain/net swap mouse/mm10 (DONE - 2017-06-21 - Hiram)

    # alignment to mouse/mm10:
    cd /hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21
    cat fb.mm10.chainPapAnu3Link.txt
    #	910628118 bases of 2652783500 (34.327%) in intersection

    mkdir /hive/data/genomes/papAnu3/bed/blastz.mm10.swap
    cd /hive/data/genomes/papAnu3/bed/blastz.mm10.swap
    time (doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1
    #	real    66m35.501s

    cat fb.papAnu3.chainMm10Link.txt
    #	897929517 bases of 2893270787 (31.035%) in intersection

    # doRecipBest.pl for papAnu3 vs. mm10, run in the background with all
    # output going to rbest.log.  -workhorse is given once (it was duplicated
    # here before) and the options precede the two database names, the same
    # form as the hg38 doRecipBest.pl command in this document.
    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` papAnu3 mm10) \
      > rbest.log 2>&1 &
    # real    578m46.893s

##############################################################################
# Create kluster run files (DONE - 2017-06-21 - Hiram)

    # numerator is papAnu3 gapless bases "real" as reported by:
    featureBits -noRandom -noHap papAnu3 gap
    # 42059064 bases of 2682285126 (1.568%) in intersection
    #                   ^^^

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2682285126 / 2861349177 \) \* 1024
    #  ( 2682285126 / 2861349177 ) * 1024 = 959.917787

    # ==> use -repMatch=1025 same as papAnu2
    cd /hive/data/genomes/papAnu3
    blat papAnu3.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/papAnu3.11.ooc \
        -repMatch=1025
    #   Wrote 29129 overused 11-mers to jkStuff/papAnu3.11.ooc
    # papAnu2 at repMatch=1025 was:
    #   Wrote 29128 overused 11-mers to jkStuff/papAnu2.11.ooc

    #   check non-bridged gaps to see what the typical size is:
    hgsql -N \
        -e 'select * from gap where bridge="no" order by size;' papAnu3 \
        | sort -k7,7nr | ave -col=7 stdin
# Q1 100.000000
# median 100.000000
# Q3 100.000000
# average 100.000000
# min 100.000000
# max 100.000000
# count 9250
# total 925000.000000
# standard deviation 0.000000

    # all these gap sizes are 100
    gapToLift -verbose=2 -minGap=100 papAnu3 jkStuff/papAnu3.nonBridged.lft \
       -bedFile=jkStuff/papAnu3.nonBridged.bed

#########################################################################
# LIFTOVER TO papAnu2 (DONE - 2017-06-23 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/papAnu3/bed/blat.papAnu2.2017-06-23
    cd /hive/data/genomes/papAnu3/bed/blat.papAnu2.2017-06-23
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/papAnu3/jkStuff/papAnu3.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         papAnu3 papAnu2) > do.log 2>&1
    # real    396m5.063s

    # verify the convert link on the test browser is now active from papAnu3 to
    # papAnu2

#########################################################################
# LIFTOVER TO papAnu4 (DONE - 2018-01-08 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/papAnu3/bed/blat.papAnu4.2018-01-08
    cd /hive/data/genomes/papAnu3/bed/blat.papAnu4.2018-01-08
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/papAnu3/jkStuff/papAnu3.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         papAnu3 papAnu4) > do.log 2>&1 &
    # real    579m54.169s

    # verify the convert link on the test browser is now active from papAnu3 to
    # papAnu4

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2017-06-21 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism             mrnaCnt   estCnt  refSeqCnt
    # Papio anubis    512     145626  505
    # Papio anubis anubis     6       0       0
    # Papio cynocephalus      23      0       0
    # Papio cynocephalus x Papio anubis       3       0       0
    # Papio hamadryas 102     0       0
    # Papio hamadryas hamadryas       1       0       0
    # Papio papio     10      0       0

    # edit etc/genbank.conf to add papAnu3 just before papAnu2
# Baboon
# papAnu3 (baboon)
papAnu3.serverGenome = /hive/data/genomes/papAnu3/papAnu3.2bit
papAnu3.clusterGenome = /hive/data/genomes/papAnu3/papAnu3.2bit
papAnu3.ooc = /hive/data/genomes/papAnu3/jkStuff/papAnu3.11.ooc
# the lift file name must match the file written by the gapToLift command
# in this document: jkStuff/papAnu3.nonBridged.lft (suffix .lft, not .lift)
papAnu3.lift = /hive/data/genomes/papAnu3/jkStuff/papAnu3.nonBridged.lft
papAnu3.perChromTables = no
papAnu3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
papAnu3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
papAnu3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
papAnu3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
papAnu3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
papAnu3.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
papAnu3.downloadDir = papAnu3
# DO NOT NEED genbank.mrna.xeno except for human, mouse
# defaults yes: genbank.mrna.native.load, genbank.mrna.native.loadDesc,
# genbank.est.native.load, refseq.mrna.native.load, refseq.mrna.native.loadDesc,
# refseq.mrna.xeno.load , refseq.mrna.xeno.loadDesc

    git commit -m 'adding papAnu3 baboon refs #19660' etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial papAnu3
    # logFile: var/build/logs/2017.06.21-10:02:02.papAnu3.initalign.log
    #   real    422m26.969s


    tail -2 var/build/logs/2017.06.21-10:02:02.papAnu3.initalign.log
    # hgwdev 2017.06.21-17:01:26 papAnu3.initalign: Succeeded: papAnu3
    # hgwdev 2017.06.21-17:04:29 papAnu3.initalign: finish

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.papAnu3

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad papAnu3
    # logFile: var/dbload/hgwdev/logs/2017.06.21-18:22:49.papAnu3.dbload.log
    #  real    18m30.415s

    tail -1 var/dbload/hgwdev/logs/2017.06.21-18:22:49.papAnu3.dbload.log
    #  hgwdev 2017.06.21-18:41:19 papAnu3.dbload: finish

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add papAnu3 to:
    #   etc/align.dbs etc/hgwdev.dbs
    git add etc/align.dbs etc/hgwdev.dbs
    git commit -m 'adding papAnu3 to the update alignments refs #19660' etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#############################################################################
# ncbiRefSeq (TBD - 2016-05-13 - Hiram)

    # NOTE(review): this section is still TBD.  The doNcbiRefSeq.pl command
    # lines used to carry the chicken arguments (vertebrate_other
    # Gallus_gallus GCF_000002315.4_Gallus_gallus-5.0) from the template
    # document; they now name the baboon RefSeq assembly that this build
    # was made from: vertebrate_mammalian Papio_anubis
    # GCF_000264685.2_Panu_2.0, the same path components as the refseq
    # rsync at the start of this document.
    # The timings and featureBits numbers recorded below were not produced
    # by a papAnu3 run (the featureBits denominator 1218501075 is not the
    # papAnu3 size of 2893270787 seen in the other fb.*.txt results);
    # replace them when this step is actually run.

    mkdir /hive/data/genomes/papAnu3/bed/ncbiRefSeq
    cd /hive/data/genomes/papAnu3/bed/ncbiRefSeq
    # running step wise as this script is still under development
    # step 1: stop after the download step
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Papio_anubis \
      GCF_000264685.2_Panu_2.0 papAnu3) > download.log 2>&1
    # real    16m29.536s

    # step 2: only the process step
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=process -bigClusterHub=ku -dbHost=hgwdev \
      -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Papio_anubis \
      GCF_000264685.2_Panu_2.0 papAnu3) > process.log 2>&1
    # real    3m58.858s

    # step 3: only the load step
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=load -bigClusterHub=ku -dbHost=hgwdev \
      -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Papio_anubis \
      GCF_000264685.2_Panu_2.0 papAnu3) > load.log 2>&1
    # real    0m33.205s

    cat fb.ncbiRefSeq.papAnu3.txt
    #  82563006 bases of 1218501075 (6.776%) in intersection

    featureBits -enrichment papAnu3 refGene ncbiRefSeq
    # refGene 1.181%, ncbiRefSeq 6.776%, both 1.175%, cover 99.49%,
    #    enrich 14.68x

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2017-06-23 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("papAnu3", "blat1c", "17892", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("papAnu3", "blat1c", "17893", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## reset default position to same as what hg38 has
##  (DONE - 2017-06-23 - Hiram)

    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chr1:13964116-14118692"
	where name="papAnu3";' hgcentraltest

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2017-06-23 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=papAnu3 -tableCoverage all.joiner
    joinerCheck -database=papAnu3 -times all.joiner
    joinerCheck -database=papAnu3 -keys all.joiner

    cd /hive/data/genomes/papAnu3
    time (makeDownloads.pl -workhorse=hgwdev papAnu3) > downloads.log 2>&1
    #  real    26m28.250s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/papAnu3/pushQ
    cd /hive/data/genomes/papAnu3/pushQ
    time (makePushQSql.pl -redmineList papAnu3) > papAnu3.pushQ.sql 2> stderr.out
    #  real    3m42.117s

    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: papAnu3 does not have seq
    # WARNING: papAnu3 does not have extFile

    # add the path names to the listing files in the redmine issue
    # in the three appropriate entry boxes:

/hive/data/genomes/papAnu3/pushQ/redmine.papAnu3.file.list
/hive/data/genomes/papAnu3/pushQ/redmine.papAnu3.releaseLog.txt
/hive/data/genomes/papAnu3/pushQ/redmine.papAnu3.table.list

#########################################################################