# for emacs: -*- mode: sh; -*-

# This file describes the browser build for calMil1
# Callorhinchus milii - Elephant shark

#	DATE:   11-Dec-2013
#	ORGANISM:       Callorhinchus milii
#	TAXID:  7868
#	ASSEMBLY LONG NAME:     Callorhinchus_milii-6.1.3
#	ASSEMBLY SHORT NAME:    Callorhinchus_milii-6.1.3
#	ASSEMBLY SUBMITTER:  Institute of Molecular and Cell Biology, Singapore
#	ASSEMBLY TYPE:  Haploid
#	NUMBER OF ASSEMBLY-UNITS:       1
#	ASSEMBLY ACCESSION:     GCA_000165045.2

#	FTP-RELEASE DATE: 12-Dec-2013


#       rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Callorhinchus_milii/Callorhinchus_milii-6.1.3/

#	Mitochondrial sequence: NC_014285


#############################################################################
# fetch sequence from genbank (DONE - 2014-03-04 - Hiram)
    mkdir -p /hive/data/genomes/calMil1/genbank
    cd /hive/data/genomes/calMil1/genbank

    rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Callorhinchus_milii/Callorhinchus_milii-6.1.3/ ./

    # measure sequence to be used here  (there will be the chrMT later ...)
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz

    # 974481817 bases (37545129 N's 936936688 real 936936688 upper 0 lower)
    # in 21203 sequences in 1 files
    # Total size: mean 45959.6 sd 516515.0 min 66
    # (gi|564405813|gb|AAVX02067420.1|)
    # max 18507834 (gi|564982704|gb|KI635855.1|) median 1428
    # %0.00 masked total, %0.00 masked real


#############################################################################
# fixup names for UCSC standards (DONE - 2014-03-04 - Hiram)
    cd /hive/data/genomes/calMil1
    $HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl calMil1
    # constructs ./ucsc/ directory here:
# -rw-rw-r-- 1   6131987 Mar  4 12:24 calMil1.ucsc.agp
# -rw-rw-r-- 1 285809423 Mar  4 12:30 calMil1.ucsc.fa.gz
# -rw-rw-r-- 1       203 Mar  4 12:30 checkAgp.result.txt

#############################################################################
#  Initial database build (DONE - 2014-04-01 - Hiram)

    cd /hive/data/genomes/calMil1
    cat << '_EOF_' > calMil1.config.ra
# Config parameters for makeGenomeDb.pl:
db calMil1
clade vertebrate
genomeCladePriority 70
scientificName Callorhinchus milii
commonName Elephant shark
assemblyDate Dec. 2013
assemblyLabel Institute of Molecular and Cell Biology, Singapore
assemblyShortLabel Callorhinchus_milii-6.1.3
orderKey 4796
mitoAcc NC_014285.1
fastaFiles /hive/data/genomes/calMil1/ucsc/calMil1.ucsc.fa.gz
agpFiles /hive/data/genomes/calMil1/ucsc/calMil1.ucsc.agp
dbDbSpeciesDir calMil
photoCreditURL http://www.flagstaffotos.com.au/
photoCreditName Flagstaff Fotos/Wikipedia
ncbiGenomeId 689
ncbiAssemblyId 85971
ncbiAssemblyName Callorhinchus_milii-6.1.3
ncbiBioProject 18361
genBankAccessionID GCA_000165045.2
taxId 7868
'_EOF_'
    # << happy emacs

    # stepwise to verify sequence and AGP file
    makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
	-stop=seq calMil1.config.ra > seq.log 2>&1

    # verify sequence and AGP are OK:
    makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
	-continue=agp -stop=agp calMil1.config.ra > agp.log 2>&1

    # then finish it off:
    makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
	-continue=db calMil1.config.ra > db.log 2>&1
    #  real    22m15.793s
    makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
	-continue=dbDb calMil1.config.ra > dbDb.log 2>&1

##########################################################################
# running repeat masker (DONE - 2014-04-01 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/repeatMasker
    cd /hive/data/genomes/calMil1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=ku calMil1 > do.log 2>&1 &
    #    real    65m15.324s

    cat faSize.rmsk.txt
    #  974498586 bases (37545143 N's 936953443 real 675030843 upper
    #    261922600 lower) in 21204 sequences in 1 files
    #  Total size: mean 45958.2 sd 516502.9 min 66 (AAVX02067420)
    #    max 18507834 (KI635855) median 1428
    #  %26.88 masked total, %27.95 masked real

    egrep -i "versi|relea" do.log
#  RepeatMasker version open-4.0.3
#    June 20 2013 (open-4-0-3) version of RepeatMasker
#  CC   RELEASE 20130422;     

    featureBits -countGaps calMil1 rmsk
    #   262125804 bases of 974498586 (26.899%) in intersection

    # Why does this differ from the faSize count above?
    # Because rmsk masks some N's as well as bases; the faSize count above
    #	reports the N's separately from the bases and does not show
    #	lower case N's.

##########################################################################
# running simple repeat (DONE - 2014-04-01 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/simpleRepeat
    cd /hive/data/genomes/calMil1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
	calMil1 > do.log 2>&1 &
    #  real    34m39.070s
    cat fb.simpleRepeat 
    #   25376243 bases of 936953458 (2.708%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/calMil1
    twoBitMask calMil1.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed calMil1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa calMil1.2bit stdout | faSize stdin > faSize.calMil1.2bit.txt
    cat faSize.calMil1.2bit.txt
#  974498586 bases (37545143 N's 936953443 real 673857728 upper
#     263095715 lower) in 21204 sequences in 1 files
#  Total size: mean 45958.2 sd 516502.9 min 66 (AAVX02067420)
#     max 18507834 (KI635855) median 1428
#  %27.00 masked total, %28.08 masked real

    rm /gbdb/calMil1/calMil1.2bit
    ln -s `pwd`/calMil1.2bit /gbdb/calMil1/calMil1.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2014-04-01 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/gap
    cd /hive/data/genomes/calMil1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../calMil1.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m9.753s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits -countGaps calMil1 -not gap -bed=notGap.bed

    time featureBits calMil1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   15 bases of 936953458 (0.000%) in intersection
    #  real    7m11.392s

    # not enough to worry about, in fact they are all on chrM
    # nothing to do, take a look at felCat5.txt for an example
    # of what to do here with the new gaps

##########################################################################
## WINDOWMASKER (DONE - 2014-04-01 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/windowMasker
    cd /hive/data/genomes/calMil1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev calMil1 > do.log 2>&1 &
    #  real    60m16.423s

    # Masking statistics
    cat faSize.calMil1.cleanWMSdust.txt
# 974498586 bases (37545143 N's 936953443 real 579835761 upper
#    357117682 lower) in 21204 sequences in 1 files
# Total size: mean 45958.2 sd 516502.9 min 66 (AAVX02067420)
#   max 18507834 (KI635855) median 1428
# %36.65 masked total, %38.11 masked real

    # how much do the window masker and repeat masker results overlap:
    # if RM finished before this got here, the answer is in:
    cat fb.calMil1.rmsk.windowmaskerSdust.txt
    #   226835726 bases of 974498586 (23.277%) in intersection

    # or, if WM finished first, that measurement failed; since it was the
    # last step of the procedure, run it manually:
    featureBits -countGaps calMil1 rmsk windowmaskerSdust
    #   226835726 bases of 974498586 (23.277%) in intersection

    # plus, if it failed, run the clean step to completely finish WM

#############################################################################
# cytoBandIdeo - (DONE - 2014-04-01 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/cytoBand
    cd /hive/data/genomes/calMil1/bed/cytoBand
    makeCytoBandIdeo.csh calMil1

##########################################################################
# cpgIslands - (DONE - 2014-04-02 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/cpgIslands
    cd /hive/data/genomes/calMil1/bed/cpgIslands
    time doCpgIslands.pl calMil1 > do.log 2>&1 &
    # real    27m23.569s

    cat fb.calMil1.cpgIslandExt.txt
    #   8426291 bases of 936953458 (0.899%) in intersection

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2014-04-01 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/calMil1/bed/cpgIslandsUnmasked

    # run stepwise so the loading can be done in a different table
    time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -stop=makeBed \
          -maskedSeq=/hive/data/genomes/calMil1/calMil1.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku calMil1 > makeBed.log 2>&1
    #  real    4m40.925s

    # debug load step so it can be loaded into a separate table:
    time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -debug -continue=load \
          -maskedSeq=/hive/data/genomes/calMil1/calMil1.unmasked.2bit \
        -workhorse=hgwdev -smallClusterHub=ku calMil1
    # edit doLoadCpg.csh and change the table name to load: cpgIslandExtUnmasked

    time ./doLoadCpg.csh > load.log 2>&1
    #   Read 35561 elements of size 10 from cpgIsland.bed
    #   real    0m8.166s

    cat fb.calMil1.cpgIslandExtUnmasked.txt 
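    # 33531280 bases of 2286657046 (1.466%) in intersection
    # NOTE(review): the 2286657046 denominator does not match the 936953458
    #	non-gap bases reported by the other featureBits results in this
    #	file - confirm this line was recorded from calMil1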

    time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -continue=cleanup \
          -maskedSeq=/hive/data/genomes/calMil1/calMil1.unmasked.2bit \
        -workhorse=hgwdev -smallClusterHub=ku calMil1
    # real    0m50.679s

#########################################################################
# genscan - (DONE - 2014-04-02 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/genscan
    cd /hive/data/genomes/calMil1/bed/genscan
    time doGenscan.pl calMil1 > do.log 2>&1  &
    # real    35m25.695s
    # three broken jobs, run manually on hgwdev with window size 2000000
    time doGenscan.pl -continue=makeBed -buildDir=`pwd` calMil1 \
      > makeBed.log 2>&1
    # real    3m26.488s

    cat fb.calMil1.genscan.txt
    #   38945528 bases of 936953458 (4.157%) in intersection
    cat fb.calMil1.genscanSubopt.txt
    #   28097880 bases of 936953458 (2.999%) in intersection

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2014-04-02 - Hiram)
    # Use -repMatch=400, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 936953443 / 2897316137 \) \* 1024
    # ( 936953443 / 2897316137 ) * 1024 = 331.147959

    # round up to 400

    cd /hive/data/genomes/calMil1
    blat calMil1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/calMil1.11.ooc -repMatch=400
    #	Wrote 18953 overused 11-mers to jkStuff/calMil1.11.ooc

    # there are *only* bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" calMil1 | sort | uniq -c
    #    46217 yes

#########################################################################
# AUTO UPDATE GENBANK (WORKING - 2014-04-02 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism                mrnaCnt estCnt  refSeqCnt
# Callorhinchus milii     35299   237666  0

    # to decide which "native" mrna or ests you want to specify in genbank.conf
    # it appears that calMil1 has plenty of native ESTs

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add calMil1 following balAcu1
# calMil1 (Elephant shark)
calMil1.serverGenome = /hive/data/genomes/calMil1/calMil1.2bit
calMil1.clusterGenome = /hive/data/genomes/calMil1/calMil1.2bit
calMil1.ooc = /hive/data/genomes/calMil1/jkStuff/calMil1.11.ooc
calMil1.lift = no
calMil1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
calMil1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
calMil1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
calMil1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
calMil1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
calMil1.refseq.mrna.native.load = no
calMil1.refseq.mrna.xeno.load = yes
calMil1.genbank.mrna.xeno.load = no
calMil1.genbank.est.native.load = yes
calMil1.genbank.mrna.native.load = yes
calMil1.genbank.mrna.native.loadDesc = no
calMil1.downloadDir = calMil1
calMil1.perChromTables = no

    # end of section added to etc/genbank.conf
    # and edit src/lib/gbGenome.c to add new species.
    git commit -m "adding calMil1 Elephant shark refs #12976" \
        etc/genbank.conf src/lib/gbGenome.c
    git push
    make etc-update
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen			# long running job managed in screen
    cd /cluster/data/genbank
    time ./bin/gbAlignStep -initial calMil1 &
    #	var/build/logs/2014.04.02-09:58:24.calMil1.initalign.log
    #	real    634m0.768s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad calMil1 &
    #	logFile: var/dbload/hgwdev/logs/2014.04.03-12:17:27.calMil1.dbload.log
    #	real    18m53.557s

    # enable daily alignment and update of hgwdev (TBD - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add calMil1 to: etc/align.dbs etc/hgwdev.dbs
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added calMil1 to daily hgwdev build refs #12976" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
# set default position showing the missing SPP gene (DONE - 2014-04-04 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="KI635875:1934002-2090480" where name="calMil1";' \
	hgcentraltest

#########################################################################
# create ucscToINSDC name mapping (DONE - 2014-04-03 - Hiram)
    mkdir /hive/data/genomes/calMil1/bed/ucscToINSDC
    cd /hive/data/genomes/calMil1/bed/ucscToINSDC

    # this script has been maturing over time; it is close to complete.
    # to find the latest copy of it:
    # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh

    cp -p /hive/data/genomes/balAcu1/bed/ucscToINSDC/translateNames.sh .
    ./translateNames.sh
    # it says:
# need to find chrM accessions
    # so add this one:
    echo -e 'chrM\tNC_014285.1' >> ucscToINSDC.txt
    # needs to be sorted to work with join
    sort ucscToINSDC.txt > ucscToINSDC.tab

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \
        > name.coordinate.tab

    join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
# 12

    # use the 12 in this sed:
    sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab calMil1 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords calMil1 ucscToINSDC
    featureBits -countGaps calMil1 ucscToINSDC
    # 974498586 bases of 974498586 (100.000%) in intersection

##############################################################################
# construct download files (DONE - 2014-04-03 - Hiram)
    # after db name has been added to all.joiner and
    # joinerCheck -database=calMil1 -keys all.joiner
    # is clean

    cd /hive/data/genomes/calMil1
    time makeDownloads.pl -workhorse=hgwdev -dbHost=hgwdev calMil1 \
       > downloads.log 2>&1
    # real    10m14.722s

##############################################################################
# pushQ entry (DONE - 2014-04-04 - Hiram)
    mkdir /hive/data/genomes/calMil1/pushQ
    cd /hive/data/genomes/calMil1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl calMil1 2> stderr.txt | grep -v transMap > calMil1.sql
    #   real    1m54.437s

    scp -p calMil1.sql qateam@hgwbeta:/tmp
    ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/calMil1.sql'

###########################################################################
## blat server turned on (DONE - 2014-04-17 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
#	add its entries to blatServers in hgcentraltest:
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("calMil1", "blat4c", "17854", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("calMil1", "blat4c", "17855", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################