# for emacs: -*- mode: sh; -*-

# This file describes browser build for the anoGam3
#
#  Can use existing photograph (otherwise find one before starting here)

#########################################################################
#  Initial steps, find photograph (DONE - 2017-12-19 - Hiram)
#  Photo obtained from CDC: https://phil.cdc.gov/Details.aspx?pid=444

# To start this initialBuild.txt document, from a previous assembly document:

mkdir ~/kent/src/hg/makeDb/doc/anoGam3
cd ~/kent/src/hg/makeDb/doc/anoGam3

sed -e 's/danRer11/anoGam3/g; s/DanRer11/AnoGam3/g; s/DONE/TBD/g;' ../danRer11/initialBuild.txt > initialBuild.txt

mkdir -p /hive/data/genomes/anoGam3/refseq
cd /hive/data/genomes/anoGam3/refseq
time rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Anopheles_gambiae/all_assembly_versions/GCF_000005575.2_AgamP3/ ./

# sent 871 bytes  received 334800319 bytes  13665354.69 bytes/sec
# total size is 334755420  speedup is 1.00
# real    0m24.010s

printf "photoCreditURL\thttps://phil.cdc.gov/Details.aspx?pid=444
photoCreditName\tphoto courtesy CDC/James D. Gathany\n" > photoReference.txt

# From top of refseq/GCF_000005575.2_AgamP3_assembly_report.txt

# Assembly name:  AgamP3
# Organism name:  Anopheles gambiae str. PEST (African malaria mosquito)
# Infraspecific name:  strain=PEST
# Taxid:          180454
# BioSample:      SAMN02952903
# BioProject:     PRJNA1438
# Submitter:      The International Consortium for the Sequencing of Anopheles Genome
# Date:           2006-10-16
# Assembly type:  haploid
# Release type:   major
# Assembly level: Chromosome
# Genome representation: full
# WGS project:    AAAB01
# RefSeq category: Representative Genome
# GenBank assembly accession: GCA_000005575.1
# RefSeq assembly accession: GCF_000005575.2
# RefSeq assembly and GenBank assemblies identical: no
#
## Assembly-Units:
## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
## GCA_000005585.1      GCF_000005585.2 Primary Assembly
##      GCF_000183175.1 non-nuclear

#############################################################################
# establish config.ra file (DONE - 2017-12-19 - Hiram)
    cd /hive/data/genomes/anoGam3
  ~/kent/src/hg/utils/automation/prepConfig.pl anoGam3 invertebrate anopheles \
       refseq/*_assembly_report.txt > anoGam3.config.ra

    # change the scientificName from Anopheles gambiae str. PEST
    # to simply: Anopheles gambiae
    # and reset the fastaFiles and agpFiles names to:
    # ucsc/chr*.fa.gz and ucsc/chr*.agp since there will be some extra files
    # in ucsc/*

    # verify it looks sane
    cat anoGam3.config.ra
# config parameters for makeGenomeDb.pl:
db anoGam3
clade insect
genomeCladePriority 70
scientificName Anopheles gambiae
commonName African malaria mosquito
assemblyDate Oct. 2006
assemblyLabel The International Consortium for the Sequencing of Anopheles Genome
assemblyShortLabel AgamP3
orderKey 1237
# mitochondrial sequence included in refseq release
# mitoAcc NC_002084.1
mitoAcc none
fastaFiles /hive/data/genomes/anoGam3/ucsc/chr*.fa.gz
agpFiles /hive/data/genomes/anoGam3/ucsc/chr*.agp
# qualFiles none
dbDbSpeciesDir anopheles
photoCreditURL  https://phil.cdc.gov/Details.aspx?pid=444
photoCreditName photo courtesy CDC/James D. Gathany
ncbiGenomeId 46
ncbiAssemblyId 305108
ncbiAssemblyName AgamP3
ncbiBioProject 1438
ncbiBioSample SAMN02952903
genBankAccessionID GCF_000005575.2
taxId 180454

#############################################################################
# setup UCSC named files (DONE - 2017-12-19 - Hiram)

    mkdir /hive/data/genomes/anoGam3/ucsc
    cd /hive/data/genomes/anoGam3/ucsc
    # measure what is in the refseq release:
    faSize ../refseq/*3_genomic.fna.gz

# 265027044 bases (12572948 N's 252454096 real 197549571 upper 54904525 lower) in 8090 sequences in 1 files
# Total size: mean 32759.8 sd 1187387.3 min 349 (NW_158375.1) max 61545105 (NT_078266.2) median 1523
# %20.72 masked total, %21.75 masked real

    # check for duplicate sequences:
    time faToTwoBit -noMask ../refseq/*3_genomic.fna.gz refseq.2bit
    #  real    0m52.302s
    twoBitDup refseq.2bit
    # there are 49 duplicates, will need to eliminate them
    twoBitDup refseq.2bit > dup.list
    awk '{print $1}' dup.list | sort > eliminate.list


    # new option required to ucscCompositeAgp.pl 2016-04-13
    time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
     ../refseq/*3_genomic.fna.gz ../refseq/*_assembly_structure/Primary_Assembly
# NC_004818.2 chrX
# NT_078265.2 chr2L
# NT_078266.2 chr2R
# NT_078267.5 chr3L
# NT_078268.4 chr3R
# real    1m21.125s

    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
	../refseq/*_assembly_structure/Primary_Assembly

    time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \
	../refseq/*_assembly_structure/Primary_Assembly

    # the chrUn sequence has the duplicates:
    faToTwoBit chrUn.fa.gz chrUn.2bit
    twoBitDup chrUn.2bit | awk '{print $1}' > duplicates.chrUn.list
    mv chrUn.fa.gz dups.chrUn.fa.gz
    faSomeRecords -exclude dups.chrUn.fa.gz \
	duplicates.chrUn.list stdout | gzip -c > chrUn.fa.gz
    mv chrUn.agp dups.chrUn.agp
    egrep -v -f duplicates.chrUn.list dups.chrUn.agp > chrUn.agp
    rm -f chrUn.2bit

    # also need chrM:
    echo ">chrM" > chrM.fa
    zgrep -v "^>" ../refseq/GCF_000005575.2_AgamP3_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz >> chrM.fa
    gzip chrM.fa

    zgrep -v "^#" ../refseq/GCF_000005575.2_AgamP3_assembly_structure/non-nuclear/assembled_chromosomes/AGP/chrMT.comp.agp.gz \
	| sed -e 's/NC_002084.1/chrM/;' > chrM.agp

    # verify sequence and AGP match:
    faToTwoBit chr*.fa.gz test.2bit
    cat chr*.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -2
    # Valid Fasta file entry
    # All AGP and FASTA entries agree - both files are valid

    # and we have all the sequence as the original (minus the duplicates)
    twoBitToFa test.2bit stdout | faSize stdin
# 264974304 bases (12572948 N's 252401356 real 252401356 upper 0 lower)
#	in 8041 sequences in 1 files

    # original refseq was:
# 265027044 bases (12572948 N's 252454096 real 197549571 upper 54904525 lower)
#	in 8090 sequences in 1 files

    # 49 missing sequences is correct:
    calc 8090 - 8041
    #  8090 - 8041 = 49.000000

    # size of duplicate sequence:
    faSomeRecords dups.chrUn.fa.gz duplicates.chrUn.list stdout | faSize stdin
# 52740 bases (0 N's 52740 real 52740 upper 0 lower) in 49 sequences in 1 files

    # amount of removed sequence is correct:

    calc 265027044 - 52740
    # 265027044 - 52740 = 264974304.000000

    # no longer need these
    rm refseq.2bit test.2bit

#############################################################################
#  Initial database build (DONE - 2017-12-19 - Hiram)

    cd /hive/data/genomes/anoGam3
    # verify sequence and AGP are OK:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp anoGam3.config.ra) > agp.log 2>&1
    # verify OK:
    tail agp.log
    # *** All done!  (through the 'agp' step)
    # real    0m26.234s

    # then finish it off:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db anoGam3.config.ra) > db.log 2>&1
    # real    11m9.465s

    # had to fixup names in dbDb to get it into the same pull-down menu
    # on the gateway page as is anoGam1:
hgsql -e 'update dbDb set organism="A. gambiae" where name like "anoGam3";' hgcentraltest
hgsql -e 'update dbDb set genome="A. gambiae" where name like "anoGam3";' hgcentraltest

    # check in the trackDb files created in TemporaryTrackDbCheckout/
    #    and add anoGam3 to trackDb/makefile

    # temporary symlink until masked sequence is available
    cd /hive/data/genomes/anoGam3
    ln -s `pwd`/anoGam3.unmasked.2bit /gbdb/anoGam3/anoGam3.2bit

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2017-12-19 - Hiram)
    mkdir /hive/data/genomes/anoGam3/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/anoGam3/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/anoGam3/anoGam3.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku anoGam3) > do.log 2>&1
    # real    1m45.616s

    cat fb.anoGam3.cpgIslandExtUnmasked.txt
    # 21871673 bases of 264424304 (8.271%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2017-12-19 - Hiram)
    mkdir /hive/data/genomes/anoGam3/bed/cytoBand
    cd /hive/data/genomes/anoGam3/bed/cytoBand
    makeCytoBandIdeo.csh anoGam3

#########################################################################
# ucscToRefSeq table/track (DONE - 2017-12-19 - Hiram)
    # the sequence here is working for a 'refseq' assembly with a chrM
    # situation may be specific depending upon what is available in the assembly

    mkdir /hive/data/genomes/anoGam3/bed/ucscToRefSeq
    cd /hive/data/genomes/anoGam3/bed/ucscToRefSeq

    # find accession for chrM
    grep chrM ../../anoGam3.agp
# chrM    1       15363   1       O       NC_002084.1     1       15363   +

    # use that accession here:
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
        ../../refseq/GCF_*structure/Primary_Assembly NC_002084.1
    # this is actually refseq, not genbank, and need to remove duplicates:
    egrep -v -f ../../ucsc/duplicates.chrUn.list ucscToINSDC.txt \
	> ucscToRefSeq.txt
    rm ucscToINSDC.txt

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    join -t$'\t' name.coordinate.tab ucscToRefSeq.txt > ucscToRefSeq.bed

    # should all be the same number of items:
    wc -l *
    # 8041 name.coordinate.tab
    # 8041 ucscToRefSeq.bed
    # 8041 ucscToRefSeq.txt

    cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1
    # 23
    # use the 23 in this sed
    sed -e "s/21/23/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab anoGam3 ucscToRefSeq stdin ucscToRefSeq.bed

    checkTableCoords anoGam3
    # should cover %100 entirely:
    featureBits -countGaps anoGam3 ucscToRefSeq
    # 264974304 bases of 264974304 (100.000%) in intersection

#########################################################################
# ucscToINSDC table/track (DONE - 2017-12-19 - Hiram)
    # using the idKeys constructed

    mkdir /hive/data/genomes/anoGam3/bed/ucscToINSDC
    cd /hive/data/genomes/anoGam3/bed/ucscToINSDC

    twoBitDup ../idKeys/genbank/genbank.anoGam3.2bit \
	| awk '{print $1}' > genbank.duplicate.list
    join -t$'\t' ../idKeys/anoGam3.idKeys.txt \
	../idKeys/genbank/refseqAnoGam3.idKeys.txt \
	| egrep -v -f genbank.duplicate.list | cut -f2- \
	| sort > ucscToINSDC.txt

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    join -t$'\t' name.coordinate.tab ucscToINSDC.txt > ucscToINSDC.bed
    # lookup the chrM sequence an Entrez nuclotide NC_002084.1 -> L20934.1
    # adding this chrM to the .bed file:
    grep chrM ../../*.sizes
    # chrM    15363
    printf "chrM\t0\t15363\tL20934.1\n" >> ucscToINSDC.bed

    # should all be the same row counts
    wc -l *
    # 8041 name.coordinate.tab
    # 8041 ucscToINSDC.bed
    # 8040 ucscToINSDC.txt

    # the ucscToINSDC.txt is missing the chrM identifier

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 23
    # use the 23 in this sed
    sed -e "s/21/23/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab anoGam3 ucscToINSDC stdin ucscToINSDC.bed

    checkTableCoords anoGam3
    # should cover %100 entirely:
    featureBits -countGaps anoGam3 ucscToINSDC
    # 264974304 bases of 264974304 (100.000%) in intersection

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2017-12-19 - Hiram)
    cd ~/kent/src/hg/makeDb/trackDb/anopheles/anoGam3
    # preview prefixes and suffixes:
    hgsql -N -e "select frag from gold;" anoGam3 \
      | sed -e 's/[0-9][0-9]*//; s/\.[0-9]\+$//' | sort | uniq -c
#    8115 AAAB
#       1 NC_

    # implies a rule: '[AN][ABC_]+[0-9]+(\.[0-9]+)?'

    # verify this rule will find them all and eliminate them all:
    hgsql -N -e "select frag from gold;" anoGam3 | wc -l
    # 8116

    hgsql -N -e "select frag from gold;" anoGam3 \
	| egrep -e '[AN][ABC_]+[0-9]+(\.[0-9]+)?' | wc -l
    # 8116

    hgsql -N -e "select frag from gold;" anoGam3 \
	| egrep -v -e '[AN][ABC_]+[0-9]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/zebrafish/anoGam3/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AN][ABC_]+[0-9]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

##########################################################################
# running repeat masker (DONE - 2017-12-19 - Hiram)
    mkdir /hive/data/genomes/anoGam3/bed/repeatMasker
    cd /hive/data/genomes/anoGam3/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku anoGam3) > do.log 2>&1
    # real    60m3.969s

    cat faSize.rmsk.txt
# 264974304 bases (12572948 N's 252401356 real 216585988 upper 35815368 lower) in 8041 sequences in 1 files
# Total size: mean 32952.9 sd 1190997.5 min 349 (chrUn_NW_158375v1) max 61545105 (chr2R) median 1529
# %13.52 masked total, %14.19 masked real


    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
	# grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker
	#    January 31 2015 (open-4-0-5) version of RepeatMasker
	# grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
	# CC   RELEASE 20140131;

    time featureBits -countGaps anoGam3 rmsk
    # 35874857 bases of 264974304 (13.539%) in intersection
    # real    0m5.461s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the faSize count above
    #   separates out the N's from the bases, it doesn't show lower case N's

    # faster way to get the same result for high contig count assemblies:
    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' anoGam3 \
        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
    # total 35874857.000000
    # real    0m1.798s

##########################################################################
# running simple repeat (DONE - 2017-12-19 - Hiram)

    mkdir /hive/data/genomes/anoGam3/bed/simpleRepeat
    cd /hive/data/genomes/anoGam3/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        -trf409 3 anoGam3) > do.log 2>&1
    # real    5m39.782s

    cat fb.simpleRepeat
    # 6341048 bases of 264424304 (2.398%) in intersection

    cd /hive/data/genomes/anoGam3
    # using the Window Masker result:
    #twoBitMask bed/windowMasker/anoGam3.cleanWMSdust.2bit \
    #   -add bed/simpleRepeat/trfMask.bed  anoGam3.2bit
    #   you can safely ignore the warning about fields >= 13

    # add to rmsk after it is done:
    twoBitMask anoGam3.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed anoGam3.2bit
    #   you can safely ignore the warning about fields >= 13
    twoBitToFa anoGam3.2bit stdout | faSize stdin > faSize.anoGam3.2bit.txt
    cat faSize.anoGam3.2bit.txt
# 264974304 bases (12572948 N's 252401356 real 216346487 upper 36054869 lower) in 8041 sequences in 1 files
# Total size: mean 32952.9 sd 1190997.5 min 349 (chrUn_NW_158375v1) max 61545105 (chr2R) median 1529
# %13.61 masked total, %14.28 masked real

    rm /gbdb/anoGam3/anoGam3.2bit
    ln -s `pwd`/anoGam3.2bit /gbdb/anoGam3/anoGam3.2bit

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2017-12-19 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/anoGam3/bed/microsat
    cd /cluster/data/anoGam3/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed anoGam3 microsat microsat.bed
    # Read 3959 elements of size 4 from microsat.bed
    featureBits -countGaps anoGam3 microsat
    # 226144 bases of 264974304 (0.085%) in intersection

##########################################################################
## WINDOWMASKER (DONE - 2017-12-19 - Hiram)

    mkdir /hive/data/genomes/anoGam3/bed/windowMasker
    cd /hive/data/genomes/anoGam3/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev anoGam3) > do.log 2>&1
    # real    12m20.317s

    # Masking statistics
    cat faSize.anoGam3.cleanWMSdust.txt
# 264974304 bases (12572948 N's 252401356 real 196003243 upper 56398113 lower) in 8041 sequences in 1 files
# Total size: mean 32952.9 sd 1190997.5 min 349 (chrUn_NW_158375v1) max 61545105 (chr2R) median 1529
# %21.28 masked total, %22.34 masked real

    cat fb.anoGam3.rmsk.windowmaskerSdust.txt
    # 19279994 bases of 264974304 (7.276%) in intersection

##########################################################################
# run up idKeys files for chromAlias (DONE - 2017-12-19 - Hiram)
    mkdir /hive/data/genomes/anoGam3/bed/idKeys
    cd /hive/data/genomes/anoGam3/bed/idKeys

    time (doIdKeys.pl -buildDir=`pwd` -twoBit=/hive/data/genomes/anoGam3/anoGam3.unmasked.2bit anoGam3) > do.log 2>&1 &
    # real    5m6.175s

    cat anoGam3.keySignature.txt
    #   ae5975a59b0fd10409ed20280a0188df

    # genbank and flybase idKeys were also run up here in ./genbank/
    # and ./flybase/

##########################################################################
# cpgIslands - (DONE - 2017-12-19 - Hiram)
    mkdir /hive/data/genomes/anoGam3/bed/cpgIslands
    cd /hive/data/genomes/anoGam3/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku anoGam3) > do.log 2>&1
    # real    1m15.884s

    cat  fb.anoGam3.cpgIslandExt.txt
    # 19703582 bases of 264424304 (7.452%) in intersection

##############################################################################
# genscan - (DONE - 2017-12-19 - Hiram)
    mkdir /hive/data/genomes/anoGam3/bed/genscan
    cd /hive/data/genomes/anoGam3/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku anoGam3) > do.log 2>&1
    # real    34m10.304s

    cat fb.anoGam3.genscan.txt
    # 35064324 bases of 264424304 (13.261%) in intersection

    cat fb.anoGam3.genscanSubopt.txt
    # 2488274 bases of 264424304 (0.941%) in intersection

#############################################################################
# Create kluster run files (DONE - 2017-12-20 - Hiram)

    # numerator is anoGam3 gapless bases "real" as reported by:
    featureBits -noRandom -noHap anoGam3 gap
    # 550000 bases of 229932020 (0.239%) in intersection
    #                   ^^^

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 229932020 / 2861349177 \) \* 1024
    #  ( 229932020 / 2861349177 ) * 1024 = 82.286493

    # ==> use -repMatch=100 according to size scaled down from 1024 for human.
    #   and rounded up to nearest 100
    cd /hive/data/genomes/anoGam3
    blat anoGam3.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/anoGam3.11.ooc \
        -repMatch=100
    # Wrote 10653 overused 11-mers to jkStuff/anoGam3.11.ooc


    #   check non-bridged gaps to see what the typical size is:
    hgsql -N \
        -e 'select * from gap where bridge="no" order by size;' anoGam3 \
        | sort -k7,7nr | ave -col=7 stdin
    # all these non-bridged gaps are 10,000
    gapToLift -verbose=2 -minGap=10000 anoGam3 jkStuff/nonBridged.lft \
        -bedFile=jkStuff/nonBridged.bed

########################################################################
# GENBANK AUTO UPDATE (DONE - 2017-12-20 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # organism               mrnaCnt estCnt  refSeqCnt
    # Anopheles gambiae       64151   153332  0

    # edit etc/genbank.conf to add anoGam3 just before rheMac2
# anoGam3 (A. gambiae)
anoGam3.serverGenome = /hive/data/genomes/anoGam3/anoGam3.2bit
anoGam3.clusterGenome = /hive/data/genomes/anoGam3/anoGam3.2bit
anoGam3.ooc = /hive/data/genomes/anoGam3/jkStuff/anoGam3.11.ooc
anoGam3.lift = /hive/data/genomes/anoGam3/jkStuff/nonBridged.lft
anoGam3.perChromTables = no
anoGam3.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
anoGam3.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
anoGam3.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
anoGam3.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
anoGam3.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
# DO NOT NEED genbank.mrna.xeno except for human, mouse
# anoGam1 had genbank.mrna.xeno.load as yes
# defaults yes: genbank.mrna.native.load, genbank.mrna.native.loadDesc,
# genbank.est.native.load, refseq.mrna.native.load, refseq.mrna.native.loadDesc,
# refseq.mrna.xeno.load , refseq.mrna.xeno.loadDesc
anoGam3.genbank.mrna.xeno.load = yes
anoGam3.downloadDir = anoGam3

    git commit -m "Added anoGam3; no redmine" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial anoGam3
    # logFile: var/build/logs/2017.12.20-08:12:09.anoGam3.initalign.log

    tail var/build/logs/2017.12.20-08:12:09.anoGam3.initalign.log
# hgwdev 2017.12.20-18:35:09 anoGam3.initalign: Succeeded: anoGam3
# hgwdev 2017.12.20-18:40:13 anoGam3.initalign: finish

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.anoGam3

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad anoGam3
    # logFile: var/dbload/hgwdev/logs/2017.12.20-19:54:42.anoGam3.dbload.log
    # real    71m51.995s
    tail -1 var/dbload/hgwdev/logs/2017.12.20-19:54:42.anoGam3.dbload.log
    # hgwdev 2017.12.20-21:06:34 anoGam3.dbload: finish

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add anoGam3 to:
    #   etc/align.dbs etc/hgwdev.dbs
    git add etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added anoGam3 - malaria mosquito refs #20743" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#############################################################################
# augustus gene track (DONE - 2017-12-20 - Hiram)

    mkdir /hive/data/genomes/anoGam3/bed/augustus
    cd /hive/data/genomes/anoGam3/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
        -species=zebrafish -dbHost=hgwdev \
           -workhorse=hgwdev anoGam3) > do.log 2>&1
    # real    65m0.735s

    cat fb.anoGam3.augustusGene.txt
    # 20832482 bases of 264424304 (7.878%) in intersection

#########################################################################
# add chromAlias table (DONE - 2017-12-20 - Hiram)

    mkdir /hive/data/genomes/anoGam3/bed/chromAlias
    cd /hive/data/genomes/anoGam3/bed/chromAlias

    # working up new procedure here to allow in the duplicates from the
    # others to the single names at UCSC

    # the corresponding lists with duplicates are made from the idKeys
    # run up for the various assemblies:
    join -t$'\t' ../idKeys/anoGam3.idKeys.txt \
	../../../anoGam2/flybase/both.idKeys.txt | cut -f2- \
	| sort > ucsc.ensembl.tab

    (printf "chrM\tL20934.1\n"; join -t$'\t'  ../idKeys/anoGam3.idKeys.txt \
	../idKeys/genbank/refseqAnoGam3.idKeys.txt | cut -f2-) \
	| sort > ucsc.genbank.tab

    join -t$'\t'  ../idKeys/anoGam3.idKeys.txt \
	../idKeys/refseq/refseqAnoGam3.idKeys.txt | cut -f2- \
	| sort > ucsc.refseq.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.genbank.tab \
	ucsc.refseq.tab ucsc.ensembl.tab \
	  > anoGam3.chromAlias.tab

    # verify chrM is OK:
    grep chrM anoGam3.chromAlias.tab
# L20934.1        chrM    genbank
# Mt      chrM    ensembl
# NC_002084.1     chrM    refseq

for t in refseq genbank ensembl
do
  c0=`cat ucsc.$t.tab | wc -l`
  c1=`grep $t anoGam3.chromAlias.tab | wc -l`
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  printf "# checking $t: $c0 =? $c1 $ok\n"
done
# checking refseq: 8090 =? 8090 OK
# checking genbank: 8090 =? 8090 OK
# checking ensembl: 8090 =? 8090 OK

    hgLoadSqlTab anoGam3 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        anoGam3.chromAlias.tab

##############################################################################
# lastz/chain/net swap D. melanogaster/dm6 (DONE - 2017-12-20 - Hiram)

    # alignment to dm6
    cd /hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20

    cat fb.dm6.chainAnoGam3Link.txt
    # 19641460 bases of 142573024 (13.776%) in intersection

    cat fb.dm6.chainRBestAnoGam3Link.txt
    # 15964959 bases of 142573024 (11.198%) in intersection

    #   running the swap
    mkdir /hive/data/genomes/anoGam3/bed/blastz.dm6.swap
    cd /hive/data/genomes/anoGam3/bed/blastz.dm6.swap
    time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20/DEF \
        -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > swap.log 2>&1
    # real    2m0.901s

    cat fb.anoGam3.chainDm6Link.txt
    # 20168390 bases of 264424304 (7.627%) in intersection

    time (doRecipBest.pl -load -workhorse=hgwdev \
	-buildDir=`pwd` anoGam3 dm6) > rBest.log 2>&1 &
    # real    46m7.441s

    cat fb.anoGam3.chainRBestDm6Link.txt
    # 16040573 bases of 264424304 (6.066%) in intersection

###########################################################################
# LIFTOVER TO anoGam1 (DONE - 2017-12-20 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/anoGam3/bed/blat.anoGam1.2017-12-20
    cd /hive/data/genomes/anoGam3/bed/blat.anoGam1.2017-12-20
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/anoGam3/jkStuff/anoGam3.11.ooc \
         anoGam3 anoGam1) > doLiftOverToAnoGam1.log 2>&1
    # real    20m45.302s

    # see if the liftOver menus function in the browser from anoGam3 to anoGam1

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2017-12-20 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev
   # Starting trans gfServer for anoGam3 on host blat1d, port 17890
   # Starting untrans gfServer for anoGam3 on host blat1d, port 17891

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("anoGam3", "blat1d", "17890", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("anoGam3", "blat1d", "17891", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## reset default position to same as anoGam1
##  (DONE - 2017-12-21 - Hiram)

    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chr2R:6479796-6482329"
	where name="anoGam3";' hgcentraltest

############################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2017-12-20 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=anoGam3 -tableCoverage all.joiner
    joinerCheck -database=anoGam3 -times all.joiner
    joinerCheck -database=anoGam3 -keys all.joiner

    cd /hive/data/genomes/anoGam3
    time (makeDownloads.pl anoGam3) > downloads.log 2>&1
    #  real    2m14.288s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/anoGam3/pushQ
    cd /hive/data/genomes/anoGam3/pushQ
    time (makePushQSql.pl -redmineList anoGam3) > anoGam3.pushQ.sql 2> stderr.out
    # real    3m51.650s

    # the chainRBest and chainSyn tables do not go out, they are
    # not on the redmine tables listing.  The reciprocalBest download
    # files are released and are on the redmine files listing
# WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
# supporting and genbank tables) which tracks to assign these tables to:
#   chainRBestDm6
#   chainRBestDm6Link
#   chainSynDm6
#   chainSynDm6Link
#   netRBestDm6
#   netSynDm6

    # verify the file listings are valid, should be no output to stderr:
    cat redmine.anoGam3.file.list \
	| while read L; do ls -ogL $L; done  > /dev/null

    # to verify the database.table list is correct, should be the same
    # line count for these two commands:
    wc -l redmine.anoGam3.table.list
    # 62 redmine.anoGam3.table.list
    awk -F'.' '{
printf "hgsql -N -e \"show table status like '"'"'%s'"'"';\" %s\n", $2, $1
}' redmine.anoGam3.table.list | while read L; do eval $L; done | wc -l
    # 62

    # Put a path to the files produced here in the redmine ticket

#############################################################################
# Ensembl genes (DONE - 2018-01-06 - Hiram)
    # after chromAlias work is done:

    cd /hive/data/genomes/anoGam3/jkStuff
    join -t$'\t' <(sort -k1,1 ../chrom.sizes) \
      <(sort ../bed/chromAlias/ucsc.ensembl.tab) \
        | awk '{printf "0\t%s\t%d\t%s\t%d\n", $3,$2,$1,$2}' > ensToUcsc.lift

    mkdir /hive/data/genomes/anoGam3/bed/ensGene.37/download
    cd /hive/data/genomes/anoGam3/bed/ensGene.37/download

    wget --tries=2 --user=anonymous --password=ucscGenomeBrowser@ucsc.edu \
ftp://ftp.ensemblgenomes.org/pub/metazoa/release-37/gtf/anopheles_gambiae/Anopheles_gambiae.AgamP4.37.gtf.gz \
-O Anopheles_gambiae.AgamP4.37.gtf.gz

    wget --tries=2 --user=anonymous --password=ucscGenomeBrowser@ucsc.edu \
ftp://ftp.ensemblgenomes.org/pub/metazoa/release-37/fasta/anopheles_gambiae/pep/Anopheles_gambiae.AgamP4.pep.all.fa.gz \
-O Anopheles_gambiae.AgamP4.pep.all.fa.gz


    mkdir /hive/data/genomes/anoGam3/bed/ensGene.37/process
    cd /hive/data/genomes/anoGam3/bed/ensGene.37/process
    printf '#!/usr/bin/env perl

use strict;
use warnings;

open (FH, "<ucscToFlyBase.lift") or die "can not read ucscToFlyBase.lift";
while (my $line = <FH>) {
  chomp $line;
  my ($start, $contig, $size, $flyName, $flySize) = split('"'"'\\t'"'"', $line);
  printf "%%s\\t%%d\\t%%d\\t%%s\\t0\\t+\\n", $flyName, $start, $start+$size, $contig;
}
close (FH);
' > liftToLiftAcross.pl

    chmod +x liftToLiftAcross.pl
    ln -s ../../flybase/ucscToFlyBase.lift .

    # procedure here is patterned after:
    #     /hive/data/genomes/vicPac1/bed/ensGene.91/process/doProcess.csh
    # with the names changed and added special liftAcross/liftUp

zcat ../download/Anopheles_gambiae.AgamP4.37.gtf.gz \
        | gzip > allGenes.gtf.gz
gtfToGenePred -includeVersion -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
    | gzip > anoGam3.allGenes.gp.gz
/cluster/bin/scripts/extractGtf.pl infoOut.txt > ensGtp.tab
/cluster/bin/scripts/ensemblInfo.pl infoOut.txt > ensemblToGeneName.tab
/cluster/bin/scripts/extractSource.pl allGenes.gtf.gz | sort -u > ensemblSource.tab
set NL = `awk 'BEGIN{max=0}{if (length($1) > max) max=length($1)}END{print max}' ensemblToGeneName.tab`
set VL = `awk 'BEGIN{max=0}{if (length($2) > max) max=length($2)}END{print max}' ensemblToGeneName.tab`
sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \
      /cluster/home/hiram/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql
sed -e "s/name(15)/name($NL)/" \
      /cluster/home/hiram/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql
mv anoGam3.allGenes.gp.gz anoGam3.allGenes.beforeLiftAcross.gp.gz

./liftToLiftAcross.pl | gzip -c > anoGam3.ensGene.liftAcross.gz
liftAcross -warn -bedOut=ensemblGeneScaffolds.anoGam3.bed \
    anoGam3.ensGene.liftAcross.gz \
        anoGam3.allGenes.beforeLiftAcross.gp.gz \
            anoGam3.allGenes.beforeLiftUp.gp >& liftAcross.err.out

# one gene has bad cds business
# Error: anoGam3.allGenes.gp:15373: AGAP012697-RA cdsEnd 3384 not in tx bounds 1961-2981
# Error: anoGam3.allGenes.gp:15373: AGAP012697-RA cdsEnd 3384 not in an exon
# checked: 15633 failed: 2

liftUp -extGenePred -type=.gp stdout \
  /hive/data/genomes/anoGam3/jkStuff/ensToUcsc.lift carry \
    anoGam3.allGenes.beforeLiftUp.gp | grep -v AGAP012697-RA \
       > anoGam3.allGenes.gp

gzip ensemblGeneScaffolds.anoGam3.bed anoGam3.allGenes.gp liftAcross.err.out \
    anoGam3.allGenes.beforeLiftUp.gp

genePredCheck -db=anoGam3 anoGam3.allGenes.gp.gz
# construct bigBed and index for assembly hub
mkdir -p bbi
genePredToBed anoGam3.allGenes.gp.gz stdout | sort -k1,1 -k2,2n > anoGam3.ensGene.bed
bedToBigBed -extraIndex=name anoGam3.ensGene.bed ../../../chrom.sizes bbi/anoGam3.ensGene.bb
grep -v "^#" infoOut.txt | awk '{printf "%s\t%s,%s,%s,%s,%s\n", $1,$2,$3,$8,$9,$10}' > anoGam3.ensGene.nameIndex.txt
ixIxx anoGam3.ensGene.nameIndex.txt anoGam3.ensGene.name.ix anoGam3.ensGene.name.ixx

    # procedure here is patterned after:
    #     /hive/data/genomes/vicPac1/bed/ensGene.91/doLoad.csh
hgLoadGenePred  -genePredExt anoGam3 ensGene process/anoGam3.allGenes.gp.gz \
        >& loadGenePred.errors.txt
checkTableCoords anoGam3 -table=ensGene
genePredCheck -db=anoGam3 ensGene
zcat process/ensemblGeneScaffolds.anoGam3.bed.gz | sort > to.clean.GeneScaffolds.bed
cut -f1 /hive/data/genomes/anoGam3/chrom.sizes | sort > legitimate.names
join -t'        ' legitimate.names to.clean.GeneScaffolds.bed \
    | sed -e "s/GeneScaffold/GS/" | hgLoadBed anoGam3 ensemblGeneScaffold stdin
checkTableCoords anoGam3 -table=ensemblGeneScaffold
zcat download/Anopheles_gambiae.AgamP4.pep.all.fa.gz \
        | sed -e 's/^>.* transcript:/>/; s/ CCDS.*$//; s/ .*$//' | gzip > ensPep.txt.gz
zcat ensPep.txt.gz \
    | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
         | sed -e '/^$/d; s/*$//' | sort > ensPep.anoGam3.fa.tab
hgPepPred anoGam3 tab ensPep ensPep.anoGam3.fa.tab
hgLoadSqlTab anoGam3 ensGtp ~/kent/src/hg/lib/ensGtp.sql process/ensGtp.tab
hgLoadSqlTab anoGam3 ensemblToGeneName process/ensemblToGeneName.sql process/ensemblToGeneName.tab
hgLoadSqlTab anoGam3 ensemblSource process/ensemblSource.sql process/ensemblSource.tab
# verify names in ensGene is a superset of names in ensPep
hgsql -N -e "select name from ensPep;" anoGam3 | sort > ensPep.name
hgsql -N -e "select name from ensGene;" anoGam3 | sort > ensGene.name
set geneCount = `cat ensGene.name | wc -l`
set pepCount = `cat ensPep.name | wc -l`
set commonCount = `comm -12 ensPep.name ensGene.name | wc -l`
set percentId = \
    `echo $commonCount $pepCount | awk '{printf "%d", 100.0*$1/$2}'`
echo "gene count: $geneCount, peptide count: $pepCount, common name count: $commonCount"
echo "percentId: $percentId"
if (! ($percentId > 95)) then
    echo "ERROR: percent coverage of peptides to genes: $percentId"
    echo "ERROR: should be greater than 95"
    exit 255
endif

exit $status

XXX need to fixup this entry:

hgsql -e 'INSERT INTO trackVersion \
    (db, name, who, version, updateTime, comment, source, dateReference) \
    VALUES("anoGam3", "ensGene", "hiram", "91", now(), \
        "with peptides Vicugna_pacos.anoGam3.pep.all.fa.gz", \
        "ftp://ftp.ensembl.org/pub/release-91/gtf/vicugna_pacos/Vicugna_pacos.anoGam3.91.gtf.gz", \
        "dec2017" );' hgFixed


    cd /hive/data/genomes/cavApe1
    printf "# required db variable
db cavApe1
# specific lifting to translate names:
liftUp /hive/data/genomes/cavApe1/jkStuff/ensToUcsc.lift
" > cavApe1.ensGene.ra

    time (doEnsGeneUpdate.pl -ensVersion=91 cavApe1.ensGene.ra) \
	> ensGene.91.log 2>&1
    # real    1m31.613s

    featureBits cavApe1 ensGene
    # 21842310 bases of 2716396567 (0.804%) in intersection

##############################################################################
# crispr10K track (DONE - 2018-01-06 - Hiram)
    mkdir /hive/data/genomes/anoGam3/bed/crispr.2018-01-06
    cd /hive/data/genomes/anoGam3/bed/crispr.2018-01-06

    # construct index for new genome, the index ends up in:
    #    /hive/data/outside/crisprTrack/crispor/genomes/anoGam3/

    export PATH=/hive/data/outside/crisprTrack/crispor/tools/usrLocalBin:$PATH
    export TMPDIR=/dev/shm

    time (/hive/data/outside/crisprTrack/crispor/tools/crisprAddGenome \
	ucscLocal anoGam3 --geneTable=ensGene --baseDir \
	    /hive/data/outside/crisprTrack/crispor/genomes) \
        > createIndex.log 2>&1
    # real    7m53.047s
    # result in /hive/data/outside/crisprTrack/crispor/genomes/anoGam3/
# total 1050624
# -rw-rw-r-- 1  68409555 Jan  6 22:21 anoGam3.2bit
# -rw-rw-r-- 1 264974392 Jan  6 22:26 anoGam3.fa.bwt
# -rw-rw-r-- 1  66243578 Jan  6 22:26 anoGam3.fa.pac
# -rw-rw-r-- 1    354910 Jan  6 22:26 anoGam3.fa.ann
# -rw-rw-r-- 1    135831 Jan  6 22:26 anoGam3.fa.amb
# -rw-rw-r-- 1 132487208 Jan  6 22:28 anoGam3.fa.sa
# -rw-rw-r-- 1    185978 Jan  6 22:28 anoGam3.sizes
# -rw-rw-r-- 1   5007758 Jan  6 22:28 anoGam3.segments.bed
# -rw-rw-r-- 1       371 Jan  6 22:28 genomeInfo.tab

    # running step wise to be careful and see if this works OK

    time (~/kent/src/hg/utils/automation/doCrispr.pl -stop=ranges \
	-buildDir=`pwd` anoGam3 ensGene) > ranges.log 2>&1
    # real    0m8.964s

    time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=guides \
	-stop=guides -buildDir=`pwd` anoGam3 ensGene) > guides.log 2>&1
    # real    0m57.600s
    cat guides/run.time | sed -e 's/^/# /;' | head -6
# Completed: 99 of 99 jobs
# CPU time in finished jobs:        742s      12.36m     0.21h    0.01d  0.000 y
# IO & Wait Time:                   228s       3.81m     0.06h    0.00d  0.000 y
# Average job time:                  10s       0.16m     0.00h    0.00d
# Longest finished job:              13s       0.22m     0.00h    0.00d
# Submission to last job:            25s       0.42m     0.01h    0.00d

    # this is the big cluster job
    time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScores \
	-stop=specScores -buildDir=`pwd` anoGam3 ensGene) > specScores.log 2>&1
    # real    1110m57.233s
# Completed: 186799 of 186799 jobs
# CPU time in finished jobs:   12910905s  215181.76m  3586.36h  149.43d  0.409 y
# IO & Wait Time:              47964247s  799404.11m 13323.40h  555.14d  1.521 y
# Average job time:                 326s       5.43m     0.09h    0.00d
# Longest finished job:             580s       9.67m     0.16h    0.01d
# Submission to last job:         63574s    1059.57m    17.66h    0.74d


    time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=effScores \
	-stop=offTargets -buildDir=`pwd` anoGam3 ensGene) \
	> effScores.offTargets.log 2>&1
    # real    251m21.154s
# effScores
# Completed: 1413 of 1413 jobs
# CPU time in finished jobs:     793726s   13228.77m   220.48h    9.19d  0.025 y
# IO & Wait Time:                 10053s     167.55m     2.79h    0.12d  0.000 y
# Average job time:                 569s       9.48m     0.16h    0.01d
# Longest finished job:           13440s     224.00m     3.73h    0.16d
# Submission to last job:         14019s     233.65m     3.89h    0.16d

# offTargets
# Completed: 9340 of 9340 jobs
# CPU time in finished jobs:      14042s     234.03m     3.90h    0.16d  0.000 y
# IO & Wait Time:                 37970s     632.84m    10.55h    0.44d  0.001 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest finished job:              10s       0.17m     0.00h    0.00d
# Submission to last job:            66s       1.10m     0.02h    0.00d

    time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=load \
	-stop=load -buildDir=`pwd` anoGam3 ensGene) \
	> load.log 2>&1
    # real    13m52.830s

    featureBits -countGaps anoGam3 crispr10KRanges
    # 147498676 bases of 264974304 (55.665%) in intersection

    # XXX this does not work: Mon Jan  8 13:29:18 PST 2018
    ~/kent/src/utils/doLocusName -o anoGam3 ensGene

sh: tabRepl: command not found
Usage: bedOverlapMerge [options] file - merge overlapping bed features, join their names

bedOverlapMerge: error: no such option: -o
Usage: bedBetween [options] inSortedBedFile outfile (stdout ok): given sorted feats., create features between them and annotated these with the neighboring bednames. Regions around chromosomes are limited to 50kbp.

bedBetween: error: no such option: -m


##############################################################################