# for emacs: -*- mode: sh; -*- # This file describes browser build for the balAcu1 # Balaenoptera acutorostrata scammoni - Minke whale # DATE: 31-Oct-2013 # ORGANISM: Balaenoptera acutorostrata scammoni # TAXID: 310752 # ASSEMBLY LONG NAME: BalAcu1.0 # ASSEMBLY SHORT NAME: BalAcu1.0 # ASSEMBLY SUBMITTER: Korea Ocean Research & Development Institute # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000493695.1 # FTP-RELEASE DATE: 07-Nov-2013 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Balaenoptera_acutorostrata/BalAcu1.0/ # Mitochondrial sequence: NC_005271.1 ############################################################################# # fetch sequence from genbank (DONE - 2013-03-26 - Hiram) mkdir -p /hive/data/genomes/balAcu1/genbank cd /hive/data/genomes/balAcu1/genbank rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Balaenoptera_acutorostrata/BalAcu1.0/ ./ # measure sequence to be used here (there will be the chrMT also ...) faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2431671281 bases (145030652 N's 2286640629 real 2286640629 upper # 0 lower) in 10775 sequences in 1 files # Total size: mean 225677.1 sd 1878463.2 # min 200 (gi|555415911|gb|ATDI01158464.1|) # max 51448160 (gi|555757145|gb|KI537095.1|) median 394 ############################################################################# # fixup names for UCSC standards (DONE - 2014-03-26 - Hiram) cd /hive/data/genomes/balAcu1 $HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl balAcu1 # constructs ./ucsc/ directory here: # -rw-rw-r-- 1 20508313 Mar 26 18:04 balAcu1.ucsc.agp # -rw-rw-r-- 1 700587893 Mar 26 18:18 balAcu1.ucsc.fa.gz # -rw-rw-r-- 1 206 Mar 26 18:20 checkAgp.result.txt ############################################################################# # Initial database build (DONE - 2014-03-27 - Hiram) cd /hive/data/genomes/balAcu1 cat << '_EOF_' > balAcu1.config.ra # Config parameters for makeGenomeDb.pl: db balAcu1 clade mammal genomeCladePriority 35 scientificName Balaenoptera acutorostrata scammoni commonName Minke whale assemblyDate Oct. 2013 assemblyLabel Korea Ocean Research & Development Institute assemblyShortLabel BalAcu1.0 orderKey 2369 mitoAcc NC_005271.1 fastaFiles /hive/data/genomes/balAcu1/ucsc/balAcu1.ucsc.fa.gz agpFiles /hive/data/genomes/balAcu1/ucsc/balAcu1.ucsc.agp dbDbSpeciesDir balAcu photoCreditURL http://www.noaa.gov/ photoCreditName NOAA - National Oceanic and Atmospheric Administration ncbiGenomeId 10769 ncbiAssemblyId 78761 ncbiAssemblyName BalAcu1.0 ncbiBioProject 237330 genBankAccessionID GCA_000493695.1 taxId 310752 '_EOF_' # << happy emacs # stepwise, the first 'seq' step didn't work with the # scientificName above because the chrM sequence did not have # the scammoni on the name, use a config.ra for that step without # that part of the name, then continue with the full name: makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=seq balAcu1.config.ra > seq.log 2>&1 # verify sequence and AGP are OK: makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -continue=agp -stop=agp balAcu1.config.ra > agp.log 2>&1 # then finish it off: makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -continue=db balAcu1.config.ra > db.log 2>&1 # real 22m15.793s ########################################################################## # running repeat masker (DONE - 2014-03-27 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/repeatMasker cd /hive/data/genomes/balAcu1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku balAcu1 > do.log 2>&1 & # real 1125m12.023s cat faSize.rmsk.txt # 2431687698 bases (145030652 N's 2286657046 real 1334394119 upper # 952262927 lower) in 10776 sequences in 1 files # Total size: mean 225657.7 sd 1878377.1 min 200 (ATDI01158464) # max 51448160 (KI537095) median 394 # %39.16 masked total, %41.64 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.3 # June 20 2013 (open-4-0-3) version of RepeatMasker # CC RELEASE 20130422; featureBits -countGaps balAcu1 rmsk # 954201519 bases of 2431687698 (39.240%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2014-03-27 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/simpleRepeat cd /hive/data/genomes/balAcu1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ balAcu1 > do.log 2>&1 & # real 4m49.523s cat fb.simpleRepeat # 35896803 bases of 2286657046 (1.570%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/balAcu1 twoBitMask balAcu1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed balAcu1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa balAcu1.2bit stdout | faSize stdin > faSize.balAcu1.2bit.txt cat faSize.balAcu1.2bit.txt # 2431687698 bases (145030652 N's 2286657046 real 1333528377 upper # 953128669 lower) in 10776 sequences in 1 files # Total size: mean 225657.7 sd 1878377.1 min 200 (ATDI01158464) # max 51448160 (KI537095) median 394 # %39.20 masked total, %41.68 masked real rm /gbdb/balAcu1/balAcu1.2bit ln -s `pwd`/balAcu1.2bit /gbdb/balAcu1/balAcu1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2014-03-27 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/gap cd /hive/data/genomes/balAcu1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../balAcu1.unmasked.2bit > findMotif.txt 2>&1 # real 0m53.846s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits balAcu1 -not gap -bed=notGap.bed # 2403678276 bases of 2403678276 (100.000%) in intersection # real 0m12.025s time featureBits balAcu1 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2286657046 (0.000%) in intersection # real 12m56.037s # no new gaps, nothing to do, take a look at felCat5.txt for an example # of what to do here with the new gaps ########################################################################## ## WINDOWMASKER (DONE - 2014-03-27 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/windowMasker cd /hive/data/genomes/balAcu1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev balAcu1 > do.log 2>&1 & # real 205m27.180s # Masking statistics cat faSize.balAcu1.cleanWMSdust.txt # 2431687698 bases (145030652 N's 2286657046 real 1533887679 upper # 752769367 lower) in 10776 sequences in 1 files # Total size: mean 225657.7 sd 1878377.1 min 200 (ATDI01158464) # max 51448160 (KI537095) median 394 # %30.96 masked total, %32.92 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps balAcu1 rmsk windowmaskerSdust # 491805301 bases of 2431687698 (20.225%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2014-04-01 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/cytoBand cd /hive/data/genomes/balAcu1/bed/cytoBand makeCytoBandIdeo.csh balAcu1 ########################################################################## # cpgIslands - (DONE - 2014-04-01 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/cpgIslands cd /hive/data/genomes/balAcu1/bed/cpgIslands time doCpgIslands.pl balAcu1 > do.log 2>&1 & # real 5m28.908s cat fb.balAcu1.cpgIslandExt.txt # 26505589 bases of 2286657046 (1.159%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2014-04-01 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/cpgIslandsUnmasked cd /hive/data/genomes/balAcu1/bed/cpgIslandsUnmasked # run stepwise so the loading can be done in a different table time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -stop=makeBed \ -maskedSeq=/hive/data/genomes/balAcu1/balAcu1.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku balAcu1 > makeBed.log 2>&1 # real 4m40.925s # debug load step so it can be loaded into a separate table: time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -debug -continue=load \ -maskedSeq=/hive/data/genomes/balAcu1/balAcu1.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku balAcu1 # edit and change the table name to load: cpgIslandExtUnmasked time ./doLoadCpg.csh > load.log 2>&1 # Read 65448 elements of size 10 from cpgIsland.bed cat fb.balAcu1.cpgIslandExtUnmasked.txt $ 33531280 bases of 2286657046 (1.466%) in intersection ######################################################################### ######################################################################### # genscan - (DONE - 2014-04-01 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/genscan cd /hive/data/genomes/balAcu1/bed/genscan time doGenscan.pl balAcu1 > do.log 2>&1 & # real 24m47.395s cat fb.balAcu1.genscan.txt # 53624691 bases of 2286657046 (2.345%) in intersection cat fb.balAcu1.genscanSubopt.txt # 53864323 bases of 2286657046 (2.356%) in intersection ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2014-04-01 - Hiram) # Use -repMatch=750, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2286657046 / 2897316137 \) \* 1024 # ( 2286657046 / 2897316137 ) * 1024 = 808.174429 # round up to 850 cd /hive/data/genomes/balAcu1 blat balAcu1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/balAcu1.11.ooc -repMatch=850 # Wrote 26442 overused 11-mers to jkStuff/balAcu1.11.ooc # there are *only* bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" balAcu1 | sort | uniq -c # 173296 yes ######################################################################### # AUTO UPDATE GENBANK (DONE - 2014-04-02 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Balaenoptera acutorostrata 27 0 0 # Balaenoptera acutorostrata scammoni 0 0 1 # Balaenoptera borealis 5 0 0 # Balaenoptera brydei 5 0 0 # Balaenoptera edeni 1 0 0 # Balaenoptera omurai 3 0 0 # Balaenoptera physalus 12 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf # this appears that balAcu1 has effectively zero native est's ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add balAcu1 following vicPac1 # balAcu1 (minke whale) balAcu1.serverGenome = /hive/data/genomes/balAcu1/balAcu1.2bit balAcu1.clusterGenome = /hive/data/genomes/balAcu1/balAcu1.2bit balAcu1.ooc = /hive/data/genomes/balAcu1/jkStuff/balAcu1.11.ooc balAcu1.lift = no balAcu1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} balAcu1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} balAcu1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} balAcu1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} balAcu1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} balAcu1.refseq.mrna.native.load = no balAcu1.refseq.mrna.xeno.load = yes balAcu1.genbank.mrna.xeno.load = no balAcu1.genbank.est.native.load = no balAcu1.genbank.mrna.native.load = no balAcu1.genbank.mrna.native.loadDesc = no balAcu1.downloadDir = balAcu1 balAcu1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding balAcu1 Minke whale refs #12977" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for balAcuNames refs 12977" src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genbank time ./bin/gbAlignStep -initial balAcu1 & # var/build/logs/2014.04.02-09:17:41.balAcu1.initalign.log # real 87m5.013s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad balAcu1 & # logFile: var/dbload/hgwdev/logs/2014.04.02-11:43:24.balAcu1.dbload.log # real 5m20.938s # enable daily alignment and update of hgwdev (TBD - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add balAcu1 to: etc/align.dbs etc/hgwdev.dbs vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added balAcu1 to daily hgwdev build refs #12977" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # set default position on Myoglobin gene displays (DONE - 2014-04-02 - Hiram) hgsql -e \ 'update dbDb set defaultPos="KI537194:1947619-1969416" where name="balAcu1";' \ hgcentraltest ######################################################################### # create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram) mkdir /hive/data/genomes/balAcu1/bed/ucscToINSDC cd /hive/data/genomes/balAcu1/bed/ucscToINSDC # this script has been maturing over time, it is close to complete. # to find a latest copy of it: # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh cp -p /hive/data/genomes/papAnu2/bed/ucscToINSDC/translateNames.sh . ./translateNames.sh # it says: # need to find chrM accessions # so add this one: echo -e 'chrM\tNC_005271.1' >> ucscToINSDC.txt # needs to be sorted to work with join sort ucscToINSDC.txt > ucscToINSDC.tab awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \ > name.coordinate.tab join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 12 # use the 12 in this sed: sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab balAcu1 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords balAcu1 ucscToINSDC featureBits -countGaps balAcu1 ucscToINSDC # 2431687698 bases of 2431687698 (100.000%) in intersection ############################################################################## # construct download files (DONE - 2014-04-02 - Hiram) # after db name has been added to all.joiner and # joinerCheck -database=balAcu1 -keys all.joiner # is clean cd /hive/data/genomes/balAcu1 time makeDownloads.pl -workhorse=hgwdev -dbHost=hgwdev balAcu1 \ > downloads.log 2>&1 # real 23m14.645s ############################################################################## # pushQ entry (DONE - 2014-04-02 - Hiram) mkdir /hive/data/genomes/balAcu1/pushQ cd /hive/data/genomes/balAcu1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl balAcu1 2> stderr.txt | grep -v transMap > balAcu1.sql # real 3m55.511s hgwbeta hgsql -h mysqlbeta -N qapushq -e 'select rank from pushQ order by rank desc limit 1' scp -p balAcu1.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/balAcu1.sql' ########################################################################### # fixup search rule for assembly track/gold table (DONE - 2018-06-13 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/balAcu/balAcu1 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" balAcu1 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c | sed -e 's/^/#\t/;' # 184071 ATDI.1 # 1 NC_.1 # implies a rule: '[AN][TC][D_][I0-9]+(\.[0-9]+[0-9_]*)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" balAcu1 | wc -l # 184072 hgsql -N -e "select frag from gold;" balAcu1 \ | egrep -e '[AN][TC][D_][I0-9]+(\.[0-9]+[0-9_]*)?' | wc -l # 184072 hgsql -N -e "select frag from gold;" balAcu1 \ | egrep -v -e '[AN][TC][D_][I0-9]+(\.[0-9]+[0-9_]*)?' | wc -l # 0 # hence, add to trackDb/balAcu/balAcu1/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][TC][D_][I0-9]+(\.[0-9]+[0-9_]*)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ############################################################################# # lastz/chainNet swap from human/hg38 (DONE - 2018-06-13 - Hiram) # alignment to human: cd /hive/data/genomes/hg38/bed/lastzBalAcu1.2018-06-11 cat fb.hg38.chainBalAcu1Link.txt # 1571134497 bases of 3049335806 (51.524%) in intersection cat fb.hg38.chainSynBalAcu1Link.txt # 1509091721 bases of 3049335806 (49.489%) in intersection cat fb.hg38.chainRBest.BalAcu1.txt # 1443423443 bases of 3049335806 (47.336%) in intersection # and for the swap: mkdir /hive/data/genomes/balAcu1/bed/blastz.hg38.swap cd /hive/data/genomes/balAcu1/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzBalAcu1.2018-06-11/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 105m38.443s cat fb.balAcu1.chainHg38Link.txt # 1485701363 bases of 2286657046 (64.973%) in intersection cat fb.balAcu1.chainSynHg38Link.txt # 1444079345 bases of 2286657046 (63.152%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` balAcu1 hg38) > rbest.log 2>&1 & # real 422m43.817s cat fb.balAcu1.chainRBest.Hg38.txt # 1443200492 bases of 2286657046 (63.114%) in intersection #############################################################################