# for emacs: -*- mode: sh; -*- # This file describes how the browser for B. xylophilus WS245 version is built ############################################################################## # download sequence, create UCSC sequence (DONE - 2015-06-24 - Hiram) mkdir -p /hive/data/genomes/burXyl1/ws245 cd /hive/data/genomes/burXyl1/ws245 wget --no-parent --timestamping -m -nH --cut-dirs=6 \ ftp://ftp.sanger.ac.uk/pub/wormbase/releases/WS245/species/PRJEA64437 mkdir /hive/data/genomes/burXyl1/ucsc cd /hive/data/genomes/burXyl1/ucsc # WormBase contig names are of the pattern: # >scaffold01652 # >scaffold01653 # >contig00008 # >contig00013 # ... # no conversion of names to maintain equivalence with WormBase: zcat ../ws245/PRJEA64437/b_xylophilus.PRJEA64437.WS245.genomic.fa.gz \ | gzip -c > burXyl1.fa.gz hgFakeAgp burXyl1.fa.gz burXyl1.agp # photo from WikiMedia Commons from USDA wget --timestamping \ https://upload.wikimedia.org/wikipedia/commons/0/0f/Bursaphelenchus_xylophilus.jpg # http://www.forestryimages.org/browse/detail.cfm?imgnum=4387005 # L.D. Dwinell USDA Forest Service mv Bursaphelenchus_xylophilus.jpg usda.Bursaphelenchus_xylophilus.jpg convert -quality 80 -geometry 400x300 usda.Bursaphelenchus_xylophilus.jpg \ Bursaphelenchus_xylophilus.jpg identify Bursaphelenchus_xylophilus.jpg Bursaphelenchus_xylophilus.jpg JPEG 400x271 400x271 ############################################################################# # Initial database build (DONE - 2015-07-09 - Hiram) cd /hive/data/genomes/burXyl1 cat << '_EOF_' > burXyl1.config.ra # Config parameters for makeGenomeDb.pl: db burXyl1 clade worm genomeCladePriority 68 scientificName Bursaphelenchus xylophilus commonName Pine wood nematode assemblyDate Nov. 2011 assemblyLabel Wellcome Trust Sanger Institute B. xylophilus genome project assemblyShortLabel B. xylophilus Ka4C1 orderKey 16350 mitoAcc GQ332424.1 fastaFiles /hive/data/genomes/strRat2/ucsc/burXyl1.fa.gz agpFiles /hive/data/genomes/strRat2/ucsc/burXyl1.agp # qualFiles none dbDbSpeciesDir worm photoCreditURL https://www.wormbase.org/species/b_xylophilus photoCreditName photo TBD ncbiGenomeId 11822 ncbiAssemblyId 310678 ncbiAssemblyName ASM23113v1 ncbiBioProject 64437 genBankAccessionID GCA_000231135.1 taxId 6326 '_EOF_' # << happy emacs # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp burXyl1.config.ra) > agp.log 2>&1 # *** All done! (through the 'agp' step) # real 0m58.936s # then finish it off: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db burXyl1.config.ra) > db.log 2>&1 # real 1m1.272s # check in the trackDb files created and add to trackDb/makefile ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/cpgIslandsUnmasked cd /hive/data/genomes/burXyl1/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/burXyl1/burXyl1.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku burXyl1) > do.log 2>&1 # real 5m21.014s cat fb.burXyl1.cpgIslandExtUnmasked.txt # 3488944 bases of 73100506 (4.773%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/cytoBand cd /hive/data/genomes/burXyl1/bed/cytoBand makeCytoBandIdeo.csh burXyl1 ######################################################################### # ucscToINSDC table/track (TBD - 2015-03-20 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/ucscToINSDC cd /hive/data/genomes/burXyl1/bed/ucscToINSDC ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../genbank/GCA_*assembly_structure/Primary_Assembly awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \ > ucscToINSDC.bed # verify all names are coming through, should be same line count: wc -l * # 25187 name.coordinate.tab # 25187 ucscToINSDC.bed # 25187 ucscToINSDC.txt cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 14 # use the 14 in this sed sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab burXyl1 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords burXyl1 # should cover %100 entirely: featureBits -countGaps burXyl1 ucscToINSDC # 2053849526 bases of 2053849526 (100.000%) in intersection ######################################################################### # fixup search rule for assembly track/gold table (TBD - 2015-06-01 - Hiram) hgsql -N -e "select frag from gold;" burXyl1 | sort | head -3 JF414117.1 Ppa_Contig0_1 Ppa_Contig0_10 [JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)? hgsql -N -e "select frag from gold;" burXyl1 | sort | tail -2 Ppa_Contig9_98 Ppa_Contig9_99 # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" burXyl1 | wc -l # 28759 hgsql -N -e "select frag from gold;" burXyl1 \ | egrep -e '[JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)?' \ | wc -l # 28759 hgsql -N -e "select frag from gold;" burXyl1 \ | egrep -v -e '[JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)?' \ | wc -l # 0 # hence, add to trackDb/worm/burXyl1/trackDb.ra searchTable gold shortCircuit 1 termRegex [JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ########################################################################## # running repeat masker (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/repeatMasker cd /hive/data/genomes/burXyl1/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku burXyl1) > do.log 2>&1 # real 53m37.447s cat faSize.rmsk.txt # 74576239 bases (1475733 N's 73100506 real 72631263 upper 469243 lower) # in 5528 sequences in 1 files # Total size: mean 13490.6 sd 127730.2 min 444 (contig15090) # max 3612001 (scaffold00713) median 1253 # %0.63 masked total, %0.64 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; time featureBits -countGaps burXyl1 rmsk # 469243 bases of 74576239 (0.629%) in intersection # real 0m1.897s ########################################################################## # running simple repeat (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/simpleRepeat cd /hive/data/genomes/burXyl1/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ burXyl1) > do.log 2>&1 # real 8m14.637s cat fb.simpleRepeat # 1676598 bases of 73100506 (2.294%) in intersection # using the Window Masker result as indicated below ########################################################################## # CREATE MICROSAT TRACK (DONE - 2015-07-09 - Hiram) ssh hgwdev mkdir /cluster/data/burXyl1/bed/microsat cd /cluster/data/burXyl1/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed burXyl1 microsat microsat.bed # Read 17 elements of size 4 from microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/windowMasker cd /hive/data/genomes/burXyl1/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev burXyl1) > do.log 2>&1 # real 4m27.586s # Masking statistics cat faSize.burXyl1.cleanWMSdust.txt # 74576239 bases (1475733 N's 73100506 real 51958319 upper # 21142187 lower) in 5528 sequences in 1 files # Total size: mean 13490.6 sd 127730.2 min 444 (contig15090) # max 3612001 (scaffold00713) median 1253 # %28.35 masked total, %28.92 masked real cat fb.burXyl1.rmsk.windowmaskerSdust.txt # 350035 bases of 74576239 (0.469%) in intersection # using this Window Masker result for final masking:: cd /hive/data/genomes/burXyl1 # you can safely ignore the warning about fields >= 13 twoBitMask bed/windowMasker/burXyl1.cleanWMSdust.2bit \ -add bed/simpleRepeat/trfMask.bed burXyl1.2bit # measure the final masking: twoBitToFa burXyl1.2bit stdout | faSize stdin > faSize.burXyl1.2bit.txt cat faSize.burXyl1.2bit.txt # 74576239 bases (1475733 N's 73100506 real 51950819 upper # 21149687 lower) in 5528 sequences in 1 files # Total size: mean 13490.6 sd 127730.2 min 444 (contig15090) # max 3612001 (scaffold00713) median 1253 # %28.36 masked total, %28.93 masked real # and reset the symlink rm /gbdb/burXyl1/burXyl1.2bit ln -s /hive/data/genomes/burXyl1/burXyl1.2bit /gbdb/burXyl1/burXyl1.2bit ########################################################################## # cpgIslands - (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/cpgIslands cd /hive/data/genomes/burXyl1/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku burXyl1) > do.log 2>&1 & # real 5m4.279s cat fb.burXyl1.cpgIslandExt.txt # 2808031 bases of 73100506 (3.841%) in intersection ######################################################################### # augustus - (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/augustus cd /hive/data/genomes/burXyl1/bed/augustus # XXX this is not specifically correct, the species caenorhabditis # is not necessarily accurate here time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=caenorhabditis -dbHost=hgwdev \ -workhorse=hgwdev burXyl1) > do.log 2>&1 # real 50m13.577s cat fb.burXyl1.augustusGene.txt # 18007597 bases of 73100506 (24.634%) in intersection ######################################################################### # genscan - (DONE - 2015-07-09 - Hiram) mkdir /hive/data/genomes/burXyl1/bed/genscan cd /hive/data/genomes/burXyl1/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku burXyl1) > do.log 2>&1 # real 7m36.101s cat fb.burXyl1.genscan.txt # 9924749 bases of 73100506 (13.577%) in intersection cat fb.burXyl1.genscanSubopt.txt # 3654432 bases of 73100506 (4.999%) in intersection ######################################################################## # Create kluster run files (TBD - 2015-07-08 - Hiram) cd /hive/data/genomes/burXyl1 # numerator is burXyl1 gapless bases "real" as reported by: head -1 faSize.burXyl1.2bit.txt # 94076581 bases (7759220 N's 86317361 real 53341517 upper 32975844 lower) # in 9780 sequences in 1 files # numerator is 'real' base count # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 86317361 / 2861349177 \) \* 1024 # ( 86317361 / 2861349177 ) * 1024 = 30.890665 # ==> use -repMatch=100 since 30 or 50 masks too much cd /hive/data/genomes/burXyl1 time blat burXyl1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/burXyl1.11.ooc \ -repMatch=100 # Wrote 8527 overused 11-mers to jkStuff/burXyl1.11.ooc # real 0m1.846s # there are a few non-bridged gaps # check non-bridged gaps to see what the typical size is: hgsql -N -e 'select * from gap where bridge="no" order by size;' burXyl1 \ | ave -tableOut -col=7 stdin # min Q1 median Q3 max mean N sum stddev # 78831 79056 79362 79503 79503 79162.5 4 316650 320.991 # note the minimum non-bridged gap size is 78,831 gapToLift -verbose=2 -minGap=50000 burXyl1 jkStuff/burXyl1.nonBridged.lft \ -bedFile=jkStuff/burXyl1.nonBridged.bed # survey sizes: n50.pl chrom.sizes # reading: chrom.sizes # contig count: 9780, total size: 94076581, one half size: 47038290 # cumulative N50 count contig contig size # 46960459 61 Bmal_v3_scaffold61 194773 # 47038290 one half size # 47151548 62 Bmal_v3_scaffold62 191089 ############################################################################# # GENBANK AUTO UPDATE (TBD - 2015-06-09 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Pristionchus pacificus 97 37470 0 # edit etc/genbank.conf to add burXyl1 just before priPac1 # burXyl1 (P. pacificus) burXyl1.serverGenome = /hive/data/genomes/burXyl1/burXyl1.2bit burXyl1.clusterGenome = /hive/data/genomes/burXyl1/burXyl1.2bit burXyl1.ooc = /hive/data/genomes/burXyl1/jkStuff/burXyl1.11.ooc burXyl1.lift = /hive/data/genomes/burXyl1/jkStuff/burXyl1.nonBridged.lft burXyl1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} burXyl1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} burXyl1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} burXyl1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} burXyl1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} burXyl1.refseq.mrna.native.load = yes burXyl1.refseq.mrna.xeno.load = yes burXyl1.refseq.mrna.xeno.loadDesc = yes # DO NOT NEED genbank.mrna.xeno except for human, mouse burXyl1.genbank.mrna.xeno.load = no burXyl1.genbank.est.native.load = yes burXyl1.genbank.est.native.loadDesc = no burXyl1.downloadDir = burXyl1 burXyl1.perChromTables = no git commit -m "Added burXyl1 - P. pacificus refs #15209" etc/genbank.conf git push # update /cluster/data/genbank/etc/: make etc-update screen # control this business with a screen since it takes a while cd /cluster/data/genbank time ./bin/gbAlignStep -initial burXyl1 # logFile: var/build/logs/2015.07.02-11:39:01.burXyl1.initalign.log # real 91m20.362s # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.burXyl1 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad burXyl1 # logFile: var/dbload/hgwdev/logs/2015.07.06-09:51:31.burXyl1.dbload.log # real 18m34.687s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add burXyl1 to: # vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added burXyl1 - Pristionchus pacificus refs #15209" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # all.joiner update, downloads and in pushQ - (TBD - 2015-06-22 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=burXyl1 -tableCoverage all.joiner joinerCheck -database=burXyl1 -times all.joiner joinerCheck -database=burXyl1 -keys all.joiner cd /hive/data/genomes/burXyl1 time makeDownloads.pl burXyl1 > downloads.log 2>&1 # real 13m42.027s # now ready for pushQ entry mkdir /hive/data/genomes/burXyl1/pushQ cd /hive/data/genomes/burXyl1/pushQ makePushQSql.pl burXyl1 > burXyl1.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/burXyl1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/burXyl1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/burXyl1/bbi/qualityBw/quality.bw # WARNING: burXyl1 does not have seq # WARNING: burXyl1 does not have extFile # WARNING: burXyl1 does not have estOrientInfo # WARNING: burXyl1 does not have mrnaOrientInfo # copy it to hgwbeta scp -p burXyl1.pushQ.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/burXyl1.pushQ.sql" # in that pushQ entry walk through each entry and see if the # sizes will set properly #############################################################################