# for emacs: -*- mode: sh; -*- # This file describes browser build for the tarSyr2 # Tarsius syrichta -- WashU Tarsius_syrichta-2.0.1 Tarsius syrichta ############################################################################# # fetch sequence from new style download directory (DONE - 2014-11-20 - Hiram) # NCBI has redesigned their FTP download site, new type of address # and naming scheme mkdir -p /hive/data/genomes/tarSyr2/genbank cd /hive/data/genomes/tarSyr2/genbank rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Tarsius_syrichta/all_assembly_versions/GCA_000164805.2_Tarsius_syrichta-2.0.1/ ./ # measure what we have here: faSize GCA_000164805.2_Tarsius_syrichta-2.0.1_genomic.fna.gz # 3453847770 bases (48109210 N's 3405738560 real 2042571886 # upper 1363166674 lower) in 337188 sequences in 1 files # Total size: mean 10243.1 sd 72186.0 min 202 (ABRT02344103.1) # max 4032379 (KE950501.1) median 773 # %39.47 masked total, %40.03 masked real # note that these totals do not include chrM since the GenBank ftp directory # did not include a non-nuclear directory ############################################################################# # fixup to UCSC naming scheme (DONE - 2014-11-20 - Hiram) mkdir /hive/data/genomes/tarSyr2/ucsc cd /hive/data/genomes/tarSyr2/ucsc # this one is unusually simple because there is just one file of # contigs, and one AGP file. Even the contig names are already # almost in UCSC format. Just need to substitute v where . is in # the accession suffix zcat ../genbank/GCA_000164805.2_Tarsius_syrichta-2.0.1_genomic.fna.gz \ | sed -e 's/ Tarsius syrichta.*//;' \ | sed -e 's/\./v/;' | gzip -c > tarSyr2.fa.gz zcat ../genbank/GCA_000164805.2_Tarsius_syrichta-2.0.1_assembly_structure/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | sed -e 's/\./v/;' > tarSyr2.agp # verify nothing lost compared to the genbank source: faSize tarSyr2.fa.gz # 3453847770 bases (48109210 N's 3405738560 real 2042571886 # upper 1363166674 lower) in 337188 sequences in 1 files # Total size: mean 10243.1 sd 72186.0 min 202 (ABRT02344103v1) # max 4032379 (KE950501v1) median 773 # %39.47 masked total, %40.03 masked real # same numbers as above. We don't need the masking, but that will be # overwritten when the 2bit file is constructed below ############################################################################# # Initial database build (DONE - 2014-11-20 - Hiram) cd /hive/data/genomes/tarSyr2 cat << '_EOF_' > tarSyr2.config.ra # Config parameters for makeGenomeDb.pl: db tarSyr2 clade mammal genomeCladePriority 35 scientificName Tarsius syrichta commonName Tarsier assemblyDate Sep. 2013 assemblyLabel Washington University assemblyShortLabel Tarsius_syrichta-2.0.1 orderKey 409 # chrM bioProject: 38399 -> NC_012774.1 mitoAcc NC_012774.1 fastaFiles /cluster/data/tarSyr2/ucsc/tarSyr2.fa.gz agpFiles /cluster/data/tarSyr2/ucsc/tarSyr2.agp # qualFiles /cluster/data/tarSyr2/ucsc/tarSyr2.qual.qac dbDbSpeciesDir tarsier photoCreditURL http://en.wikipedia.org/wiki/File:Tarsier_Hugs_Mossy_Branch.jpg photoCreditName Waldir, Wikipedia Commons ncbiGenomeId 766 ncbiAssemblyId 784738 ncbiAssemblyName Tarsius_syrichta-2.0.1 ncbiBioProject 20339 genBankAccessionID GCA_000164805.2 taxId 9478 '_EOF_' # << happy emacs # verify sequence and AGP are OK: time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp tarSyr2.config.ra > agp.log 2>&1 # real 2m22.582s # then finish it off: time nice -n +19 makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db tarSyr2.config.ra > db.log 2>&1 # there was a problem with the last step trackDb, fixed makeGenomeDb.pl # and completed: time nice -n +19 makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=trackDb tarSyr2.config.ra > trackDb.log 2>&1 ########################################################################## # running repeat masker (DONE - 2014-11-21 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/repeatMasker cd /hive/data/genomes/tarSyr2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku tarSyr2 > do.log 2>&1 # real 750m50.942s cat faSize.rmsk.txt # 3453864774 bases (48109210 N's 3405755564 real 1778461643 upper # 1627293921 lower) in 337189 sequences in 1 files # Total size: mean 10243.1 sd 72185.9 min 202 (ABRT02344103v1) # max 4032379 (KE950501v1) median 773 # %47.12 masked total, %47.78 masked real egrep -i "versi|relea" do.log # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; time featureBits -countGaps tarSyr2 rmsk # 1631014044 bases of 3453864774 (47.223%) in intersection # real 3m0.934s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE 2014-11-21 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/simpleRepeat cd /hive/data/genomes/tarSyr2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ tarSyr2 > do.log 2>&1 # real 75m31.505s cat fb.simpleRepeat # 85367852 bases of 3405755564 (2.507%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/tarSyr2 twoBitMask tarSyr2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed tarSyr2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa tarSyr2.2bit stdout | faSize stdin > faSize.tarSyr2.2bit.txt cat faSize.tarSyr2.2bit.txt # 3453864774 bases (48109210 N's 3405755564 real 1776164022 upper # 1629591542 lower) in 337189 sequences in 1 files # Total size: mean 10243.1 sd 72185.9 min 202 (ABRT02344103v1) # max 4032379 (KE950501v1) median 773 # %47.18 masked total, %47.85 masked real # 2670044500 bases (20740269 N's 2649304231 real 1359475670 upper # 1289828561 lower) in 3179 sequences in 1 files # Total size: mean 839900.8 sd 9113794.6 min 224 (chrUn_GJ060330v1) # max 158337067 (chr1) median 1356 # %48.31 masked total, %48.69 masked real rm /gbdb/tarSyr2/tarSyr2.2bit ln -s `pwd`/tarSyr2.2bit /gbdb/tarSyr2/tarSyr2.2bit ########################################################################## ## WINDOWMASKER (DONE - 2014-11-21 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/windowMasker cd /hive/data/genomes/tarSyr2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev tarSyr2 > do.log 2>&1 # real 801m33.192s # Masking statistics cat faSize.tarSyr2.cleanWMSdust.txt # 3453864774 bases (48109210 N's 3405755564 real 2023347923 upper # 1382407641 lower) in 337189 sequences in 1 files # Total size: mean 10243.1 sd 72185.9 min 202 (ABRT02344103v1) # max 4032379 (KE950501v1) median 773 # %40.02 masked total, %40.59 masked real cat fb.tarSyr2.rmsk.windowmaskerSdust.txt # 976090281 bases of 3453864774 (28.261%) in intersection ########################################################################## # cpgIslands - (DONE - 2014-11-25,27 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/cpgIslands cd /hive/data/genomes/tarSyr2/bed/cpgIslands time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku tarSyr2 > do.log 2>&1 # real 2250m21.599s cat fb.tarSyr2.cpgIslandExt.txt # 23569257 bases of 3405755564 (0.692%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2014-11-25,27 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/cpgIslandsUnmasked cd /hive/data/genomes/tarSyr2/bed/cpgIslandsUnmasked # run stepwise so the loading can be done in a different table time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/tarSyr2/tarSyr2.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku tarSyr2 > do.log 2>&1 # real 2229m13.108s cat fb.tarSyr2.cpgIslandExtUnmasked.txt # 35810524 bases of 3405755564 (1.051%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2014-11-25 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/cytoBand cd /hive/data/genomes/tarSyr2/bed/cytoBand makeCytoBandIdeo.csh tarSyr2 ######################################################################### # genscan - (DONE 2014-11-25,27 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/genscan cd /hive/data/genomes/tarSyr2/bed/genscan time doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku tarSyr2 > do.log 2>&1 # real 2786m14.622s cat fb.tarSyr2.genscan.txt # 63545533 bases of 3405755564 (1.866%) in intersection cat fb.tarSyr2.genscanSubopt.txt # 71743479 bases of 3405755564 (2.107%) in intersection ######################################################################## # Create kluster run files (DONE - 2014-12-03 - Hiram) # numerator is tarSyr2 gapless bases "real" as reported by: featureBits -noRandom -noHap tarSyr2 gap # 48109210 bases of 3405755564 (1.413%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 3405755564 / 2861349177 \) \* 1024 # ( 3405755564 / 2861349177 ) * 1024 = 1218.828420 # ==> use -repMatch=1200 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/tarSyr2 time blat tarSyr2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/tarSyr2.11.ooc \ -repMatch=1200 # Wrote 32323 overused 11-mers to jkStuff/tarSyr2.11.ooc # real 0m57.833s # there are no non-bridged gaps, don't need this # check non-bridged gaps to see what the typical size is: hgsql -N -e 'select * from gap where bridge="no" order by size;' \ tarSyr2 | ave -col=7 stdin # this does nothing # gapToLift -verbose=2 -minGap=100 bosTau7 jkStuff/nonBridged.lft \ # -bedFile=jkStuff/nonBridged.bed ######################################################################## # GENBANK AUTO UPDATE (DONE - 2014-12-11 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Tarsius syrichta 8 0 0 # edit etc/genbank.conf to add tarSyr2 just before tarSyr1 # tarSyr2 (Tarsier) tarSyr2.serverGenome = /hive/data/genomes/tarSyr2/tarSyr2.2bit tarSyr2.clusterGenome = /hive/data/genomes/tarSyr2/tarSyr2.2bit tarSyr2.ooc = /hive/data/genomes/tarSyr2/jkStuff/tarSyr2.11.ooc tarSyr2.lift = no tarSyr2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} tarSyr2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} tarSyr2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} tarSyr2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} tarSyr2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} tarSyr2.refseq.mrna.native.load = no tarSyr2.refseq.mrna.xeno.load = yes tarSyr2.genbank.mrna.xeno.load = no tarSyr2.genbank.est.native.load = no tarSyr2.downloadDir = tarSyr2 tarSyr2.perChromTables = no git commit -m "Added tarSyr2; refs #14410" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. Skipped screen # control this business with a screen since it takes a while cd /cluster/data/genbank time ./bin/gbAlignStep -initial tarSyr2 # logFile: var/build/logs/2014.12.03-15:37:55.tarSyr2.initalign.log # real 369m37.727s # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.tarSyr2 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad tarSyr2 # logFile: var/dbload/hgwdev/logs/2014.12.11-11:06:27.tarSyr2.dbload.log # real 35m42.214s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add tarSyr2 to: # vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added tarSyr2 - Tarsier refs #14410" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # ucscToINSDC table/track (DONE - 2014-12-11 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/ucscToINSDC cd /hive/data/genomes/tarSyr2/bed/ucscToINSDC # check for chrM in assembly: grep chrM ../../tarSyr2.agp # chrM 1 17004 12 F NC_012774.1 1 17004 + # use the accession name from there in this command (blank if none) ~/kent/src/hg/makeDb/doc/felCat8/ucscToINSDC.sh \ ../../genbank/GCA_*assembly_structure/Primary_Assembly NC_012774.1 awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \ > ucscToINSDC.bed # should all be the same line count: wc -l * # 337189 name.coordinate.tab # 337189 ucscToINSDC.bed # 337189 ucscToINSDC.txt cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 14 # use the 14 in this sed sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab tarSyr2 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords tarSyr2 # should cover %100 entirely: featureBits -countGaps tarSyr2 ucscToINSDC # 3453864774 bases of 3453864774 (100.000%) in intersection ######################################################################### # add chromAlias table (DONE - 2016-10-20 - Hiram) mkdir /hive/data/genomes/tarSyr2/bed/chromAlias cd /hive/data/genomes/tarSyr2/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' tarSyr2 \ > ucsc.refseq.tab printf "chrM\tNC_012774.1\n" >> ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' tarSyr2 \ | grep -v chrM > ucsc.genbank.tab printf "chrM\tAB371090.1\n" >> ucsc.genbank.tab join -t$'\t' ../idKeys/tarSyr2.idKeys.txt \ ../../ensembl/ensemblTarSyr2.idKeys.txt \ | cut -f2,3 | sort > ucsc.ensembl.tab # verify chrM is correct: grep chrM *.tab ucsc.ensembl.tab:chrM MT ucsc.genbank.tab:chrM AB371090.1 ucsc.refseq.tab:chrM NC_012774.1 ~/kent/src/hg/utils/automation/chromAlias.pl sort -o tarSyr2.chromAlias.tab tarSyr2.chromAlias.tab for t in refseq genbank ensembl do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t tarSyr2.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 337189 =? 337189 OK # checking genbank: 337189 =? 337189 OK # checking ensembl: 337189 =? 337189 OK hgLoadSqlTab tarSyr2 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ tarSyr2.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram) hgsql -N -e "select frag from gold;" tarSyr2 | sort | head -1 ABRT02000001.1 hgsql -N -e "select frag from gold;" tarSyr2 | sort | tail -2 ABRT02492902.1 NC_012774.1 # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" tarSyr2 | wc -l # 492903 hgsql -N -e "select frag from gold;" tarSyr2 \ | egrep -e '[AN][BC][R_][T0][0-9]+(\.1)?' | wc -l # 492903 hgsql -N -e "select frag from gold;" tarSyr2 \ | egrep -v -e '[AN][BC][R_][T0][0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/tarsier/tarSyr2/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][BC][R_][T0][0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ######################################################################### # LIFTOVER TO tarSyr1 (DONE - 2014-12-11 - Hiram) ssh hgwdev mkdir /hive/data/genomes/tarSyr2/bed/blat.tarSyr1.2014-12-11 cd /hive/data/genomes/tarSyr2/bed/blat.tarSyr1.2014-12-11 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/tarSyr2/jkStuff/tarSyr2.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ tarSyr2 tarSyr1) > do.log 2>&1 # real 13948m39.233s # this was a tough job, that long time was not necessarily due # to cluster contention. the blat run took over 9 days: # Completed: 361089 of 361089 jobs # CPU time in finished jobs: 229192746s 3819879.10m 63664.65h 2652.69d 7.268 y # IO & Wait Time: 11337726s 188962.10m 3149.37h 131.22d 0.360 y # Average job time: 666s 11.10m 0.19h 0.01d # Longest finished job: 1492s 24.87m 0.41h 0.02d # Submission to last job: 811729s 13528.82m 225.48h 9.40d # verify the convert link on the test browser is now active from tarSyr2 to # tarSyr1 ######################################################################### # all.joiner update, downloads and in pushQ - (DONE 2014-12-29 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=tarSyr2 -keys all.joiner # about 50 minutes joinerCheck -database=tarSyr2 -tableCoverage all.joiner joinerCheck -database=tarSyr2 -times all.joiner cd /hive/data/genomes/tarSyr2 time makeDownloads.pl tarSyr2 > downloads.log 2>&1 # real 22m17.544s # now ready for pushQ entry mkdir /hive/data/genomes/tarSyr2/pushQ cd /hive/data/genomes/tarSyr2/pushQ time makePushQSql.pl tarSyr2 > tarSyr2.pushQ.sql 2> stderr.out # real 5m47.086s # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/tarSyr2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/tarSyr2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/tarSyr2/bbi/qualityBw/quality.bw # WARNING: tarSyr2 does not have seq # WARNING: tarSyr2 does not have estOrientInfo # copy it to hgwbeta scp -p tarSyr2.pushQ.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/tarSyr2.pushQ.sql" # in that pushQ entry walk through each entry and see if the # sizes will set properly ######################################################################### # UCSC to RefSeq name correspondence (DONE - 2015-04-28 - braney) mkdir /hive/data/genomes/tarSyr2/bed/ucscToRefSeq cd /hive/data/genomes/tarSyr2/bed/ucscToRefSeq # columns 5 and 7 are the INSDC and RefSeq names grep -v "^#" ../../genbank/GCA_000164805.2_Tarsius_syrichta-2.0.1_assembly_report.txt | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' | sort > insdc.refSeq.tab hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' tarSyr2 | sort > insdc.ucsc.tab join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' | cut -f2- > ucsc.refSeq.tab export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab tarSyr2 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab checkTableCoords tarSyr2 -table=ucscToRefSeq