# for emacs: -*- mode: sh; -*- # This file describes browser build for the anoGam3 # # Can use existing photograph (otherwise find one before starting here) ######################################################################### # Initial steps, find photograph (DONE - 2017-12-19 - Hiram) # Photo obtained from CDC: https://phil.cdc.gov/Details.aspx?pid=444 # To start this initialBuild.txt document, from a previous assembly document: mkdir ~/kent/src/hg/makeDb/doc/anoGam3 cd ~/kent/src/hg/makeDb/doc/anoGam3 sed -e 's/danRer11/anoGam3/g; s/DanRer11/AnoGam3/g; s/DONE/TBD/g;' ../danRer11/initialBuild.txt > initialBuild.txt mkdir -p /hive/data/genomes/anoGam3/refseq cd /hive/data/genomes/anoGam3/refseq time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Anopheles_gambiae/all_assembly_versions/GCF_000005575.2_AgamP3/ ./ # sent 871 bytes received 334800319 bytes 13665354.69 bytes/sec # total size is 334755420 speedup is 1.00 # real 0m24.010s printf "photoCreditURL\thttps://phil.cdc.gov/Details.aspx?pid=444 photoCreditName\tphoto courtesy CDC/James D. Gathany\n" > photoReference.txt # From top of refseq/GCF_000005575.2_AgamP3_assembly_report.txt # Assembly name: AgamP3 # Organism name: Anopheles gambiae str. PEST (African malaria mosquito) # Infraspecific name: strain=PEST # Taxid: 180454 # BioSample: SAMN02952903 # BioProject: PRJNA1438 # Submitter: The International Consortium for the Sequencing of Anopheles Genome # Date: 2006-10-16 # Assembly type: haploid # Release type: major # Assembly level: Chromosome # Genome representation: full # WGS project: AAAB01 # RefSeq category: Representative Genome # GenBank assembly accession: GCA_000005575.1 # RefSeq assembly accession: GCF_000005575.2 # RefSeq assembly and GenBank assemblies identical: no # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000005585.1 GCF_000005585.2 Primary Assembly ## GCF_000183175.1 non-nuclear ############################################################################# # establish config.ra file (DONE - 2017-12-19 - Hiram) cd /hive/data/genomes/anoGam3 ~/kent/src/hg/utils/automation/prepConfig.pl anoGam3 invertebrate anopheles \ refseq/*_assembly_report.txt > anoGam3.config.ra # change the scientificName from Anopheles gambiae str. PEST # to simply: Anopheles gambiae # and reset the fastaFiles and agpFiles names to: # ucsc/chr*.fa.gz and ucsc/chr*.agp since there will be some extra files # in ucsc/* # verify it looks sane cat anoGam3.config.ra # config parameters for makeGenomeDb.pl: db anoGam3 clade insect genomeCladePriority 70 scientificName Anopheles gambiae commonName African malaria mosquito assemblyDate Oct. 2006 assemblyLabel The International Consortium for the Sequencing of Anopheles Genome assemblyShortLabel AgamP3 orderKey 1237 # mitochondrial sequence included in refseq release # mitoAcc NC_002084.1 mitoAcc none fastaFiles /hive/data/genomes/anoGam3/ucsc/chr*.fa.gz agpFiles /hive/data/genomes/anoGam3/ucsc/chr*.agp # qualFiles none dbDbSpeciesDir anopheles photoCreditURL https://phil.cdc.gov/Details.aspx?pid=444 photoCreditName photo courtesy CDC/James D. Gathany ncbiGenomeId 46 ncbiAssemblyId 305108 ncbiAssemblyName AgamP3 ncbiBioProject 1438 ncbiBioSample SAMN02952903 genBankAccessionID GCF_000005575.2 taxId 180454 ############################################################################# # setup UCSC named files (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/ucsc cd /hive/data/genomes/anoGam3/ucsc # measure what is in the refseq release: faSize ../refseq/*3_genomic.fna.gz # 265027044 bases (12572948 N's 252454096 real 197549571 upper 54904525 lower) in 8090 sequences in 1 files # Total size: mean 32759.8 sd 1187387.3 min 349 (NW_158375.1) max 61545105 (NT_078266.2) median 1523 # %20.72 masked total, %21.75 masked real # check for duplicate sequences: time faToTwoBit -noMask ../refseq/*3_genomic.fna.gz refseq.2bit # real 0m52.302s twoBitDup refseq.2bit # there are 49 duplicates, will need to eliminate them twoBitDup refseq.2bit > dup.list awk '{print $1}' dup.list | sort > eliminate.list # new option required to ucscCompositeAgp.pl 2016-04-13 time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ ../refseq/*3_genomic.fna.gz ../refseq/*_assembly_structure/Primary_Assembly # NC_004818.2 chrX # NT_078265.2 chr2L # NT_078266.2 chr2R # NT_078267.5 chr3L # NT_078268.4 chr3R # real 1m21.125s time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly # the chrUn sequence has the duplicates: faToTwoBit chrUn.fa.gz chrUn.2bit twoBitDup chrUn.2bit | awk '{print $1}' > duplicates.chrUn.list mv chrUn.fa.gz dups.chrUn.fa.gz faSomeRecords -exclude dups.chrUn.fa.gz \ duplicates.chrUn.list stdout | gzip -c > chrUn.fa.gz mv chrUn.agp dups.chrUn.agp egrep -v -f duplicates.chrUn.list dups.chrUn.agp > chrUn.agp rm -f chrUn.2bit # also need chrM: echo ">chrM" > chrM.fa zgrep -v "^>" ../refseq/GCF_000005575.2_AgamP3_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz >> chrM.fa gzip chrM.fa zgrep -v "^#" ../refseq/GCF_000005575.2_AgamP3_assembly_structure/non-nuclear/assembled_chromosomes/AGP/chrMT.comp.agp.gz \ | sed -e 's/NC_002084.1/chrM/;' > chrM.agp # verify sequence and AGP match: faToTwoBit chr*.fa.gz test.2bit cat chr*.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -2 # Valid Fasta file entry # All AGP and FASTA entries agree - both files are valid # and we have all the sequence as the original (minus the duplicates) twoBitToFa test.2bit stdout | faSize stdin # 264974304 bases (12572948 N's 252401356 real 252401356 upper 0 lower) # in 8041 sequences in 1 files # original refseq was: # 265027044 bases (12572948 N's 252454096 real 197549571 upper 54904525 lower) # in 8090 sequences in 1 files # 49 missing sequences is correct: calc 8090 - 8041 # 8090 - 8041 = 49.000000 # size of duplicate sequence: faSomeRecords dups.chrUn.fa.gz duplicates.chrUn.list stdout | faSize stdin # 52740 bases (0 N's 52740 real 52740 upper 0 lower) in 49 sequences in 1 files # amount of removed sequence is correct: calc 265027044 - 52740 # 265027044 - 52740 = 264974304.000000 # no longer need these rm refseq.2bit test.2bit ############################################################################# # Initial database build (DONE - 2017-12-19 - Hiram) cd /hive/data/genomes/anoGam3 # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp anoGam3.config.ra) > agp.log 2>&1 # verify OK: tail agp.log # *** All done! (through the 'agp' step) # real 0m26.234s # then finish it off: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db anoGam3.config.ra) > db.log 2>&1 # real 11m9.465s # had to fixup names in dbDb to get it into the same pull-down menu # on the gateway page as is anoGam1: hgsql -e 'update dbDb set organism="A. gambiae" where name like "anoGam3";' hgcentraltest hgsql -e 'update dbDb set genome="A. gambiae" where name like "anoGam3";' hgcentraltest # check in the trackDb files created in TemporaryTrackDbCheckout/ # and add anoGam3 to trackDb/makefile # temporary symlink until masked sequence is available cd /hive/data/genomes/anoGam3 ln -s `pwd`/anoGam3.unmasked.2bit /gbdb/anoGam3/anoGam3.2bit ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/cpgIslandsUnmasked cd /hive/data/genomes/anoGam3/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/anoGam3/anoGam3.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku anoGam3) > do.log 2>&1 # real 1m45.616s cat fb.anoGam3.cpgIslandExtUnmasked.txt # 21871673 bases of 264424304 (8.271%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/cytoBand cd /hive/data/genomes/anoGam3/bed/cytoBand makeCytoBandIdeo.csh anoGam3 ######################################################################### # ucscToRefSeq table/track (DONE - 2017-12-19 - Hiram) # the sequence here is working for a 'refseq' assembly with a chrM # situation may be specific depending upon what is available in the assembly mkdir /hive/data/genomes/anoGam3/bed/ucscToRefSeq cd /hive/data/genomes/anoGam3/bed/ucscToRefSeq # find accession for chrM grep chrM ../../anoGam3.agp # chrM 1 15363 1 O NC_002084.1 1 15363 + # use that accession here: ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../refseq/GCF_*structure/Primary_Assembly NC_002084.1 # this is actually refseq, not genbank, and need to remove duplicates: egrep -v -f ../../ucsc/duplicates.chrUn.list ucscToINSDC.txt \ > ucscToRefSeq.txt rm ucscToINSDC.txt awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join -t$'\t' name.coordinate.tab ucscToRefSeq.txt > ucscToRefSeq.bed # should all be the same number of items: wc -l * # 8041 name.coordinate.tab # 8041 ucscToRefSeq.bed # 8041 ucscToRefSeq.txt cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1 # 23 # use the 23 in this sed sed -e "s/21/23/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab anoGam3 ucscToRefSeq stdin ucscToRefSeq.bed checkTableCoords anoGam3 # should cover %100 entirely: featureBits -countGaps anoGam3 ucscToRefSeq # 264974304 bases of 264974304 (100.000%) in intersection ######################################################################### # ucscToINSDC table/track (DONE - 2017-12-19 - Hiram) # using the idKeys constructed mkdir /hive/data/genomes/anoGam3/bed/ucscToINSDC cd /hive/data/genomes/anoGam3/bed/ucscToINSDC twoBitDup ../idKeys/genbank/genbank.anoGam3.2bit \ | awk '{print $1}' > genbank.duplicate.list join -t$'\t' ../idKeys/anoGam3.idKeys.txt \ ../idKeys/genbank/refseqAnoGam3.idKeys.txt \ | egrep -v -f genbank.duplicate.list | cut -f2- \ | sort > ucscToINSDC.txt awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join -t$'\t' name.coordinate.tab ucscToINSDC.txt > ucscToINSDC.bed # lookup the chrM sequence an Entrez nuclotide NC_002084.1 -> L20934.1 # adding this chrM to the .bed file: grep chrM ../../*.sizes # chrM 15363 printf "chrM\t0\t15363\tL20934.1\n" >> ucscToINSDC.bed # should all be the same row counts wc -l * # 8041 name.coordinate.tab # 8041 ucscToINSDC.bed # 8040 ucscToINSDC.txt # the ucscToINSDC.txt is missing the chrM identifier cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 23 # use the 23 in this sed sed -e "s/21/23/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab anoGam3 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords anoGam3 # should cover %100 entirely: featureBits -countGaps anoGam3 ucscToINSDC # 264974304 bases of 264974304 (100.000%) in intersection ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2017-12-19 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/anopheles/anoGam3 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" anoGam3 \ | sed -e 's/[0-9][0-9]*//; s/\.[0-9]\+$//' | sort | uniq -c # 8115 AAAB # 1 NC_ # implies a rule: '[AN][ABC_]+[0-9]+(\.[0-9]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" anoGam3 | wc -l # 8116 hgsql -N -e "select frag from gold;" anoGam3 \ | egrep -e '[AN][ABC_]+[0-9]+(\.[0-9]+)?' | wc -l # 8116 hgsql -N -e "select frag from gold;" anoGam3 \ | egrep -v -e '[AN][ABC_]+[0-9]+(\.[0-9]+)?' | wc -l # 0 # hence, add to trackDb/zebrafish/anoGam3/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][ABC_]+[0-9]+(\.[0-9]+)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ########################################################################## # running repeat masker (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/repeatMasker cd /hive/data/genomes/anoGam3/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku anoGam3) > do.log 2>&1 # real 60m3.969s cat faSize.rmsk.txt # 264974304 bases (12572948 N's 252401356 real 216585988 upper 35815368 lower) in 8041 sequences in 1 files # Total size: mean 32952.9 sd 1190997.5 min 349 (chrUn_NW_158375v1) max 61545105 (chr2R) median 1529 # %13.52 masked total, %14.19 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker # January 31 2015 (open-4-0-5) version of RepeatMasker # grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl # CC RELEASE 20140131; time featureBits -countGaps anoGam3 rmsk # 35874857 bases of 264974304 (13.539%) in intersection # real 0m5.461s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result for high contig count assemblies: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' anoGam3 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 35874857.000000 # real 0m1.798s ########################################################################## # running simple repeat (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/simpleRepeat cd /hive/data/genomes/anoGam3/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409 3 anoGam3) > do.log 2>&1 # real 5m39.782s cat fb.simpleRepeat # 6341048 bases of 264424304 (2.398%) in intersection cd /hive/data/genomes/anoGam3 # using the Window Masker result: #twoBitMask bed/windowMasker/anoGam3.cleanWMSdust.2bit \ # -add bed/simpleRepeat/trfMask.bed anoGam3.2bit # you can safely ignore the warning about fields >= 13 # add to rmsk after it is done: twoBitMask anoGam3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed anoGam3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa anoGam3.2bit stdout | faSize stdin > faSize.anoGam3.2bit.txt cat faSize.anoGam3.2bit.txt # 264974304 bases (12572948 N's 252401356 real 216346487 upper 36054869 lower) in 8041 sequences in 1 files # Total size: mean 32952.9 sd 1190997.5 min 349 (chrUn_NW_158375v1) max 61545105 (chr2R) median 1529 # %13.61 masked total, %14.28 masked real rm /gbdb/anoGam3/anoGam3.2bit ln -s `pwd`/anoGam3.2bit /gbdb/anoGam3/anoGam3.2bit ######################################################################### # CREATE MICROSAT TRACK (DONE - 2017-12-19 - Hiram) ssh hgwdev mkdir /cluster/data/anoGam3/bed/microsat cd /cluster/data/anoGam3/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed anoGam3 microsat microsat.bed # Read 3959 elements of size 4 from microsat.bed featureBits -countGaps anoGam3 microsat # 226144 bases of 264974304 (0.085%) in intersection ########################################################################## ## WINDOWMASKER (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/windowMasker cd /hive/data/genomes/anoGam3/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev anoGam3) > do.log 2>&1 # real 12m20.317s # Masking statistics cat faSize.anoGam3.cleanWMSdust.txt # 264974304 bases (12572948 N's 252401356 real 196003243 upper 56398113 lower) in 8041 sequences in 1 files # Total size: mean 32952.9 sd 1190997.5 min 349 (chrUn_NW_158375v1) max 61545105 (chr2R) median 1529 # %21.28 masked total, %22.34 masked real cat fb.anoGam3.rmsk.windowmaskerSdust.txt # 19279994 bases of 264974304 (7.276%) in intersection ########################################################################## # run up idKeys files for chromAlias (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/idKeys cd /hive/data/genomes/anoGam3/bed/idKeys time (doIdKeys.pl -buildDir=`pwd` -twoBit=/hive/data/genomes/anoGam3/anoGam3.unmasked.2bit anoGam3) > do.log 2>&1 & # real 5m6.175s cat anoGam3.keySignature.txt # ae5975a59b0fd10409ed20280a0188df # genbank and flybase idKeys were also run up here in ./genbank/ # and ./flybase/ ########################################################################## # cpgIslands - (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/cpgIslands cd /hive/data/genomes/anoGam3/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku anoGam3) > do.log 2>&1 # real 1m15.884s cat fb.anoGam3.cpgIslandExt.txt # 19703582 bases of 264424304 (7.452%) in intersection ############################################################################## # genscan - (DONE - 2017-12-19 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/genscan cd /hive/data/genomes/anoGam3/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku anoGam3) > do.log 2>&1 # real 34m10.304s cat fb.anoGam3.genscan.txt # 35064324 bases of 264424304 (13.261%) in intersection cat fb.anoGam3.genscanSubopt.txt # 2488274 bases of 264424304 (0.941%) in intersection ############################################################################# # Create kluster run files (DONE - 2017-12-20 - Hiram) # numerator is anoGam3 gapless bases "real" as reported by: featureBits -noRandom -noHap anoGam3 gap # 550000 bases of 229932020 (0.239%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 229932020 / 2861349177 \) \* 1024 # ( 229932020 / 2861349177 ) * 1024 = 82.286493 # ==> use -repMatch=100 according to size scaled down from 1024 for human. # and rounded up to nearest 100 cd /hive/data/genomes/anoGam3 blat anoGam3.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/anoGam3.11.ooc \ -repMatch=100 # Wrote 10653 overused 11-mers to jkStuff/anoGam3.11.ooc # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' anoGam3 \ | sort -k7,7nr | ave -col=7 stdin # all these non-bridged gaps are 10,000 gapToLift -verbose=2 -minGap=10000 anoGam3 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed ######################################################################## # GENBANK AUTO UPDATE (DONE - 2017-12-20 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # organism mrnaCnt estCnt refSeqCnt # Anopheles gambiae 64151 153332 0 # edit etc/genbank.conf to add anoGam3 just before rheMac2 # anoGam3 (A. gambiae) anoGam3.serverGenome = /hive/data/genomes/anoGam3/anoGam3.2bit anoGam3.clusterGenome = /hive/data/genomes/anoGam3/anoGam3.2bit anoGam3.ooc = /hive/data/genomes/anoGam3/jkStuff/anoGam3.11.ooc anoGam3.lift = /hive/data/genomes/anoGam3/jkStuff/nonBridged.lft anoGam3.perChromTables = no anoGam3.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} anoGam3.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} anoGam3.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} anoGam3.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} anoGam3.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} # DO NOT NEED genbank.mrna.xeno except for human, mouse # anoGam1 had genbank.mrna.xeno.load as yes # defaults yes: genbank.mrna.native.load, genbank.mrna.native.loadDesc, # genbank.est.native.load, refseq.mrna.native.load, refseq.mrna.native.loadDesc, # refseq.mrna.xeno.load , refseq.mrna.xeno.loadDesc anoGam3.genbank.mrna.xeno.load = yes anoGam3.downloadDir = anoGam3 git commit -m "Added anoGam3; no redmine" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update screen # control this business with a screen since it takes a while cd /cluster/data/genbank time ./bin/gbAlignStep -initial anoGam3 # logFile: var/build/logs/2017.12.20-08:12:09.anoGam3.initalign.log tail var/build/logs/2017.12.20-08:12:09.anoGam3.initalign.log # hgwdev 2017.12.20-18:35:09 anoGam3.initalign: Succeeded: anoGam3 # hgwdev 2017.12.20-18:40:13 anoGam3.initalign: finish # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.anoGam3 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad anoGam3 # logFile: var/dbload/hgwdev/logs/2017.12.20-19:54:42.anoGam3.dbload.log # real 71m51.995s tail -1 var/dbload/hgwdev/logs/2017.12.20-19:54:42.anoGam3.dbload.log # hgwdev 2017.12.20-21:06:34 anoGam3.dbload: finish # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add anoGam3 to: # etc/align.dbs etc/hgwdev.dbs git add etc/align.dbs etc/hgwdev.dbs git commit -m "Added anoGam3 - malaria mosquito refs #20743" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################# # augustus gene track (DONE - 2017-12-20 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/augustus cd /hive/data/genomes/anoGam3/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=zebrafish -dbHost=hgwdev \ -workhorse=hgwdev anoGam3) > do.log 2>&1 # real 65m0.735s cat fb.anoGam3.augustusGene.txt # 20832482 bases of 264424304 (7.878%) in intersection ######################################################################### # add chromAlias table (DONE - 2017-12-20 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/chromAlias cd /hive/data/genomes/anoGam3/bed/chromAlias # working up new procedure here to allow in the duplicates from the # others to the single names at UCSC # the corresponding lists with duplicates are made from the idKeys # run up for the various assemblies: join -t$'\t' ../idKeys/anoGam3.idKeys.txt \ ../../../anoGam2/flybase/both.idKeys.txt | cut -f2- \ | sort > ucsc.ensembl.tab (printf "chrM\tL20934.1\n"; join -t$'\t' ../idKeys/anoGam3.idKeys.txt \ ../idKeys/genbank/refseqAnoGam3.idKeys.txt | cut -f2-) \ | sort > ucsc.genbank.tab join -t$'\t' ../idKeys/anoGam3.idKeys.txt \ ../idKeys/refseq/refseqAnoGam3.idKeys.txt | cut -f2- \ | sort > ucsc.refseq.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.genbank.tab \ ucsc.refseq.tab ucsc.ensembl.tab \ > anoGam3.chromAlias.tab # verify chrM is OK: grep chrM anoGam3.chromAlias.tab # L20934.1 chrM genbank # Mt chrM ensembl # NC_002084.1 chrM refseq for t in refseq genbank ensembl do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t anoGam3.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 8090 =? 8090 OK # checking genbank: 8090 =? 8090 OK # checking ensembl: 8090 =? 8090 OK hgLoadSqlTab anoGam3 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ anoGam3.chromAlias.tab ############################################################################## # lastz/chain/net swap D. melanogaster/dm6 (DONE - 2017-12-20 - Hiram) # alignment to dm6 cd /hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20 cat fb.dm6.chainAnoGam3Link.txt # 19641460 bases of 142573024 (13.776%) in intersection cat fb.dm6.chainRBestAnoGam3Link.txt # 15964959 bases of 142573024 (11.198%) in intersection # running the swap mkdir /hive/data/genomes/anoGam3/bed/blastz.dm6.swap cd /hive/data/genomes/anoGam3/bed/blastz.dm6.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > swap.log 2>&1 # real 2m0.901s cat fb.anoGam3.chainDm6Link.txt # 20168390 bases of 264424304 (7.627%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev \ -buildDir=`pwd` anoGam3 dm6) > rBest.log 2>&1 & # real 46m7.441s cat fb.anoGam3.chainRBestDm6Link.txt # 16040573 bases of 264424304 (6.066%) in intersection ########################################################################### # LIFTOVER TO anoGam1 (DONE - 2017-12-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/anoGam3/bed/blat.anoGam1.2017-12-20 cd /hive/data/genomes/anoGam3/bed/blat.anoGam1.2017-12-20 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/anoGam3/jkStuff/anoGam3.11.ooc \ anoGam3 anoGam1) > doLiftOverToAnoGam1.log 2>&1 # real 20m45.302s # see if the liftOver menus function in the browser from anoGam3 to anoGam1 ######################################################################### # BLATSERVERS ENTRY (DONE - 2017-12-20 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev # Starting trans gfServer for anoGam3 on host blat1d, port 17890 # Starting untrans gfServer for anoGam3 on host blat1d, port 17891 hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("anoGam3", "blat1d", "17890", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("anoGam3", "blat1d", "17891", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## reset default position to same as anoGam1 ## (DONE - 2017-12-21 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr2R:6479796-6482329" where name="anoGam3";' hgcentraltest ############################################################################ # all.joiner update, downloads and in pushQ - (DONE - 2017-12-20 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=anoGam3 -tableCoverage all.joiner joinerCheck -database=anoGam3 -times all.joiner joinerCheck -database=anoGam3 -keys all.joiner cd /hive/data/genomes/anoGam3 time (makeDownloads.pl anoGam3) > downloads.log 2>&1 # real 2m14.288s # now ready for pushQ entry mkdir /hive/data/genomes/anoGam3/pushQ cd /hive/data/genomes/anoGam3/pushQ time (makePushQSql.pl -redmineList anoGam3) > anoGam3.pushQ.sql 2> stderr.out # real 3m51.650s # the chainRBest and chainSyn tables do not go out, they are # not on the redmine tables listing. The reciprocalBest download # files are released and are on the redmine files listing # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # chainRBestDm6 # chainRBestDm6Link # chainSynDm6 # chainSynDm6Link # netRBestDm6 # netSynDm6 # verify the file listings are valid, should be no output to stderr: cat redmine.anoGam3.file.list \ | while read L; do ls -ogL $L; done > /dev/null # to verify the database.table list is correct, should be the same # line count for these two commands: wc -l redmine.anoGam3.table.list # 62 redmine.anoGam3.table.list awk -F'.' '{ printf "hgsql -N -e \"show table status like '"'"'%s'"'"';\" %s\n", $2, $1 }' redmine.anoGam3.table.list | while read L; do eval $L; done | wc -l # 62 # Put a path to the files produced here in the redmine ticket ############################################################################# # Ensembl genes (DONE - 2018-01-06 - Hiram) # after chromAlias work is done: cd /hive/data/genomes/anoGam3/jkStuff join -t$'\t' <(sort -k1,1 ../chrom.sizes) \ <(sort ../bed/chromAlias/ucsc.ensembl.tab) \ | awk '{printf "0\t%s\t%d\t%s\t%d\n", $3,$2,$1,$2}' > ensToUcsc.lift mkdir /hive/data/genomes/anoGam3/bed/ensGene.37/download cd /hive/data/genomes/anoGam3/bed/ensGene.37/download wget --tries=2 --user=anonymous --password=ucscGenomeBrowser@ucsc.edu \ ftp://ftp.ensemblgenomes.org/pub/metazoa/release-37/gtf/anopheles_gambiae/Anopheles_gambiae.AgamP4.37.gtf.gz \ -O Anopheles_gambiae.AgamP4.37.gtf.gz wget --tries=2 --user=anonymous --password=ucscGenomeBrowser@ucsc.edu \ ftp://ftp.ensemblgenomes.org/pub/metazoa/release-37/fasta/anopheles_gambiae/pep/Anopheles_gambiae.AgamP4.pep.all.fa.gz \ -O Anopheles_gambiae.AgamP4.pep.all.fa.gz mkdir /hive/data/genomes/anoGam3/bed/ensGene.37/process cd /hive/data/genomes/anoGam3/bed/ensGene.37/process printf '#!/usr/bin/env perl use strict; use warnings; open (FH, ") { chomp $line; my ($start, $contig, $size, $flyName, $flySize) = split('"'"'\\t'"'"', $line); printf "%%s\\t%%d\\t%%d\\t%%s\\t0\\t+\\n", $flyName, $start, $start+$size, $contig; } close (FH); ' > liftToLiftAcross.pl chmod +x liftToLiftAcross.pl ln -s ../../flybase/ucscToFlyBase.lift . # procedure here is patterned after: # /hive/data/genomes/vicPac1/bed/ensGene.91/process/doProcess.csh # with the names changed and added special liftAcross/liftUp zcat ../download/Anopheles_gambiae.AgamP4.37.gtf.gz \ | gzip > allGenes.gtf.gz gtfToGenePred -includeVersion -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \ | gzip > anoGam3.allGenes.gp.gz /cluster/bin/scripts/extractGtf.pl infoOut.txt > ensGtp.tab /cluster/bin/scripts/ensemblInfo.pl infoOut.txt > ensemblToGeneName.tab /cluster/bin/scripts/extractSource.pl allGenes.gtf.gz | sort -u > ensemblSource.tab set NL = `awk 'BEGIN{max=0}{if (length($1) > max) max=length($1)}END{print max}' ensemblToGeneName.tab` set VL = `awk 'BEGIN{max=0}{if (length($2) > max) max=length($2)}END{print max}' ensemblToGeneName.tab` sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \ /cluster/home/hiram/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql sed -e "s/name(15)/name($NL)/" \ /cluster/home/hiram/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql mv anoGam3.allGenes.gp.gz anoGam3.allGenes.beforeLiftAcross.gp.gz ./liftToLiftAcross.pl | gzip -c > anoGam3.ensGene.liftAcross.gz liftAcross -warn -bedOut=ensemblGeneScaffolds.anoGam3.bed \ anoGam3.ensGene.liftAcross.gz \ anoGam3.allGenes.beforeLiftAcross.gp.gz \ anoGam3.allGenes.beforeLiftUp.gp >& liftAcross.err.out # one gene has bad cds business # Error: anoGam3.allGenes.gp:15373: AGAP012697-RA cdsEnd 3384 not in tx bounds 1961-2981 # Error: anoGam3.allGenes.gp:15373: AGAP012697-RA cdsEnd 3384 not in an exon # checked: 15633 failed: 2 liftUp -extGenePred -type=.gp stdout \ /hive/data/genomes/anoGam3/jkStuff/ensToUcsc.lift carry \ anoGam3.allGenes.beforeLiftUp.gp | grep -v AGAP012697-RA \ > anoGam3.allGenes.gp gzip ensemblGeneScaffolds.anoGam3.bed anoGam3.allGenes.gp liftAcross.err.out \ anoGam3.allGenes.beforeLiftUp.gp genePredCheck -db=anoGam3 anoGam3.allGenes.gp.gz # construct bigBed and index for assembly hub mkdir -p bbi genePredToBed anoGam3.allGenes.gp.gz stdout | sort -k1,1 -k2,2n > anoGam3.ensGene.bed bedToBigBed -extraIndex=name anoGam3.ensGene.bed ../../../chrom.sizes bbi/anoGam3.ensGene.bb grep -v "^#" infoOut.txt | awk '{printf "%s\t%s,%s,%s,%s,%s\n", $1,$2,$3,$8,$9,$10}' > anoGam3.ensGene.nameIndex.txt ixIxx anoGam3.ensGene.nameIndex.txt anoGam3.ensGene.name.ix anoGam3.ensGene.name.ixx # procedure here is patterned after: # /hive/data/genomes/vicPac1/bed/ensGene.91/doLoad.csh hgLoadGenePred -genePredExt anoGam3 ensGene process/anoGam3.allGenes.gp.gz \ >& loadGenePred.errors.txt checkTableCoords anoGam3 -table=ensGene genePredCheck -db=anoGam3 ensGene zcat process/ensemblGeneScaffolds.anoGam3.bed.gz | sort > to.clean.GeneScaffolds.bed cut -f1 /hive/data/genomes/anoGam3/chrom.sizes | sort > legitimate.names join -t' ' legitimate.names to.clean.GeneScaffolds.bed \ | sed -e "s/GeneScaffold/GS/" | hgLoadBed anoGam3 ensemblGeneScaffold stdin checkTableCoords anoGam3 -table=ensemblGeneScaffold zcat download/Anopheles_gambiae.AgamP4.pep.all.fa.gz \ | sed -e 's/^>.* transcript:/>/; s/ CCDS.*$//; s/ .*$//' | gzip > ensPep.txt.gz zcat ensPep.txt.gz \ | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \ | sed -e '/^$/d; s/*$//' | sort > ensPep.anoGam3.fa.tab hgPepPred anoGam3 tab ensPep ensPep.anoGam3.fa.tab hgLoadSqlTab anoGam3 ensGtp ~/kent/src/hg/lib/ensGtp.sql process/ensGtp.tab hgLoadSqlTab anoGam3 ensemblToGeneName process/ensemblToGeneName.sql process/ensemblToGeneName.tab hgLoadSqlTab anoGam3 ensemblSource process/ensemblSource.sql process/ensemblSource.tab # verify names in ensGene is a superset of names in ensPep hgsql -N -e "select name from ensPep;" anoGam3 | sort > ensPep.name hgsql -N -e "select name from ensGene;" anoGam3 | sort > ensGene.name set geneCount = `cat ensGene.name | wc -l` set pepCount = `cat ensPep.name | wc -l` set commonCount = `comm -12 ensPep.name ensGene.name | wc -l` set percentId = \ `echo $commonCount $pepCount | awk '{printf "%d", 100.0*$1/$2}'` echo "gene count: $geneCount, peptide count: $pepCount, common name count: $commonCount" echo "percentId: $percentId" if (! ($percentId > 95)) then echo "ERROR: percent coverage of peptides to genes: $percentId" echo "ERROR: should be greater than 95" exit 255 endif exit $status XXX need to fixup this entry: hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("anoGam3", "ensGene", "hiram", "91", now(), \ "with peptides Vicugna_pacos.anoGam3.pep.all.fa.gz", \ "ftp://ftp.ensembl.org/pub/release-91/gtf/vicugna_pacos/Vicugna_pacos.anoGam3.91.gtf.gz", \ "dec2017" );' hgFixed cd /hive/data/genomes/cavApe1 printf "# required db variable db cavApe1 # specific lifting to translate names: liftUp /hive/data/genomes/cavApe1/jkStuff/ensToUcsc.lift " > cavApe1.ensGene.ra time (doEnsGeneUpdate.pl -ensVersion=91 cavApe1.ensGene.ra) \ > ensGene.91.log 2>&1 # real 1m31.613s featureBits cavApe1 ensGene # 21842310 bases of 2716396567 (0.804%) in intersection ############################################################################## # crispr10K track (DONE - 2018-01-06 - Hiram) mkdir /hive/data/genomes/anoGam3/bed/crispr.2018-01-06 cd /hive/data/genomes/anoGam3/bed/crispr.2018-01-06 # construct index for new genome, the index ends up in: # /hive/data/outside/crisprTrack/crispor/genomes/anoGam3/ export PATH=/hive/data/outside/crisprTrack/crispor/tools/usrLocalBin:$PATH export TMPDIR=/dev/shm time (/hive/data/outside/crisprTrack/crispor/tools/crisprAddGenome \ ucscLocal anoGam3 --geneTable=ensGene --baseDir \ /hive/data/outside/crisprTrack/crispor/genomes) \ > createIndex.log 2>&1 # real 7m53.047s # result in /hive/data/outside/crisprTrack/crispor/genomes/anoGam3/ # total 1050624 # -rw-rw-r-- 1 68409555 Jan 6 22:21 anoGam3.2bit # -rw-rw-r-- 1 264974392 Jan 6 22:26 anoGam3.fa.bwt # -rw-rw-r-- 1 66243578 Jan 6 22:26 anoGam3.fa.pac # -rw-rw-r-- 1 354910 Jan 6 22:26 anoGam3.fa.ann # -rw-rw-r-- 1 135831 Jan 6 22:26 anoGam3.fa.amb # -rw-rw-r-- 1 132487208 Jan 6 22:28 anoGam3.fa.sa # -rw-rw-r-- 1 185978 Jan 6 22:28 anoGam3.sizes # -rw-rw-r-- 1 5007758 Jan 6 22:28 anoGam3.segments.bed # -rw-rw-r-- 1 371 Jan 6 22:28 genomeInfo.tab # running step wise to be careful and see if this works OK time (~/kent/src/hg/utils/automation/doCrispr.pl -stop=ranges \ -buildDir=`pwd` anoGam3 ensGene) > ranges.log 2>&1 # real 0m8.964s time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=guides \ -stop=guides -buildDir=`pwd` anoGam3 ensGene) > guides.log 2>&1 # real 0m57.600s cat guides/run.time | sed -e 's/^/# /;' | head -6 # Completed: 99 of 99 jobs # CPU time in finished jobs: 742s 12.36m 0.21h 0.01d 0.000 y # IO & Wait Time: 228s 3.81m 0.06h 0.00d 0.000 y # Average job time: 10s 0.16m 0.00h 0.00d # Longest finished job: 13s 0.22m 0.00h 0.00d # Submission to last job: 25s 0.42m 0.01h 0.00d # this is the big cluster job time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScores \ -stop=specScores -buildDir=`pwd` anoGam3 ensGene) > specScores.log 2>&1 # real 1110m57.233s # Completed: 186799 of 186799 jobs # CPU time in finished jobs: 12910905s 215181.76m 3586.36h 149.43d 0.409 y # IO & Wait Time: 47964247s 799404.11m 13323.40h 555.14d 1.521 y # Average job time: 326s 5.43m 0.09h 0.00d # Longest finished job: 580s 9.67m 0.16h 0.01d # Submission to last job: 63574s 1059.57m 17.66h 0.74d time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=effScores \ -stop=offTargets -buildDir=`pwd` anoGam3 ensGene) \ > effScores.offTargets.log 2>&1 # real 251m21.154s # effScores # Completed: 1413 of 1413 jobs # CPU time in finished jobs: 793726s 13228.77m 220.48h 9.19d 0.025 y # IO & Wait Time: 10053s 167.55m 2.79h 0.12d 0.000 y # Average job time: 569s 9.48m 0.16h 0.01d # Longest finished job: 13440s 224.00m 3.73h 0.16d # Submission to last job: 14019s 233.65m 3.89h 0.16d # offTargets # Completed: 9340 of 9340 jobs # CPU time in finished jobs: 14042s 234.03m 3.90h 0.16d 0.000 y # IO & Wait Time: 37970s 632.84m 10.55h 0.44d 0.001 y # Average job time: 6s 0.09m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 66s 1.10m 0.02h 0.00d time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=load \ -stop=load -buildDir=`pwd` anoGam3 ensGene) \ > load.log 2>&1 # real 13m52.830s featureBits -countGaps anoGam3 crispr10KRanges # 147498676 bases of 264974304 (55.665%) in intersection # XXX this does not work: Mon Jan 8 13:29:18 PST 2018 ~/kent/src/utils/doLocusName -o anoGam3 ensGene sh: tabRepl: command not found Usage: bedOverlapMerge [options] file - merge overlapping bed features, join their names bedOverlapMerge: error: no such option: -o Usage: bedBetween [options] inSortedBedFile outfile (stdout ok): given sorted feats., create features between them and annotated these with the neighboring bednames. Regions around chromosomes are limited to 50kbp. bedBetween: error: no such option: -m ##############################################################################