# for emacs: -*- mode: sh; -*- # DATE: 31-May-2012 # ORGANISM: Alligator mississippiensis # TAXID: 8496 # ASSEMBLY LONG NAME: allMis0.2 # ASSEMBLY SHORT NAME: allMis0.2 # ASSEMBLY SUBMITTER: International Crocodilian Genomes Working Group # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000281125.1 # FTP-RELEASE DATE: 07-Aug-2012 # http://www.ncbi.nlm.nih.gov/genome/13409 # http://www.ncbi.nlm.nih.gov/assembly/406428/ # http://www.ncbi.nlm.nih.gov/bioproject/159843 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AKHW01 # Genome Coverage : 68x # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=8496 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Alligator_mississippiensis/allMis0.2/ ########################################################################## # Download sequence (DONE - 2012-08-23 - Hiram) mkdir /hive/data/genomes/allMis1 cd /hive/data/genomes/allMis1 mkdir genbank cd genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Alligator_mississippiensis/allMis0.2/ ./ # verify the size of the sequence here: faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2174243242 bases (45539765 N's 2128703477 real 2128703477 upper 0 lower) # in 14644 sequences in 1 files # Total size: mean 148473.3 sd 283025.3 min 816 (gi|397224103|gb|AKHW01085896.1|) # max 4958242 (gi|397457027|gb|JH738473.1|) median 25764 # %0.00 masked total, %0.00 masked real # strip the names down to something reasonable, they can all be: # >JH7[0-9] zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e "s/^>.*JH7/>JH7/; s/^>.*AKHW01/>AKHW01/; s/.1. Alligator.*//" \ | gzip -c > ucsc.fa.gz zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | sed -e "s/^\(JH7[0-9]*\).1/\1/; s/^\(AKHW01[0-9]*\).1/\1/;" \ | gzip -c > ucsc.agp.gz time checkAgpAndFa ucsc.agp.gz ucsc.fa.gz 2>&1 | tail -2 # Valid Fasta file entry # All AGP and FASTA entries agree - both files are valid # real 0m59.548s mkdir /hive/data/genomes/allMis1/photograph cd /hive/data/genomes/allMis1/photograph wget --timestamping \ http://www.nasa.gov/centers/kennedy/images/content/91159main_93pc780.jpg convert -geometry "220x300" 91159main_93pc780.jpg \ Alligator_Mississippiensis.jpg # check this .jpg file into the source tree kent/src/hg/htdocs/images/ git commit -m "gator photo from NASA Kennedy" Alligator_Mississippiensis.jpg # and copy to /usr/local/apache/htdocs/images cp -p Alligator_Mississippiensis.jpg /usr/local/apache/htdocs/images ########################################################################## # Initial makeGenomeDb.pl (DONE - 2012-08-23 - Hiram) # obtain a template for this from the source tree: # kent/src/hg/utils/automation/configFiles/ # and check it back into the source tree when completed here: cd /hive/data/genomes/geoFor1 cat << '_EOF_' > geoFor1.config.ra # Config parameters for makeGenomeDb.pl: db allMis1 clade vertebrate genomeCladePriority 68 scientificName Alligator Mississippiensis commonName American alligator assemblyDate Aug. 2012 assemblyLabel International Crocodilian Genomes Working Group assemblyShortLabel allMis0.2 orderKey 4380 mitoAcc NC_001922 fastaFiles /hive/data/genomes/allMis1/genbank/ucsc.fa.gz agpFiles /hive/data/genomes/allMis1/genbank/ucsc.agp.gz # qualFiles none dbDbSpeciesDir gator photoCreditURL http://green.soe.ucsc.edu/ photoCreditName Photo courtesy of the Green Lab, U.C. Santa Cruz ncbiGenomeId 13409 ncbiAssemblyId 406428 ncbiAssemblyName allMis0.2 ncbiBioProject 159843 genBankAccessionID GCA_000281125.1 taxId 8496 '_EOF_' # << happy emacs time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp allMis1.config.ra > agp.log 2>&1 # real 2m32.247s # verify OK: tail -1 agp.log # *** All done! (through the 'agp' step) # finish it off time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ -dbHost=hgwdev allMis1.config.ra > db.log 2>&1 # real 16m30.326s # was missing the photograph, it needs to exist for this to complete: time makeGenomeDb.pl -continue=trackDb -workhorse=hgwdev \ -fileServer=hgwdev -dbHost=hgwdev allMis1.config.ra > trackDb.log 2>&1 # real 9m24.133s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/allMis1.unmasked.2bit /gbdb/allMis1/allMis1.2bit # browser should function now hgsql -e 'update defaultDb set name="allMis1" where name="allMis9";' \ hgcentraltest ########################################################################## # running repeat masker (DONE - 2013-06-07 - Hiram) # with new repeat masker version that includes the Alligator repeats mkdir /hive/data/genomes/allMis1/bed/rmsk.2013-06-06 cd /hive/data/genomes/allMis1/bed/rmsk.2013-06-06 # the clusterRun portion: # real 1251m59.513s # user 0m34.986s # sys 0m19.211s # Completed: 6152 of 6152 jobs # CPU time in finished jobs: 16757781s 279296.34m 4654.94h 193.96d 0.531 y # IO & Wait Time: 1074282s 17904.71m 298.41h 12.43d 0.034 y # Average job time: 2899s 48.31m 0.81h 0.03d # Longest finished job: 4741s 79.02m 1.32h 0.05d # Submission to last job: 74762s 1246.03m 20.77h 0.87d time doRepeatMasker.pl -buildDir=`pwd` \ -continue=cat -dbHost=hgwdev \ -bigClusterHub=swarm -workhorse=hgwdev allMis1 > cat.log 2>&1 # real 42m20.999s cat faSize.rmsk.txt # 2174259888 bases (45539765 N's 2128720123 real 1337574861 upper # 791145262 lower) in 14645 sequences in 1 files # Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896) # max 4958242 (JH738473) median 25754 # %36.39 masked total, %37.17 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.2 # April 29 2013 (open-4-0-2) version of RepeatMasker # CC RELEASE 20130422; featureBits -countGaps allMis1 rmsk # 793968795 bases of 2174259888 (36.517%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-08-28 - Hiram) mkdir /hive/data/genomes/allMis1/bed/simpleRepeat cd /hive/data/genomes/allMis1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ allMis1 > do.log 2>&1 & # real 9m46.133s cat fb.simpleRepeat # 22565360 bases of 2129659933 (1.060%) in intersection # considering rmsk %37 vs. WM %29, using RM and TRF cd /hive/data/genomes/allMis1 twoBitMask allMis1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed allMis1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa allMis1.2bit stdout | faSize stdin > faSize.allMis1.2bit.txt cat faSize.allMis1.2bit.txt # 2174259888 bases (45539765 N's 2128720123 real 1337312569 upper # 791407554 lower) in 14645 sequences in 1 files # Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896) # max 4958242 (JH738473) median 25754 # %36.40 masked total, %37.18 masked real rm /gbdb/allMis1/allMis1.2bit ln -s `pwd`/allMis1.2bit /gbdb/allMis1/allMis1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/allMis1/bed/gap cd /hive/data/genomes/allMis1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../allMis1.unmasked.2bit > findMotif.txt 2>&1 # real 0m29.584s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits allMis1 -not gap -bed=notGap.bed # 2129659933 bases of 2129659933 (100.000%) in intersection # real 0m16.373s # can see now if allGaps.bed actually is all the gaps: hgsql -N -e "select size from gap;" allMis1 | ave stdin | grep total # total 44599955.000000 ave -col=5 allGaps.bed | grep total # total 45539765.000000 # not the same count, not all gaps are marked time featureBits allMis1 allGaps.bed notGap.bed -bed=new.gaps.bed # 939810 bases of 2129659933 (0.044%) in intersection # real 44m59.537s # check if any non-bridged gaps here: hgsql -N -e "select bridge from gap;" allMis1 | sort | uniq -c # 99514 yes ######################################################################### # cytoBandIdeo - (DONE - 2013-06-18 - Hiram) mkdir /hive/data/genomes/allMis1/bed/cytoBand cd /hive/data/genomes/allMis1/bed/cytoBand makeCytoBandIdeo.csh allMis1 ########################################################################## ## WINDOWMASKER (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/allMis1/bed/windowMasker cd /hive/data/genomes/allMis1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev allMis1 > do.log 2>&1 & # real 161m54.809s # Masking statistics cat faSize.allMis1.wmsk.txt # 2174259888 bases (45539765 N's 2128720123 real 1521018315 upper # 607701808 lower) in 14645 sequences in 1 files # Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896) # max 4958242 (JH738473) median 25754 # %27.95 masked total, %28.55 masked real cat faSize.allMis1.wmsk.sdust.txt # 2174259888 bases (45539765 N's 2128720123 real 1508788037 # upper 619932086 lower) in 14645 sequences in 1 files # Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896) # max 4958242 (JH738473) median 25754 # %28.51 masked total, %29.12 masked real cat faSize.allMis1.cleanWMSdust.txt # 2174259888 bases (45539765 N's 2128720123 real 1508788037 # upper 619932086 lower) in 14645 sequences in 1 files # Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896) # max 4958242 (JH738473) median 25754 # %28.51 masked total, %29.12 masked real cat fb.allMis1.windowmaskerSdust.clean.txt # 619953052 bases of 2174259888 (28.513%) in intersection # how much does this window masker and repeat masker overlap: # can be done after rmsk is done time featureBits -countGaps allMis1 rmsk windowmaskerSdust \ > fb.allMis1.rmsk.windowmaskerSdust.txt 2>&1 cat fb.allMis1.rmsk.windowmaskerSdust.txt # 460348480 bases of 2174259888 (21.173%) in intersection ########################################################################## # add simpleRepeats to WindowMasker result (DONE - 2012-07-26 - Hiram) # add to rmsk after it is done: cd /hive/data/genomes/allMis1 twoBitMask -add bed/windowMasker/allMis1.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed allMis1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa allMis1.2bit stdout | faSize stdin > faSize.allMis1.2bit.txt cat faSize.allMis1.2bit.txt # 1065292181 bases (24006152 N's 1041286029 real 833083628 upper # 208202401 lower) in 27239 sequences in 1 files # Total size: mean 39109.1 sd 545732.4 min 131 (JH767125) # max 30495534 (JH739887) median 598 # %19.54 masked total, %19.99 masked real rm /gbdb/allMis1/allMis1.2bit ln -s `pwd`/allMis1.2bit /gbdb/allMis1/allMis1.2bit ########################################################################## # cpgIslands - (DONE - 2013-06-07 - Hiram) mkdir /hive/data/genomes/allMis1/bed/cpgIslands cd /hive/data/genomes/allMis1/bed/cpgIslands time doCpgIslands.pl allMis1 > do.log 2>&1 & # real 72m55.069s cat fb.allMis1.cpgIslandExt.txt # 20404153 bases of 2129659933 (0.958%) in intersection ######################################################################### # genscan - (DONE - 2013-06-07 - Hiram) mkdir /hive/data/genomes/allMis1/bed/genscan cd /hive/data/genomes/allMis1/bed/genscan time doGenscan.pl allMis1 > do.log 2>&1 & # real 89m34.019s cat fb.allMis1.genscan.txt # 48525950 bases of 2129659933 (2.279%) in intersection cat fb.allMis1.genscanSubopt.txt # 53173655 bases of 2129659933 (2.497%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-08 - Hiram) # Use -repMatch=800, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2128720123 / 2897316137 \) \* 1024 # ( 2128720123 / 2897316137 ) * 1024 = 752.354698 # round up to 800 cd /hive/data/genomes/allMis1 time blat allMis1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/allMis1.11.ooc -repMatch=800 # Wrote 12702 overused 11-mers to jkStuff/allMis1.11.ooc # real 0m43.678s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" allMis1 | sort | uniq -c # 68589 yes ######################################################################### # AUTO UPDATE GENBANK (DONE - 2013-06-12,18 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Alligator mississippiensis 107 5428 0 # Alligator sinensis 16 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add allMis1 just after ce2 # allMis1 - Alligator mississippiensis allMis1.serverGenome = /hive/data/genomes/allMis1/allMis1.2bit allMis1.clusterGenome = /hive/data/genomes/allMis1/allMis1.2bit allMis1.ooc = /hive/data/genomes/allMis1/allMis1.11.ooc allMis1.lift = no allMis1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} allMis1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} allMis1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} allMis1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} allMis1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} allMis1.refseq.mrna.native.load = yes allMis1.refseq.mrna.xeno.load = yes allMis1.genbank.mrna.xeno.load = yes allMis1.downloadDir = allMis1 allMis1.upstreamGeneTbl = refGene allMis1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "updating allMis0 to allMis1 refs #8750" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S allMis1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial allMis1 & # var/build/logs/2013.06.12-15:21:37.allMis1.initalign.log # real 1444m27.582s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad allMis1 & # var/dbload/hgwdev/logs/2013.06.18-13:52:33.dbload.log # real 108m47.912s # check the end of that dbload.log to see if it was successful # hgwdev 2013.06.18-15:41:20 dbload: finish # enable daily alignment and update of hgwdev (DONE - 2012-05-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add allMis1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added allMis1 refs #8750" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to FOXP2 gene displays (DONE - 2013-06-28 - Hiram) hgsql -e \ 'update dbDb set defaultPos="JH731472:504271-884586" where name="allMis1";' \ hgcentraltest ############################################################################ # downloads and pushQ entry (DONE - 2013-06-28 - Hiram) # after adding allMis1 to the all.joiner file and verifying that # joinerCheck is clean, can construct the downloads: cd /hive/data/genomes/allMis1 time makeDownloads.pl -workhorse=hgwdev allMis1 > downloads.log 2>&1 & # real 21m2.394s mkdir /hive/data/genomes/allMis1/pushQ cd /hive/data/genomes/allMis1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl allMis1 2> stderr.txt > allMis1.sql # real 3m44.708s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/allMis1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/allMis1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/allMis1/bbi/quality.bw # WARNING: allMis1 does not have seq # WARNING: allMis1 does not have extFile # WARNING: allMis1 does not have estOrientInfo scp -p allMis1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/allMis1.sql" ########################################################################## # BLATSERVERS ENTRY (DONE - 2013-06-26 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("allMis1", "blat4a", "17848", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("allMis1", "blat4a", "17849", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # fixup search rule for assembly track/gold table (DONE - 2013-08-06 - Hiram) hgsql -N -e "select frag from gold;" allMis1 | sort | head -1 AKHW01000001.1 hgsql -N -e "select frag from gold;" allMis1 | sort | tail -2 AKHW01114158.1 NC_001922 # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" allMis1 | wc -l # 114159 hgsql -N -e "select frag from gold;" allMis1 | egrep -e '[AN][KC][H_][W0]0[0-9]+(\.1)?' | wc -l # 114159 hgsql -N -e "select frag from gold;" allMis1 | egrep -v -e '[AN][KC][H_][W0]0[0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/gator/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][KC][H_][W0]0[0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ######################################################################### # swap human/gator hg19/allMis1 lastz (DONE - 2013-06-30 - Hiram) # original alignment cd /hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28 cat fb.hg19.chainAllMis1Link.txt # 205244552 bases of 2897316137 (7.084%) in intersection # and for this swap mkdir /hive/data/genomes/allMis1/bed/blastz.hg19.swap cd /hive/data/genomes/allMis1/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # forgot to get stderr in the swap.log # real 27m54.364s cat fb.allMis1.chainHg19Link.txt # 172919308 bases of 2129659933 (8.120%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/allMis1/bed ln -s blastz.hg19.swap lastz.hg19 ######################################################################### ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ##############################################################################