# for emacs: -*- mode: sh; -*- # DATE: 22-Jan-2013 # ORGANISM: Amazona vittata # TAXID: 241585 # ASSEMBLY LONG NAME: AV1 # ASSEMBLY SHORT NAME: AV1 # ASSEMBLY SUBMITTER: Puerto Rican Parrot Genome Project # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000332375.1 # FTP-RELEASE DATE: 07-Feb-2013 # Available at NCBI 2013-02-07 # http://www.ncbi.nlm.nih.gov/bioproject/171587 # http://www.ncbi.nlm.nih.gov/genome/15170 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AOCU01 # http://www.ncbi.nlm.nih.gov/assembly/529068/ # 24X WGS 1,175 Mb 295,391 contigs 182,974 scaffolds N50 19,239 # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=241585 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Amazona_vittata/AV1/ ########################################################################## # Download sequence (DONE - 2013-02-26 - Hiram) mkdir -p /hive/data/genomes/amaVit1/genbank cd /hive/data/genomes/amaVit1/genbank rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Amazona_vittata/AV1/ ./ # verify the size of the sequence here: faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 1175404042 bases (47148290 N's 1128255752 real 1128255752 upper 0 lower) in 182974 sequences in 1 files # Total size: mean 6423.9 sd 11509.9 min 201 (gi|442875222|gb|AOCU01226611.1|) max 206462 (gi|443354616|gb|KB285691.1|) median 2065 # prototyping a new script that can manage any assembly that has just # these unplaced scaffolds: $HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl amaVit1 # constructs ucsc hierarchy: ls -og ../ucsc # -rw-rw-r-- 1 19792148 Mar 25 13:46 amaVit1.ucsc.agp # -rw-rw-r-- 1 346868275 Mar 25 13:51 amaVit1.ucsc.fa.gz # -rw-rw-r-- 1 209 Mar 25 18:48 checkAgp.result.txt # fetch photograph, same as from NCBI genome page: mkdir /hive/data/genomes/amaVit1/photo cd /hive/data/genomes/amaVit1/photo wget --timestamping \ http://upload.wikimedia.org/wikipedia/commons/9/98/Puerto_Rican_parrot.jpg convert -geometry "400x300" Puerto_Rican_parrot.jpg Amazona_vittata.jpg # -rw-rw-r-- 1 83471 Mar 7 14:53 Amazona_vittata.jpg # check this .jpg file into the source tree kent/src/hg/htdocs/images/ git commit -m "photo for amaVit1 browser, Tom MacKenzie, U.S. Fish and Wildlife Service www.fws.gov/caribbean/ES/Parrot-Gallery.html refs #10144" \ Amazona_vittata.jpg # and copy to /usr/local/apache/htdocs/images cp -p Amazona_vittata.jpg /usr/local/apache/htdocs/images ########################################################################## # Initial makeGenomeDb.pl (DONE - 2013-03-26 - Hiram) # prototyping a new template creation system: cd /hive/data/genomes/amaVit1 export genomeClade="vertebrate" export genomeCladePriority="60" export commonName="Parrot" export dbDbDir="birds" export genomeId="15170" export bioprojectId="171587" export assemblyId="529068" export mitoAcc="none" export orderKey="4310" export photoCreditURL="http://www.fws.gov/caribbean/ES/Parrot-Gallery.html" export photoCreditName="Tom MacKenzie, U.S. Fish and Wildlife Service" $HOME/kent/src/hg/utils/automation/mkConfigRa.sh /hive/data/genomes/amaVit1 \ ${genomeClade} ${genomeCladePriority} ${commonName} ${dbDbDir} \ ${genomeId} ${bioprojectId} ${assemblyId} ${mitoAcc} ${orderKey} \ ${photoCreditURL} "${photoCreditName}" > amaVit1.config.ra # constructs this file: # Config parameters for makeGenomeDb.pl: db amaVit1 clade vertebrate genomeCladePriority 60 orderKey 4310 commonName Parrot dbDbSpeciesDir birds scientificName Amazona vittata assemblyDate Jan. 2013 assemblyLabel Puerto Rican Parrot Genome Project AV1 assemblyShortLabel AV1 fastaFiles /hive/data/genomes/amaVit1/ucsc/*.fa.gz agpFiles /hive/data/genomes/amaVit1/ucsc/*.agp mitoAcc none # qualFiles none photoCreditURL http://www.fws.gov/caribbean/ES/Parrot-Gallery.html photoCreditName Tom MacKenzie, U.S. Fish and Wildlife Service ncbiGenomeId 15170 ncbiAssemblyName AV1 genBankAccessionID GCA_000332375.1 ncbiAssemblyId 529068 ncbiBioProject 171587 taxId 241585 # seq and agp steps were done without logs for testing, then continuing: # << happy emacs time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -continue=db amaVit1.config.ra > db.log 2>&1 # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/amaVit1.unmasked.2bit /gbdb/amaVit1/amaVit1.2bit # browser should function now, add the files from the trackDb # hierarchy here to the source tree # after checking in the photograph and getting it into # /usr/local/apache/htdocs/images: time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=trackDb -forceDescription -dbHost=hgwdev amaVit1.config.ra n50.pl chrom.sizes # reading: chrom.sizes # contig count: 182974, total size: 1175404042, one half size: 587702021 # cumulative N50 count contig contig size 587688024 16504 KB240690 19240 587702021 one half size 587707263 16505 KB265067 19239 ########################################################################## # running repeat masker (DONE - 2013-03-27 - Hiram) mkdir /hive/data/genomes/amaVit1/bed/repeatMasker cd /hive/data/genomes/amaVit1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek amaVit1 > do.log 2>&1 & # real 163m13.049s cat faSize.rmsk.txt # 1175404042 bases (47148290 N's 1128255752 real 1068522265 upper # 59733487 lower) in 182974 sequences in 1 files # Total size: mean 6423.9 sd 11509.9 min 201 (AOCU01226611) # max 206462 (KB285691) median 2065 # %5.08 masked total, %5.29 masked real egrep -i "versi|relea" do.log # January 10 2013 (open-4-0-0) version of RepeatMasker # CC RELEASE 20120418; featureBits -countGaps amaVit1 rmsk # 887930078 bases of 2410758013 (36.832%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2013-03-05 - Hiram) mkdir /hive/data/genomes/amaVit1/bed/simpleRepeat cd /hive/data/genomes/amaVit1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ amaVit1 > do.log 2>&1 & # real 85m49.548s cat fb.simpleRepeat # 7849266 bases of 1128255752 (0.696%) in intersection XXX - waiting to see what WM does # add to rmsk after it is done: cd /hive/data/genomes/amaVit1 twoBitMask amaVit1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed amaVit1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa amaVit1.2bit stdout | faSize stdin > faSize.amaVit1.2bit.txt cat faSize.amaVit1.2bit.txt # 2410758013 bases (132851443 N's 2277906570 real 1389998131 upper # 887908439 lower) in 7741 sequences in 1 files # Total size: mean 311427.2 sd 2002158.7 min 1000 (AEYP01117479) # max 52375790 (GL896898) median 1445 # %36.83 masked total, %38.98 masked real rm /gbdb/amaVit1/amaVit1.2bit ln -s `pwd`/amaVit1.2bit /gbdb/amaVit1/amaVit1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2013-03-27 - Hiram) mkdir /hive/data/genomes/amaVit1/bed/gap cd /hive/data/genomes/amaVit1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../amaVit1.unmasked.2bit > findMotif.txt 2>&1 # real 0m12.841s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits amaVit1 -not gap -bed=notGap.bed # 1128255752 bases of 1128255752 (100.000%) in intersection # real 1m2.648s # can see now if allGaps.bed actually is all the gaps: hgsql -N -e "select size from gap;" amaVit1 | ave stdin | grep total # total 47148290.000000 ave -col=5 allGaps.bed | grep total # total 47148290.000000 # same count, no new gaps # check if any non-bridged gaps here: hgsql -N -e "select bridge from gap;" amaVit1 | sort | uniq -c # 112417 yes ########################################################################## ## WINDOWMASKER (DONE - 2013-03-27 - Hiram) mkdir /hive/data/genomes/amaVit1/bed/windowMasker cd /hive/data/genomes/amaVit1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev amaVit1 > do.log 2>&1 & # real 234m56.918s # Masking statistics cat faSize.amaVit1.wmsk.txt # 1175404042 bases (47148290 N's 1128255752 real 926668264 upper # 201587488 lower) in 182974 sequences in 1 files # Total size: mean 6423.9 sd 11509.9 min 201 (AOCU01226611) # max 206462 (KB285691) median 2065 # %17.15 masked total, %17.87 masked real cat faSize.amaVit1.wmsk.sdust.txt # 1175404042 bases (47148290 N's 1128255752 real 921480969 upper # 206774783 lower) in 182974 sequences in 1 files # Total size: mean 6423.9 sd 11509.9 min 201 (AOCU01226611) # max 206462 (KB285691) median 2065 # %17.59 masked total, %18.33 masked real cat faSize.amaVit1.cleanWMSdust.txt # 1175404042 bases (47148290 N's 1128255752 real 921480969 # upper 206774783 lower) in 182974 sequences in 1 files # Total size: mean 6423.9 sd 11509.9 min 201 (AOCU01226611) # max 206462 (KB285691) median 2065 # %17.59 masked total, %18.33 masked real cat fb.amaVit1.windowmaskerSdust.clean.txt # 206774783 bases of 1175404042 (17.592%) in intersection # how much does this window masker and repeat masker overlap: # can be done after rmsk is done. The script will often # fail on this command in the doLoad.csh if RM is not yet # complete and these are running at the same time: featureBits -countGaps amaVit1 rmsk windowmaskerSdust # 453442864 bases of 2410758013 (18.809%) in intersection cat fb.amaVit1.rmsk.windowmaskerSdust.txt # 41747742 bases of 1175404042 (3.552%) in intersection # if the script did fail on that command, finish it: time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -continue=cleanup -dbHost=hgwdev amaVit1 > cleanup.log 2>&1 & # real 1m43.905s ########################################################################## # use WindowMasker and TRF for final masked genome (DONE - 2013-03-27 - Hiram) # add to rmsk after it is done: cd /hive/data/genomes/amaVit1 twoBitMask bed/windowMasker/amaVit1.cleanWMSdust.2bit \ -add bed/simpleRepeat/trfMask.bed amaVit1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa amaVit1.2bit stdout | faSize stdin > faSize.amaVit1.2bit.txt cat faSize.amaVit1.2bit.txt # 1175404042 bases (47148290 N's 1128255752 real 921150785 upper # 207104967 lower) in 182974 sequences in 1 files # Total size: mean 6423.9 sd 11509.9 min 201 (AOCU01226611) # max 206462 (KB285691) median 2065 # %17.62 masked total, %18.36 masked real rm /gbdb/amaVit1/amaVit1.2bit ln -s `pwd`/amaVit1.2bit /gbdb/amaVit1/amaVit1.2bit ########################################################################## # cpgIslands - (DONE - 2013-03-27 - Hiram) mkdir /hive/data/genomes/amaVit1/bed/cpgIslands cd /hive/data/genomes/amaVit1/bed/cpgIslands time doCpgIslands.pl amaVit1 > do.log 2>&1 # real 160m9.498s cat fb.amaVit1.cpgIslandExt.txt # 8721452 bases of 1128255752 (0.773%) in intersection ######################################################################### # genscan - (DONE - 2013-03-28 - Hiram) mkdir /hive/data/genomes/amaVit1/bed/genscan cd /hive/data/genomes/amaVit1/bed/genscan time doGenscan.pl amaVit1 > do.log 2>&1 # load step broken due to gtfToGenePred binary problems, fixed and # finished the load script manually, then continue: time doGenscan.pl -continue=cleanup amaVit1 > cleanup.log 2>&1 # real 3m24.993s cat fb.amaVit1.genscan.txt # 23027089 bases of 1128255752 (2.041%) in intersection cat fb.amaVit1.genscanSubopt.txt # 23060300 bases of 1128255752 (2.044%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-03-27 - Hiram) cd /hive/data/genomes/amaVit1 # Use -repMatch=450, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1175404042 / 2897316137 \) \* 1024 # ( 1175404042 / 2897316137 ) * 1024 = 415.423683 # round up to 450 cd /hive/data/genomes/amaVit1 time blat amaVit1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/amaVit1.11.ooc -repMatch=450 # Wrote 9104 overused 11-mers to jkStuff/amaVit1.11.ooc # real 0m26.387s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" amaVit1 | sort | uniq -c # 112417 yes # find other make doc where gapToLift is used to make nonBridged.bed file ######################################################################### # AUTO UPDATE GENBANK (TBD - 2013-03-06 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Mustela putorius 9 0 0 # Mustela putorius furo 122937 4210 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add amaVit1 just before petMar1 # amaVit1 (Ferret, Mustela putorius furo, taxId 9669) amaVit1.serverGenome = /hive/data/genomes/amaVit1/amaVit1.2bit amaVit1.clusterGenome = /hive/data/genomes/amaVit1/amaVit1.2bit amaVit1.ooc = /hive/data/genomes/amaVit1/jkStuff/amaVit1.11.ooc amaVit1.lift = no amaVit1.perChromTables = no amaVit1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} amaVit1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} amaVit1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} amaVit1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} amaVit1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} amaVit1.refseq.mrna.native.load = yes amaVit1.refseq.mrna.xeno.load = yes amaVit1.genbank.mrna.xeno.load = no amaVit1.genbank.est.native.load = yes amaVit1.downloadDir = amaVit1 # end of section added to etc/genbank.conf git commit -m "adding amaVit1 ferret redmine 6352" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. # musFurNames[] = {"Mustela putorius furo", "Mustela putorius", NULL}; git commit -m "adding definition for musFurNames redmine 6352" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S amaVit1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial amaVit1 & # var/build/logs/2013.03.06-09:49:44.amaVit1.initalign.log # real 325m17.355s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad amaVit1 & # real 28m17.094s # var/dbload/hgwdev/logs/2013.03.06-15:15:56.dbload.log # real 31m56.575s # check the end of that dbload.log to see if it was successful # hgwdev 2013.03.06-15:47:47 dbload: finish # enable daily alignment and update of hgwdev (TBD - 2013-03-06 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add amaVit1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added amaVit1/ferret to the daily build" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # construct ucscToEnsembl chrom name translation (2013-03-06 - Hiram) mkdir /hive/data/genomes/amaVit1/bed/ucscToEnsembl cd /hive/data/genomes/amaVit1/bed/ucscToEnsembl # all the Ensembl names are the same as UCSC with the addition of .1 cat ../../chrom.sizes | cut -f1 | sed -e 's/^\(.*\)/\1\t\1.1/' \ | sort > ucscToEnsembl.tab # find length of ucsc names for key length in SQL below: awk '{print length($1)}' ucscToEnsembl.tab | sort -u # 12 # 8 cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(12)) ); '_EOF_' hgsql amaVit1 < ucscToEnsembl.sql hgsql amaVit1 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl' ############################################################################ # Ensembl Genes version 70 (TBD - 2013-03-06 - Hiram) cd /hive/data/genomes/amaVit1 cat << '_EOF_' > amaVit1.ensGene.ra # required db variable db amaVit1 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to protect in perl: nameTranslation 's/^MT/chrM/; s/\.1//' '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=70 -stop=process \ amaVit1.ensGene.ra > ensGene.process.log 2>&1 # log indicates OK: # genePredCheck -db=amaVit1 amaVit1.allGenes.gp.gz # checked: 23963 failed: 0 doEnsGeneUpdate.pl -ensVersion=70 -continue=load \ amaVit1.ensGene.ra > ensGeneV70.load.log 2>&1 featureBits amaVit1 ensGene # 52850166 bases of 2277906570 (2.320%) in intersection ######################################################################### # set default position as recommended from Jeramiah Smith # (TBD - 2012-10-23 - Hiram) hgsql -e \ 'update dbDb set defaultPos="GL476334:480870-830419" where name="amaVit1";' \ hgcentraltest ############################################################################ # downloads and pushQ entry (TBD - 2012-03-06 - Hiram) # after adding amaVit1 to the all.joiner file and verifying that # joinerCheck is clean, can construct the downloads: cd /hive/data/genomes/amaVit1 time makeDownloads.pl -workhorse=hgwdev amaVit1 XXX - running - Thu Mar 7 10:02:01 PST 2013 # real 21m55.107s mkdir /hive/data/genomes/amaVit1/pushQ cd /hive/data/genomes/amaVit1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl amaVit1 2> stderr.txt > amaVit1.sql # real 3m38.916s # will have to verify this one after loading on hgwbeta: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # ucscToEnsembl # the script should be fixed to place this in Ensembl Genes track # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/amaVit1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/amaVit1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/amaVit1/bbi/quality.bw # WARNING: amaVit1 does not have seq # WARNING: amaVit1 does not have extFile scp -p amaVit1.sql hgwbeta:/tmp/ ssh hgwbeta "hgsql qapushq < /tmp/amaVit1.sql" ########################################################################## # BLATSERVERS ENTRY (TBD - 2012-10-23 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("amaVit1", "blat4b", "17838", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("amaVit1", "blat4b", "17839", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ##############################################################################