# for emacs: -*- mode: sh; -*- # Sorex araneus 2.0 sequence: # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sorex_araneus/SorAra2.0/ ########################################################################## # Download sequence (DONE - 2012-08-27 - Hiram) mkdir -p /hive/data/genomes/sorAra2/genbank cd /hive/data/genomes/sorAra2/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sorex_araneus/SorAra2.0/ \ ./ > fetch.log 2>&1 ########################################################################### # fixup to UCSC names (DONE - 2013-03-25 - Hiram) cd /hive/data/genomes/sorAra2 $HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl # constructs the directory: /hive/data/genomes/sorAra2/ucsc # with files: cd /hive/data/genomes/sorAra2/ucsc ls -ogrt # -rw-rw-r-- 1 22718263 Mar 25 23:09 sorAra2.ucsc.agp # -rw-rw-r-- 1 671856471 Mar 25 23:20 sorAra2.ucsc.fa.gz # -rw-rw-r-- 1 209 Mar 25 23:21 checkAgp.result.txt cat checkAgp.result.txt # AALT02201788 1 1000 1 W AALT02201788.1 1 1000 + # agpFrag->chromStart: 0, agpFrag->chromEnd: 1000, dnaOffset: 0 # FASTA sequence entry # Valid Fasta file entry # All AGP and FASTA entries agree - both files are valid # this script also constructs the sorAra2.unmasked.2bit file, but # this is not needed with the makeGenomeDb.pl script: rm -f /hive/data/genomes/sorAra2/sorAra2.unmasked.2bit ########################################################################### # Initial genome build (DONE - 2013-05-28 - Hiram) cd /hive/data/genomes/sorAra2 cat << '_EOF_' > sorAra2.config.ra # Config parameters for makeGenomeDb.pl: db sorAra2 clade mammal genomeCladePriority 20.1 scientificName Sorex araneus commonName Shrew assemblyDate Aug. 2008 assemblyLabel Broad Institute SorAra2.0 assemblyShortLabel SorAra2.0 orderKey 2579 mitoAcc none fastaFiles /cluster/data/sorAra2/ucsc/sorAra2.ucsc.fa.gz agpFiles /cluster/data/sorAra2/ucsc/sorAra2.ucsc.agp # qualFiles /cluster/data/sorAra2/ucsc/sorAra1.quals.qac dbDbSpeciesDir shrew photoCreditURL http://en.wikipedia.org/wiki/File:Common_Shrew.jpg photoCreditName Sjonge, Wikipedia Commons ncbiGenomeId 226 ncbiAssemblyId 228978 ncbiAssemblyName SorAra2.0 ncbiBioProject 13689 genBankAccessionID GCA_000181275.2 taxId 42254 '_EOF_' # run step wise to confirm sequence and AGP files match each other time nice -n +19 makeGenomeDb.pl -fileServer=hgwdev \ -workhorse=hgwdev -stop=agp sorAra2.config.ra > genomeDb.agp.log 2>&1 # verify it is OK: tail -1 genomeDb.agp.log # *** All done! (through the 'agp' step) time nice -n +19 makeGenomeDb.pl -fileServer=hgwdev \ -workhorse=hgwdev -continue=db -stop=dbDb sorAra2.config.ra \ > genomeDb.dbDb.log 2>&1 time nice -n +19 makeGenomeDb.pl -fileServer=hgwdev \ -workhorse=hgwdev -continue=trackDb sorAra2.config.ra \ > genomeDb.trackDb.log 2>&1 # add the trackDb business to the source tree ########################################################################## # running repeat masker (DONE - 2013-05-29 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/repeatMasker cd /hive/data/genomes/sorAra2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek sorAra2 > do.log 2>&1 & # Elapsed time: 476m34s cat faSize.rmsk.txt # 2423158183 bases (231054757 N's 2192103426 real 1476627246 upper # 715476180 lower) in 12845 sequences in 1 files # Total size: mean 188646.0 sd 2097082.5 min 1000 (AALT02201788) # max 60238417 (JH798146) median 1461 # %29.53 masked total, %32.64 masked real grep -i versi do.log # RepeatMasker version open-4.0.0 # January 10 2013 (open-4-0-0) version of RepeatMasker ########################################################################## # running simple repeat (DONE - 2013-05-29 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/simpleRepeat cd /hive/data/genomes/sorAra2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ sorAra2 > do.log 2>&1 & # about 15 minutes cat fb.simpleRepeat # 49169633 bases of 2192103426 (2.243%) in intersection cd /hive/data/genomes/sorAra2 twoBitMask sorAra2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed sorAra2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa sorAra2.2bit stdout | faSize stdin > faSize.sorAra2.2bit.txt cat faSize.sorAra2.2bit.txt # 2423158183 bases (231054757 N's 2192103426 real 1474029646 upper # 718073780 lower) in 12845 sequences in 1 files # Total size: mean 188646.0 sd 2097082.5 min 1000 (AALT02201788) # max 60238417 (JH798146) median 1461 # %29.63 masked total, %32.76 masked real rm /gbdb/sorAra2/sorAra2.2bit ln -s `pwd`/sorAra2.2bit /gbdb/sorAra2/sorAra2.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2013-05-29 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/gap cd /hive/data/genomes/sorAra2/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../sorAra2.unmasked.2bit > findMotif.txt 2>&1 # real 0m40.949s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits sorAra2 -not gap -bed=notGap.bed # 2192103426 bases of 2192103426 (100.000%) in intersection time featureBits sorAra2 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2192103426 (0.000%) in intersection # real 12m10.066s ######################################################################### # cytoBandIdeo - (DONE - 2013-06-12 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/cytoBand cd /hive/data/genomes/sorAra2/bed/cytoBand makeCytoBandIdeo.csh sorAra2 ########################################################################## ## WINDOWMASKER (DONE - 2013-05-29 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/windowMasker cd /hive/data/genomes/sorAra2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev sorAra2 > do.log 2>&1 & # about 3h 18m # Masking statistics cat faSize.sorAra2.cleanWMSdust.txt # 2423158183 bases (231054757 N's 2192103426 real 1336787616 upper # 855315810 lower) in 12845 sequences in 1 files # Total size: mean 188646.0 sd 2097082.5 min 1000 (AALT02201788) # max 60238417 (JH798146) median 1461 # %35.30 masked total, %39.02 masked real cat fb.sorAra2.rmsk.windowmaskerSdust.txt # 519681347 bases of 2423158183 (21.446%) in intersection ######################################################################## # cpgIslands - (DONE - 2013-06-12 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/cpgIslands cd /hive/data/genomes/sorAra2/bed/cpgIslands time doCpgIslands.pl sorAra2 > do.log 2>&1 # real 9m27.905s cat fb.sorAra2.cpgIslandExt.txt # 57978513 bases of 2192103426 (2.645%) in intersection ######################################################################### # genscan - (DONE - 2013-06-12 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/genscan cd /hive/data/genomes/sorAra2/bed/genscan time doGenscan.pl sorAra2 > do.log 2>&1 # real 38m51.895s cat fb.sorAra2.genscan.txt # 61328825 bases of 2192103426 (2.798%) in intersection cat fb.sorAra2.genscanSubopt.txt # 50722612 bases of 2192103426 (2.314%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-06 - Hiram) # Use -repMatch=800, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2192103426 / 2897316137 \) \* 1024 # ( 2192103426 / 2897316137 ) * 1024 = 774.756292 # round up to 800 # sorAra1 had: -repMatch=700 # Wrote 26119 overused 11-mers to sorAra1.11.ooc cd /hive/data/genomes/sorAra2 blat sorAra2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/sorAra2.11.ooc -repMatch=800 # Wrote 26682 overused 11-mers to jkStuff/sorAra2.11.ooc # there are *only* bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" sorAra2 | sort | uniq -c # 188953 yes ######################################################################### # AUTO UPDATE GENBANK (DONE - 2013-06-08 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Sorex araneus 1 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf # this appears that sorAra2 has plenty of native est's ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add sorAra2 before sorAra1 and commit to GIT # sorAra2 (common shrew) sorAra2.serverGenome = /hive/data/genomes/sorAra2/sorAra2.2bit sorAra2.clusterGenome = /hive/data/genomes/sorAra2/sorAra2.2bit sorAra2.ooc = /hive/data/genomes/sorAra2/jkStuff/sorAra2.11.ooc sorAra2.lift = no sorAra2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} sorAra2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} sorAra2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} sorAra2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} sorAra2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} sorAra2.refseq.mrna.native.load = no sorAra2.refseq.mrna.xeno.load = yes sorAra2.genbank.mrna.xeno.load = no sorAra2.genbank.est.native.load = no sorAra2.downloadDir = sorAra2 sorAra2.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding sorAra2 common shrew refs #9348" etc/genbank.conf git push make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # sorAra1 genome information, if this is a new species, need to add stuff # there ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial sorAra2 & # var/build/logs/2013.06.06-13:55:59.sorAra2.initalign.log # real 583m17.182s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sorAra2 & # logFile: var/dbload/hgwdev/logs/2013.06.08-15:19:02.dbload.log # real 12m18.795s # enable daily alignment and update of hgwdev (TBD - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add sorAra2 to: etc/align.dbs etc/hgwdev.dbs git commit -m "added sorAra2 - shrew - refs #9348" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # construct liftOver from sorAra1 to sorAra2 (DONE - 2013-06-10 - Hiram) # documentation for this step is in sorAra1 - remember to do this ########################################################################### # lastz alignment with Human/hg19 (DONE - 2013-06-19 - Hiram) # the original alignment cd /hive/data/genomes/hg19/bed/lastzSorAra2.2013-06-17 cat fb.hg19.chainSorAra2Link.txt # 802006172 bases of 2897316137 (27.681%) in intersection # running the swap mkdir /hive/data/genomes/sorAra2/bed/blastz.hg19.swap cd /hive/data/genomes/sorAra2/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzSorAra2.2013-06-17/DEF \ -swap -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -bigClusterHub=swarm -chainMinScore=3000 -chainLinearGap=medium \ > swap.log 2>&1 # real 77m5.017s cat fb.sorAra2.chainHg19Link.txt # 778513475 bases of 2192103426 (35.514%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sorAra2/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################ # blat servers (DONE - 2013-06-10 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("sorAra2", "blat4b", "17846", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("sorAra2", "blat4b", "17847", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### ## Default position to same as sorAra1 found via blat ## (DONE - 2013-06-10 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="JH798167:6989994-6998804" where name="sorAra2";' hgcentraltest ######################################################################### # CPG Islands track (DONE - 2013-06-12 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/cpgIslands cd /hive/data/genomes/sorAra2/bed/cpgIslands time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -smallClusterHub=ku -workhorse=hgwdev sorAra2 > do.log 2>&1 ############################################################################# # lastz swap Mouse/Mm10 (DONE - 2013-06-17 - Hiram) # alignment to mouse cd /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 cat fb.mm10.chainSorAra2Link.txt # 354499462 bases of 2652783500 (13.363%) in intersection # and this swap mkdir /hive/data/genomes/sorAra2/bed/blastz.mm10.swap cd /hive/data/genomes/sorAra2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 39m53.463s cat fb.sorAra2.chainMm10Link.txt # 343760052 bases of 2192103426 (15.682%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sorAra2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # CPG Islands Unmasked track (DONE - 2014-04-24 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/cpgIslandsUnmasked cd /hive/data/genomes/sorAra2/bed/cpgIslandsUnmasked time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \ -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \ -workhorse=hgwdev \ -maskedSeq=/hive/data/genomes/sorAra2/sorAra2.unmasked.2bit \ sorAra2 > do.log 2>&1 # real 8m11.884s cat fb.sorAra2.cpgIslandExtUnmasked.txt # 61507517 bases of 2192103426 (2.806%) in intersection ############################################################################## # create ucscToINSDC name mapping (DONE - 2014-04-24 - Hiram) mkdir /hive/data/genomes/sorAra2/bed/ucscToINSDC cd /hive/data/genomes/sorAra2/bed/ucscToINSDC # this script has been maturing over time, it is close to complete. # to find a latest copy of it: # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh cp -p /hive/data/genomes/ochPri3/bed/ucscToINSDC/translateNames.sh . ./translateNames.sh # needs to be sorted to work with join sort ucscToINSDC.txt > ucscToINSDC.tab awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \ > name.coordinate.tab join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 12 # use the 12 in this sed: sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab sorAra2 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords sorAra2 ucscToINSDC # should cover all bases featureBits -countGaps sorAra2 ucscToINSDC # 2423158183 bases of 2423158183 (100.000%) in intersection ############################################################################## # construct downloads files (DONE - 2014-04-23 - Hiram) # before starting downloads, the joinerCheck should be clean # after sorAra2 is added to all.joiner: joinerCheck -keys -database=sorAra2 all.joiner cd /hive/data/genomes/sorAra2 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev sorAra2 \ > downloads.log 2>&1 # real 23m48.223s # examine the README.txt files to verify the text ############################################################################## # ready for first pushQ entry (DONE - 2014-04-28 - Hiram) mkdir /hive/data/genomes/sorAra2/pushQ cd /hive/data/genomes/sorAra2/pushQ makePushQSql.pl sorAra2 > sorAra2.sql 2> stderr.out # real 1m52.294s # some errors are legitimate and OK: head stderr.out # WARNING: hgwdev does not have /gbdb/sorAra2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/sorAra2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/sorAra2/bbi/gc5BaseBw/gc5Base.bw # WARNING: hgwdev does not have /gbdb/sorAra2/bbi/qualityBw/quality.bw # WARNING: sorAra2 does not have seq # WARNING: sorAra2 does not have extFile # WARNING: sorAra2 does not have estOrientInfo scp -p sorAra2.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/sorAra2.sql' ############################################################################## # setup search rule for assembly track (DONE - 2014-07-08 - Hiram) export maxLen=`hgsql -N -e 'select frag from gold;' sorAra2 | awk '{print length($0)}' | sort -run | head -1` echo $maxLen # 14 export C=1 while [ $C -le $maxLen ]; do echo -n " $C: " hgsql -N -e 'select frag from gold;' sorAra2 | sort -u \ | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g' C=`echo $C | awk '{print $1+1}'` done # 1: A # 2: A # 3: L # 4: T # 5: 0 # 6: 2 # 7: 012 # 8: 0123456789 # 9: 0123456789 # 10: 0123456789 # 11: 0123456789 # 12: 0123456789 # 13: . # 14: 1 searchTable gold searchMethod prefix searchType bed shortCircuit 1 termRegex AALT02[0-9]+(\.1)* query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # test pattern: hgsql -N -e 'select frag from gold;' sorAra2 | wc -l # 201798 hgsql -N -e 'select frag from gold;' sorAra2 \ | egrep -e 'AALT02[0-9]+(\.1)*' | wc -l # 201798 hgsql -N -e 'select frag from gold;' sorAra2 \ | egrep -v -e 'AALT02[0-9]+(\.1)*' | wc -l # 0 ############################################################################## ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ##############################################################################