# for emacs: -*- mode: sh; -*- # Ochotona princeps OchPri3.0 # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Ochotona_princeps/OchPri3.0/ # DATE: 21-May-2012 # ORGANISM: Ochotona princeps # TAXID: 9978 # ASSEMBLY LONG NAME: OchPri3.0 # ASSEMBLY SHORT NAME: OchPri3.0 # ASSEMBLY SUBMITTER: Broad Institute # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000292845.1 # FTP-RELEASE DATE: 28-Aug-2012 ########################################################################## # Download sequence (DONE - 2012-02-26 - Hiram) mkdir -p /hive/data/genomes/ochPri3/genbank cd /hive/data/genomes/ochPri3/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Ochotona_princeps/OchPri3.0/ ./ > fetch.log ########################################################################### # fixup to UCSC names (DONE - 2013-03-25 - Hiram) cd /hive/data/genomes/ochPri3 $HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl # constructs the directory: /hive/data/genomes/ochPri3/ucsc # with files: cd /hive/data/genomes/ochPri3/ucsc ls -ogrt # -rw-rw-r-- 1 14853091 Mar 25 21:40 ochPri3.ucsc.agp # -rw-rw-r-- 1 600736012 Mar 25 21:50 ochPri3.ucsc.fa.gz # -rw-rw-r-- 1 219 Mar 25 21:51 checkAgp.result.txt # this script also constructs the ochPri3.unmasked.2bit file, but # this is not needed with the makeGenomeDb.pl script: rm -f /hive/data/genomes/ochPri3/ochPri3.unmasked.2bit # there is one duplicate contig: echo ALIT01123196 > remove.duplicate.list zcat ochPri3.ucsc.fa.gz > /dev/shm/ochPri3.ucsc.fa faSomeRecords -exclude /dev/shm/ochPri3.ucsc.fa remove.duplicate.list stdout | gzip -c > ochPri3.clean.fa.gz grep -v ALIT01123196 ochPri3.ucsc.agp > ochPri3.clean.agp ########################################################################### # Initial genome build (DONE - 2013-06-18 - Hiram) cd /hive/data/genomes/ochPri3 cat << '_EOF_' > ochPri3.config.ra # Config parameters for makeGenomeDb.pl: db ochPri3 clade mammal # genomeCladePriority 35 scientificName Ochotona princeps commonName Pika assemblyDate May 2012 assemblyLabel Broad Institute assemblyShortLabel OchPri3.0 orderKey 1929 mitoAcc NC_005358 fastaFiles /hive/data/genomes/ochPri3/ucsc/ochPri3.clean.fa.gz agpFiles /hive/data/genomes/ochPri3/ucsc/ochPri3.clean.agp # qualFiles none dbDbSpeciesDir pika photoCreditURL http://en.wikipedia.org/wiki/File:Ochotona_princeps.jpg photoCreditName Justin Johnsen, Wikipedia Commons ncbiGenomeId 771 ncbiAssemblyId 681538 ncbiAssemblyName OchPri3.0 ncbiBioProject 193497 genBankAccessionID GCA_000292845.1 taxId 9978 '_EOF_' # run step wise to confirm sequence and AGP files match each other time makeGenomeDb.pl -fileServer=hgwdev -workhorse=hgwdev \ -stop=agp ochPri3.config.ra > genomeDb.agp.log 2>&1 # real 4m15.127s # verify it is OK: tail -1 genomeDb.agp.log # *** All done! (through the 'agp' step) time nice -n +19 makeGenomeDb.pl -fileServer=hgwdev \ -workhorse=hgwdev -continue=db ochPri3.config.ra \ > genomeDb.db.log 2>&1 # real 21m27.788s # add the trackDb business to the source tree ########################################################################## # running repeat masker (DONE - 2013-06-18 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/repeatMasker cd /hive/data/genomes/ochPri3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek ochPri3 > do.log 2>&1 & # real 2327m8.018s cat faSize.rmsk.txt # 2229835716 bases (285847846 N's 1943987870 real 1370818698 upper # 573169172 lower) in 10420 sequences in 1 files # Total size: mean 213995.8 sd 2558422.6 min 1000 (ALIT01132924) # max 83735184 (JH802061) median 2572 # %25.70 masked total, %29.48 masked real egrep -i "versi|releas" do.log # RepeatMasker version open-4.0.2 # April 29 2013 (open-4-0-2) version of RepeatMasker # CC RELEASE 20130422; * ########################################################################## # running simple repeat (DONE - 2013-06-18 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/simpleRepeat cd /hive/data/genomes/ochPri3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ ochPri3 > do.log 2>&1 & # about 24 minutes cat fb.simpleRepeat # 23780445 bases of 1943987870 (1.223%) in intersection # using RMSK and TRF since RMSK is about the same as WM cd /hive/data/genomes/ochPri3 twoBitMask ochPri3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed ochPri3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa ochPri3.2bit stdout | faSize stdin > faSize.ochPri3.2bit.txt cat faSize.ochPri3.2bit.txt # 2229835716 bases (285847846 N's 1943987870 real 1369994081 upper # 573993789 lower) in 10420 sequences in 1 files # Total size: mean 213995.8 sd 2558422.6 min 1000 (ALIT01132924) # max 83735184 (JH802061) median 2572 # %25.74 masked total, %29.53 masked real rm /gbdb/ochPri3/ochPri3.2bit ln -s `pwd`/ochPri3.2bit /gbdb/ochPri3/ochPri3.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2013-06-18 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/gap cd /hive/data/genomes/ochPri3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../ochPri3.unmasked.2bit > findMotif.txt 2>&1 # real 0m40.949s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits ochPri3 -not gap -bed=notGap.bed # 1943987870 bases of 1943987870 (100.000%) in intersection # real 0m11.599s awk '{print $3-$2,$0}' notGap.bed | sort -rn > notGap.sizes.txt # largest contiguous sequence: head -1 notGap.sizes.txt | awk '{print $1}' # 579101 # minimal coverage 1 base out of that largest sequence: echo 579101 | awk '{printf "%15.10f\n", 1/(2*$1)}' | sed -e 's/ //g' # 0.0000008634 time bedIntersect -minCoverage=0.0000008634 allGaps.bed notGap.bed \ test.new.gaps.bed # real 0m1.884s # number of bases in these new gaps, none: # -rw-rw-r-- 1 0 Jun 18 15:37 test.new.gaps.bed # when non-zero, to count them, e.g.: # awk '{print $3-$2}' test.new.gaps.bed | ave stdin | grep total # total 8314.000000 ######################################################################### # cytoBandIdeo - (DONE - 2013-06-18 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/cytoBand cd /hive/data/genomes/ochPri3/bed/cytoBand makeCytoBandIdeo.csh ochPri3 ########################################################################## ## WINDOWMASKER (DONE - 2013-06-18 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/windowMasker cd /hive/data/genomes/ochPri3/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev ochPri3 > do.log 2>&1 & # about 195 minutes cat faSize.ochPri3.cleanWMSdust.txt # 2229835716 bases (285847846 N's 1943987870 real 1348364836 # upper 595623034 lower) in 10420 sequences in 1 files # Total size: mean 213995.8 sd 2558422.6 min 1000 (ALIT01132924) # max 83735184 (JH802061) median 2572 # %26.71 masked total, %30.64 masked real # This is pretty good for WM, but RMSK isn't that bad either, # so, using the RMSK result to mask the genome featureBits -countGaps ochPri3 rmsk windowmaskerSdust > fb.ochPri3.rmsk.windowmaskerSdust.txt 2>&1 cat fb.ochPri3.rmsk.windowmaskerSdust.txt # 416158057 bases of 2947024286 (14.121%) in intersection ######################################################################## # cpgIslands - (DONE - 2013-06-20 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/cpgIslands cd /hive/data/genomes/ochPri3/bed/cpgIslands time doCpgIslands.pl ochPri3 > do.log 2>&1 & # Elapsed time: 14m35s cat fb.ochPri3.cpgIslandExt.txt # 20228048 bases of 1943987870 (1.041%) in intersection ############################################################################# # CPG Islands Unmasked track (DONE - 2014-04-16 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/cpgIslandsUnmasked cd /hive/data/genomes/ochPri3/bed/cpgIslandsUnmasked time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \ -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \ -workhorse=hgwdev \ -maskedSeq=/hive/data/genomes/ochPri3/ochPri3.unmasked.2bit ochPri3 > do.log 2>&1 ############################################################################# # genscan - (DONE - 2013-06-20 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/genscan cd /hive/data/genomes/ochPri3/bed/genscan time doGenscan.pl ochPri3 > do.log 2>&1 & # real 85m9.645s # last job finished manually: time ./lastGsBig.csh JH802073 000 gtf/000/JH802073.gtf \ pep/000/JH802073.pep subopt/000/JH802073.bed # real 14m23.140s time doGenscan.pl -continue=makeBed ochPri3 > makeBed.log 2>&1 & # real 4m41.262s cat fb.ochPri3.genscan.txt # 58343898 bases of 1943987870 (3.001%) in intersection cat fb.ochPri3.genscanSubopt.txt # 52334278 bases of 1943987870 (2.692%) in intersection ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-20 - Hiram) # Use -repMatch=700, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1943987870 / 2897316137 \) \* 1024 # ( 1943987870 / 2897316137 ) * 1024 = 687.064678 # round up to 700 # ochPri2 was: -repMatch=700 # Wrote 19533 overused 11-mers to jkStuff/ochPri2.11.ooc cd /hive/data/genomes/ochPri3 blat ochPri3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/ochPri3.11.ooc -repMatch=700 # Wrote 19859 overused 11-mers to jkStuff/ochPri3.11.ooc # there are *only* bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" ochPri3 | sort | uniq -c # 122542 yes ######################################################################### # AUTO UPDATE GENBANK (DONE - 2014-04-16 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Ochotona princeps 1 0 8 # to decide which "native" mrna or ests you want to specify in genbank.conf # this appears that ochPri3 has plenty of native est's ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add ochPri3 before echTel1 and commit to GIT # ochPri3 (tenrec) ochPri3.serverGenome = /hive/data/genomes/ochPri3/ochPri3.2bit ochPri3.clusterGenome = /hive/data/genomes/ochPri3/ochPri3.2bit ochPri3.ooc = /hive/data/genomes/ochPri3/jkStuff/ochPri3.11.ooc ochPri3.lift = no ochPri3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} ochPri3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} ochPri3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} ochPri3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} ochPri3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} ochPri3.refseq.mrna.native.load = no ochPri3.refseq.mrna.xeno.load = yes ochPri3.genbank.mrna.xeno.load = no ochPri3.genbank.est.native.load = no ochPri3.genbank.mrna.native.load = no ochPri3.genbank.mrna.native.loadDesc = no ochPri3.downloadDir = ochPri3 ochPri3.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding ochPri3 pika refs #9347" etc/genbank.conf git push make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # ochPri genome information, if this is a new species, need to add stuff # there ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial ochPri3 & # var/build/logs/2014.04.16-15:35:03.ochPri3.initalign.log # real 604m7.753s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ochPri3 & # logFile: var/dbload/hgwdev/logs/2014.04.17-10:50:53.ochPri3.dbload.log # real 11m21.383s # enable daily alignment and update of hgwdev (TBD - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add ochPri3 to: etc/align.dbs etc/hgwdev.dbs vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added ochPri3 to daily hgwdev build refs #9347" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # construct liftOver from ochPri2 to ochPri3 (DONE - 2014-04-18 - Hiram) # documentation for this step is in ochPri2 - remember to do this ########################################################################### # construct downloads files (DONE - 2014-04-23 - Hiram) # before starting downloads, the joinerCheck should be clean # after ochPri3 is added to all.joiner: joinerCheck -keys -database=ochPri3 all.joiner cd /hive/data/genomes/ochPri3 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev ochPri3 \ > downloads.log 2>&1 # about 40 minutes # examine the README.txt files to verify the text ########################################################################### # ready for first pushQ entry (DONE - 2013-06-12 - Hiram) mkdir /hive/data/genomes/ochPri3/pushQ cd /hive/data/genomes/ochPri3/pushQ makePushQSql.pl ochPri3 > ochPri3.sql 2> stderr.out # some errors are legitimate and OK: head stderr.out # WARNING: hgwdev does not have /gbdb/ochPri3/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/ochPri3/wib/quality.wib # WARNING: hgwdev does not have /gbdb/ochPri3/bbi/gc5BaseBw/gc5Base.bw # WARNING: hgwdev does not have /gbdb/ochPri3/bbi/qualityBw/quality.bw # WARNING: ochPri3 does not have seq # WARNING: ochPri3 does not have extFile # WARNING: ochPri3 does not have estOrientInfo scp -p ochPri3.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/ochPri3.sql' ############################################################################ # lastz alignment with Human/hg19 (DONE - 2014-06-30 - Hiram) # the original alignment cd /hive/data/genomes/hg19/bed/lastzOchPri3.2013-06-17 cat fb.hg19.chainOchPri3Link.txt # 1004662072 bases of 2897316137 (34.676%) in intersection # running the swap mkdir /hive/data/genomes/ochPri3/bed/blastz.hg19.swap cd /hive/data/genomes/ochPri3/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/hg19/bed/lastzOchPri3.2013-06-17/DEF \ -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 71m5.679s cat fb.ochPri3.chainHg19Link.txt # 969182988 bases of 1943987870 (49.855%) in intersection ############################################################################ # blat servers (TBD - 2014-06-10 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ochPri3", "blat4a", "17846", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ochPri3", "blat4a", "17847", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### ## Default position set at RHO protein found with blat ## (DONE - 2013-06-10 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="JH980304:37472293-37479623" where name="ochPri3";' hgcentraltest ############################################################################## # running cpgIsland business (DONE - 2013-06-20 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/cpgIsland cd /hive/data/genomes/ochPri3/bed/cpgIsland ln -s ../../../danRer6/bed/cpgIsland/cpglh.exe . mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../ochPri3.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh encodek cd /hive/data/genomes/ochPri3/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | egrep "^chr|^Zv" | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod +x runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time # Completed: 6457 of 6457 jobs # CPU time in finished jobs: 135s 2.25m 0.04h 0.00d 0.000 y # IO & Wait Time: 16267s 271.11m 4.52h 0.19d 0.001 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 28s 0.47m 0.01h 0.00d # Submission to last job: 871s 14.52m 0.24h 0.01d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed # verify longest unique chrom name: cut -f1 cpgIsland.bed | sed -e 's/_random//' \ | awk '{print length($0)}' | sort -rn | head -1 # 18 # update the length 14 in the template to be 16: sed -e "s/14/18/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql cd /hive/data/genomes/ochPri3/bed/cpgIsland hgLoadBed ochPri3 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed # Loaded 15374 elements of size 10 featureBits ochPri3 cpgIslandExt # 6520314 bases of 1701353770 (0.383%) in intersection # there should be no output from checkTableCoords: checkTableCoords -verboseBlocks -table=cpgIslandExt ochPri3 # cleanup rm -fr hardMaskedFa ############################################################################ # create ucscToINSDC name mapping (DONE - 2014-04-11 - Hiram) mkdir /hive/data/genomes/ochPri3/bed/ucscToINSDC cd /hive/data/genomes/ochPri3/bed/ucscToINSDC # this script has been maturing over time, it is close to complete. # to find a latest copy of it: # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh cp -p /hive/data/genomes/papAnu2/bed/ucscToINSDC/translateNames.sh . ./translateNames.sh # it says: # need to find chrM accessions # so add this one: echo -e 'chrM\tNC_005358.1' >> ucscToINSDC.txt # needs to be sorted to work with join sort ucscToINSDC.txt > ucscToINSDC.tab awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \ > name.coordinate.tab join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 12 # use the 12 in this sed: sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab ochPri3 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords ochPri3 ucscToINSDC # should cover all bases featureBits -countGaps ochPri3 ucscToINSDC # 2229835716 bases of 2229835716 (100.000%) in intersection ############################################################################## # setup search rule for assembly track (DONE - 2014-07-08 - Hiram) export maxLen=`hgsql -N -e 'select frag from gold;' ochPri3 | awk '{print length($0)}' | sort -run | head -1` echo $maxLen # 14 export C=1 while [ $C -le $maxLen ]; do echo -n " $C: " hgsql -N -e 'select frag from gold;' ochPri3 | sort -u \ | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g' C=`echo $C | awk '{print $1+1}'` done # 1: AN # 2: CL # 3: I_ # 4: 0T # 5: 0 # 6: 15 # 7: 013 # 8: 0123456789 # 9: 0123456789 # 10: 0123456789 # 11: 0123456789 # 12: 0123456789 # 13: . # 14: 1 searchTable gold searchMethod prefix searchType bed shortCircuit 1 termRegex [AN][CL][I_][0T]0[15][0-9]+(\.1)* query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # test pattern: hgsql -N -e 'select frag from gold;' ochPri3 | wc -l # 132962 hgsql -N -e 'select frag from gold;' ochPri3 \ | egrep -e '[AN][CL][I_][0T]0[15][0-9]+(\.1)*' | wc -l # 132962 hgsql -N -e 'select frag from gold;' ochPri3 \ | egrep -v -e '[AN][CL][I_][0T]0[15][0-9]+(\.1)*' | wc -l # 0 ############################################################################## ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ##############################################################################