# for emacs: -*- mode: sh; -*- # Cavia porcellus -- Broad Institute Release 3.0 (Feb 12 2008) # Template from bosTau4.txt ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2008-04-02 - Tim and Kate) ssh kkstore05 mkdir /cluster/store12/cavPor3 ln -s /cluster/store12/cavPor3 /cluster/data mkdir /cluster/data/cavPor3/broad cd /cluster/data/cavPor3/broad wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.agp \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.bases.gz \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.quals.gz wget --timestamping \ #Not helpful ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly_supers.qual.gz qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin cavPor3.qual.qac ########## From broad inst. cavPor3/BasicStats.out # -------------------------------------------------------------------------------- # Sat Feb 02 03:52:11 2008 run (pid=21060), using Tue Jan 22 11:07:31 EST 2008 make # BasicStats PRE=/wga/dev/WGAdata DATA=projects/Guineapig RUN=run/work \ # SUBDIR=post5 QUAL_STATS=True OUTFILE=BasicStats.out # -------------------------------------------------------------------------------- # Supercontigs having < 3 reads or < 1kb sequence are ignored. # 8 gaps <= -1000; 0 gaps <= -10000; 0 gaps <= -100000 # fraction of gaps < -10kb or more than 4 deviations below zero: 0.0222% # 667 gaps > 10kb, 60 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb # 93.8% of reads were used in the assembly (95.55% of bases, 96.3% of Q20 bases) # 0.00512% of reads were used multiply in the assembly # 61603 contigs, having N50 length 80583 # total contig length: 2663352932, spanning 2722377657 bases (with 2.17% in gaps) # 3143 supercontigs, having N50 length 27408292 (not including gaps) # 2.48% of assembly in supers of size < 200000 (67399955 bases) # Assembly base coverage: 6.79X. 
Assembly Q20 coverage: 5.99X. # 100% of bases have q >= 1 # 99.57% of bases have q >= 20 # 99.02% of bases have q >= 30 # 98.4% of bases have q >= 40 # 97.72% of bases have q >= 50 cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 3143 cut -f 1 assembly.agp | uniq -c | sort -b -n -r | cut -c 1-8 | uniq -c | sort -n -r -b | head -3 # Number of scaffolds with single contig: 2176 thus 967 multi-contig scaffolds ######################################################################### # Create .ra file and run makeGenomeDb.pl ssh kkstore05 cd /cluster/data/cavPor3 cat << _EOF_ >cavPor3.config.ra # Config parameters for makeGenomeDb.pl: db cavPor3 clade mammal genomeCladePriority 35 scientificName Cavia porcellus commonName Guinea Pig assemblyDate Feb. 2008 assemblyLabel Broad Institute cavPor3 orderKey 99 #mitoAcc AJ222767 mitoAcc 5679797 fastaFiles /cluster/data/cavPor3/broad/assembly.bases.gz agpFiles /cluster/data/cavPor3/broad/assembly.agp qualFiles /cluster/data/cavPor3/broad/cavPor3.qual.qac dbDbSpeciesDir guineaPig _EOF_ # use 'screen' make sure on kkstore05 makeGenomeDb.pl -verbose=2 cavPor3.config.ra > makeGenomeDb.out 2>&1 & # 'ctl-a ctl -d' returns to previous shell cut -f 2 chrom.sizes | ave stdin # Q1 7169.500000 # median 13298.000000 # Q3 55788.500000 # average 866164.007952 # min 3002.000000 # max 88675666.000000 # count 3144 # total 2723219641.000000 # standard deviation 5316243.688364 # NOTES -- STUFF THAT YOU WILL HAVE TO DO -- # # # Template trackDb.ra and .html's have been created, but they all need editing! # # cd /cluster/data/cavPor3/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/guineaPig/cavPor3 # # Search for '***' notes in each file in and make corrections (sometimes the # files used for a previous assembly might make a better template): # description.html /cluster/data/cavPor3/html/{trackDb.ra,gap.html,gold.html} # # Then cd ../.. (to trackDb/) and # - edit makefile to add cavPor3 to DBS. 
# - (if necessary) cvs add guineaPig # - cvs add guineaPig/cavPor3 # - cvs add guineaPig/cavPor3/*.{ra,html} # - cvs ci -m "Added cavPor3 to DBS." makefile # - cvs ci -m "Initial descriptions for cavPor3." guineaPig/cavPor3 # - (if necessary) cvs ci guineaPig cvs ci -m "Initial description page for browser gateway of Cavia porcellus (guinea pig)" description.html cvs ci -m "Initial description page of gap location track for Cavia porcellus (guinea pig)" gap.html cvs ci -m "Initial description page of assembly track for Cavia porcellus (guinea pig)" gold.html # - Run make update DBS=cavPor3 and make alpha when done. # - (optional) Clean up /cluster/data/cavPor3/TemporaryTrackDbCheckout # - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there. ######################################################################### # REPEATMASKER (DONE - 2008-04-05 - Tim) ssh kkstore06 screen # use a screen to manage this job mkdir /cluster/data/cavPor3/bed/repeatMasker cd /cluster/data/cavPor3/bed/repeatMasker doRepeatMasker.pl -buildDir=/cluster/data/cavPor3/bed/repeatMasker \ cavPor3 > do.log 2>&1 & # Note: can run simpleRepeats simultaneously #### When done with RM: ssh pk para time # CPU time in finished jobs: 26236391s 437273.18m 7287.89h 303.66d 0.832 y # IO & Wait Time: 548808s 9146.80m 152.45h 6.35d 0.017 y # Average job time: 4735s 78.91m 1.32h 0.05d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 5871s 97.85m 1.63h 0.07d # Submission to last job: 130217s 2170.28m 36.17h 1.51d time nice -n +19 featureBits cavPor3 rmsk > fb.cavPor3.rmsk.txt 2>&1 & # 732765485 bases of 2663369733 (27.513%) in intersection # RepeatMasker and lib version from do.log: # RepeatMasker version development-$Id: cavPor3.txt,v 1.22 2010/04/21 19:22:27 hiram Exp $ # Jan 11 2008 (open-3-1-9) version of RepeatMasker # CC RELEASE 20071204; # Compare coverage to previous assembly: # skip this since cavPor 2 was never added to browser #featureBits cavPor2 rmsk 
######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-04-04 - Tim) ssh kkstore05 screen # use a screen to manage this job mkdir /cluster/data/cavPor3/bed/simpleRepeat cd /cluster/data/cavPor3/bed/simpleRepeat # doSimpleRepeat.pl -buildDir=/cluster/data/cavPor3/bed/simpleRepeat \ cavPor3 > do.log 2>&1 & #### When done ssh pk para time # Completed: 73 of 73 jobs # CPU time in finished jobs: 13220s 220.33m 3.67h 0.15d 0.000 y # IO & Wait Time: 679s 11.32m 0.19h 0.01d 0.000 y # Average job time: 190s 3.17m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 397s 6.62m 0.11h 0.00d # Submission to last job: 1320s 22.00m 0.37h 0.02d featureBits cavPor3 simpleRepeat # 33641656 bases of 2663369733 (1.263%) in intersection # after RM run is done, add this mask: cd /cluster/data/cavPor3 twoBitMask cavPor3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed cavPor3.2bit twoBitToFa cavPor3.2bit stdout | faSize stdin # 2723219641 bases (59849908 N's 2663369733 real 1930234123 upper 733135610 lower) in 3144 sequences in 1 files # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298 # N count: mean 19036.2 sd 100029.3 # U count: mean 613942.2 sd 3764766.9 # L count: mean 233185.6 sd 1462738.3 # %26.92 masked total, %27.53 masked real # >>> NOTE: 2723219641 bases match chrom.sizes above but not featureBits which is 2663369733 twoBitToFa cavPor3.rmsk.2bit stdout | faSize stdin # 2723219641 bases (59849908 N's 2663369733 real 1931053208 upper 732316525 lower) in 3144 sequences in 1 files # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298 # N count: mean 19036.2 sd 100029.3 # U count: mean 614202.7 sd 3766351.7 # L count: mean 232925.1 sd 1461161.8 # %26.89 masked total, %27.50 masked real ######################################################################### # Link to it from /gbdb: (DONE - 2008-04-10 - 
Tim) ln -s /cluster/data/cavPor3/cavPor3.2bit /gbdb/cavPor3/cavPor3.2bit ######################################################################### # Create OOC file for genbank runs (DONE - 2008-04-07 - Tim) # use same repMatch value as bosTau2 ssh kkstore05 cd /cluster/data/cavPor3 blat cavPor3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=cavPor3.11.1005.ooc -repMatch=1005 # Wrote 26905 overused 11-mers to cavPor3.11.1005.ooc # -rw-rw-r-- 1 tdreszer protein 107628 Apr 7 13:22 cavPor3.11.1005.ooc blat cavPor3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=cavPor3.11.ooc -repMatch=1024 # Wrote 25815 overused 11-mers to cavPor3.11.ooc # -rw-rw-r-- 1 tdreszer protein 103268 Apr 7 13:17 cavPor3.11.ooc ssh kkr1u00 mkdir /iscratch/i/cavPor3 cd /iscratch/i/cavPor3 cp -p /cluster/data/cavPor3/cavPor3.2bit . for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/cavPor3/ done ######################################################################### # Run WindowMasker because RepeatMasker coverage is low (DONE - 2008-04-08 - Tim) screen mkdir /cluster/data/cavPor3/bed/windowMasker ssh kkstore05 cd /cluster/data/cavPor3/bed/windowMasker nice doWindowMasker.pl -workhorse=kolossus \ -buildDir=/cluster/data/cavPor3/bed/windowMasker cavPor3 > wmRun.log 2>&1 & # load this initial data to get ready to clean it ssh hgwdev cd /cluster/data/cavPor3/bed/windowMasker hgLoadBed cavPor3 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 14554107 elements of size 3 # eliminate the gaps from the masking featureBits cavPor3 -not gap -bed=notGap.bed # 2663369733 bases of 2663369733 (100.000%) in intersection featureBits cavPor3 windowmaskerSdust # 990884347 bases of 2663369733 (37.204%) in intersection time nice -n +19 featureBits cavPor3 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 931034439 bases of 2663369733 (34.957%) in intersection # reload track to get it clean hgLoadBed cavPor3 windowmaskerSdust cleanWMask.bed.gz # Loaded 14547726 
elements of size 4 featureBits cavPor3 windowmaskerSdust # 931034439 bases of 2663369733 (34.957%) in intersection # mask the sequence with this clean mask zcat cleanWMask.bed.gz \ | twoBitMask ../../cavPor3.unmasked.2bit stdin \ -type=.bed cavPor3.cleanWMSdust.2bit twoBitToFa cavPor3.cleanWMSdust.2bit stdout | faSize stdin \ > cavPor3.cleanWMSdust.faSize.txt cat cavPor3.cleanWMSdust.faSize.txt # 2723219641 bases (59849908 N's 2663369733 real 1732335294 upper 931034439 lower) in 3144 sequences in 1 files # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298 # N count: mean 19036.2 sd 100029.3 # U count: mean 550997.2 sd 3547541.7 # L count: mean 296130.5 sd 1686844.5 # %34.19 masked total, %34.96 masked real ######################################################################### # Starting Genbank (Done - 2008-04-10 - Tim) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank # edit etc/genbank.conf to add the following entry: # cavPor3 (C. porcellus) 3144 scaffolds 967 are multi-contig and 2176 are single contig cavPor3.serverGenome = /cluster/data/cavPor3/cavPor3.2bit cavPor3.clusterGenome = /iscratch/i/cavPor3/cavPor3.2bit cavPor3.ooc = /cluster/data/cavPor3/cavPor3.11.ooc cavPor3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} cavPor3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} cavPor3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} cavPor3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} cavPor3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} cavPor3.lift = no cavPor3.downloadDir = cavPor3 cavPor3.perChromTables = no cavPor3.refseq.mrna.native.load = no cavPor3.refseq.mrna.xeno.load = yes cavPor3.genbank.mrna.xeno.load = yes cavPor3.genbank.est.native.load = yes cvs ci -m "Added cavPor3." 
etc/genbank.conf # update /cluster/data/genbank/: make etc-update cvs ci -m "Turned off native and on xeno." etc/genbank.conf # Edit src/lib/gbGenome.c to add new species. # static char *cavPorNames[] = {"Cavia porcellus", NULL}; # static struct dbToSpecies dbToSpeciesMap[] = { ...>>> {"cavPor", cavPorNames}, cvs ci -m "Added guinea pig." src/lib/gbGenome.c make install-server ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank # This is a call to a script that will push our jobs out to the cluster # since it's a big job. time nice -n +19 bin/gbAlignStep -initial cavPor3 & # logFile: var/build/logs/2008.03.10-14:14:43.cavPor3.initalign.log # real 567m7.431s # > For comparison: logFile: var/build/logs/2008.04.08-14:37:23.bosTau4.initalign.log # > real 45m6.595s # Batch failed after 4 tries on /cluster/genbank/genbank/bin/gbBlat genbank.164.0/cavPor3/full/psl/est.eb.native.1/scaffold_41/scaffold_41.job genbank.164.0/cavPor3/full/psl/est.eb.native.1/scaffold_41/scaffold_41.psl # command failed: ssh -x kk cd /cluster/data/genbank/build/work/initial.cavPor3/align\; para make -maxPush=200000 align.jobs >> {"cavPor", cavPorNames}, #cvs ci -m "Added guinea pig." src/lib/gbGenome.c make install-server ### Mark Diekhans recommends: # You are not really realigning, but aligning additional. If you # just kick off an initial alignment, it *should* align the # missing partitions of the data. However, sometimes weird things # happen when a new, full release is made, and a new refseq just # came out. So kick off the alignment and let me know when it's # done and I can take a look. # # Also, to just reload everything, include the -drop flag on your # command line. That is usually easiest; why waste your time # figuring out weird cases when the computer can just do the work. 
ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank # This is a call to a script that will push our jobs out to the cluster # since it's a big job. time nice -n +19 bin/gbAlignStep -initial cavPor3 & # logFile: var/build/logs/2008.05.22-15:44:34.cavPor3.initalign.log # real 182m21.821s # gbAlignInstall: complete: real=2.35 # genbank 2008.05.22-18:46:56 cavPor3.initalign: finish # Mark Diekhans: Look good, load it. # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad cavPor3 & # logFile: var/dbload/hgwdev/logs/2008.05.23-11:39:52.dbload.log # real 17m53.123s ############################################################################ # DONE - 2008-04-10 - Tim # Reset default position to Math1=ATOH1=NM_005172=scaffold_15:19047435-19048046 hgsql -e \ 'update dbDb set defaultPos="scaffold_15:19047434-19048046" where name="cavPor3";' \ hgcentraltest # And there was a mistake in the description date entry hgsql -e \ 'update dbDb set description="Feb. 2008" where name="cavPor3";' \ hgcentraltest ######################################################################### ## genscan run (DONE - 2008-04-10 - Tim) ## create hard masked sequence ssh kkstore05 cd /cluster/data/cavPor3 mkdir hardMasked for C in `cut -f1 chrom.sizes` do echo "hardMasked/${C}.hard.fa" twoBitToFa -seq=${C} cavPor3.2bit stdout \ | maskOutFa stdin hard hardMasked/${C}.hard.fa ls -ld "hardMasked/${C}.hard.fa" done # And, make sure there aren't any sequences in this lot that have # become all N's with no sequence left in them. 
This drives genscan nuts echo hardMasked/*.hard.fa | xargs faCount > faCount.hard.txt # the lowest three are: egrep -v "^#|^total" faCount.hard.txt \ | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3 # scaffold_2949 425 # scaffold_3101 357 # scaffold_3130 0 rm hardMasked/scaffold_3130.hard.fa ##### Saving this junk from bosTau4.txt because it will be useful next time ## # There are a whole bunch of these, and many with just a few bases. ## # Actually, before removing these for genscan, run the cpgIsland ## # business first since it can work on them all. ## # So, remove any with less than 100 bases of sequence ## egrep -v "^#|^total" faCount.hard.txt | awk '{size=$2-$7; if (size < 100){printf "hardMasked/%s.hard.fa\n", $1}}' | xargs rm # now get them over to a kluster location mkdir /san/sanvol1/scratch/cavPor3/hardChunks cd /san/sanvol1/scratch/cavPor3/hardChunks # creating 4,000,000 sized chunks, the chroms stay together as # single pieces. The contigs get grouped together into 4,000,000 # sized fasta files. You don't want to break these things up # because genscan will be doing its own internal 2.4 million # window on these pieces, and the gene names are going to be # constructed from the sequence name in these fasta files. echo /cluster/data/cavPor3/hardMasked/*.hard.fa | xargs cat \ | faSplit about stdin 4000000 c_ ls ssh hgwdev mkdir /cluster/data/cavPor3/bed/genscan cd /cluster/data/cavPor3/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh memk cd /cluster/data/cavPor3/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. 
ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_*.fa > genome.list # Create run-time script to operate gsBig in a cluster safe manner cat << \_EOF_ > runGsBig #!/bin/csh -fe set runDir = `pwd` set srcDir = $1 set inFile = $2 set fileRoot = $inFile:r mkdir /scratch/tmp/$fileRoot cp -p $srcDir/$inFile /scratch/tmp/$fileRoot pushd /scratch/tmp/$fileRoot /cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000 popd cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt rm -fr /scratch/tmp/$fileRoot _EOF_ # << happy emacs chmod +x runGsBig cat << \_EOF_ > template #LOOP runGsBig /san/sanvol1/scratch/cavPor3/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} #ENDLOOP _EOF_ # << happy emacs gensub2 genome.list single template jobList para create jobList para try, check, push, check, ... ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] para check ##172 jobs in batch ##13 jobs (including everybody's) in Parasol queue. ##Checking finished jobs ##crashed: 7 ##ranOk: 165 ##total jobs in batch: 172 ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] para status | grep -v done ##172 jobs in batch ##13 jobs (including everybody's) in Parasol queue. 
##Checking finished jobs ###state tries real cpu host jobid cmd ##crash 1 60.00 55.30 mkr0u0 825047 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_19.fa gtf/c_19.gtf pep/c_19.pep subopt/c_19.bed ##crash 1 215.00 213.48 mkr0u6 825117 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_152.fa gtf/c_152.gtf pep/c_152.pep subopt/c_152.bed ##crash 1 97.00 96.01 mkr0u0 825153 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_111.fa gtf/c_111.gtf pep/c_111.pep subopt/c_111.bed ##crash 1 149.00 146.78 mkr0u0 825158 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_107.fa gtf/c_107.gtf pep/c_107.pep subopt/c_107.bed ##crash 1 638.00 636.81 mkr0u3 825178 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_71.fa gtf/c_71.gtf pep/c_71.pep subopt/c_71.bed ##crash 1 904.00 901.75 mkr0u6 825185 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_32.fa gtf/c_32.gtf pep/c_32.pep subopt/c_32.bed ##crash 1 1238.00 1232.05 mkr0u6 825196 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_01.fa gtf/c_01.gtf pep/c_01.pep subopt/c_01.bed ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] para problems | grep Insuff ##Insufficient memory error: results may be unreliable. ##Insufficient memory error: results may be unreliable. ##Insufficient memory error: results may be unreliable. ##Insufficient memory error: results may be unreliable. ##Insufficient memory error: results may be unreliable. ##Insufficient memory error: results may be unreliable. ##Insufficient memory error: results may be unreliable. 
####### However, these are not simply the largest files: [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_19.fa -rw-rw-r-- 1 tdreszer protein 4408036 Apr 8 15:39 /san/sanvol1/scratch/cavPor3/hardChunks/c_19.fa [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_152.fa -rw-rw-r-- 1 tdreszer protein 8304826 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_152.fa [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_111.fa -rw-rw-r-- 1 tdreszer protein 18155094 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_111.fa [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_107.fa -rw-rw-r-- 1 tdreszer protein 20080719 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_107.fa [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_71.fa -rw-rw-r-- 1 tdreszer protein 36860211 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_71.fa [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_32.fa -rw-rw-r-- 1 tdreszer protein 49549701 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_32.fa [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_01.fa -rw-rw-r-- 1 tdreszer protein 82754438 Apr 8 15:39 /san/sanvol1/scratch/cavPor3/hardChunks/c_01.fa # Okay try splitting the failed c_*.fa files into individual sequences rt=retry ssh memk pushd /san/sanvol1/scratch/cavPor3/hardChunks faSplit sequence c_19.fa 1000 c_rt19_ ## Wrong one! 
faSplit sequence c_151.fa 1000 c_rt151_ faSplit sequence c_111.fa 1000 c_rt111_ faSplit sequence c_107.fa 1000 c_rt107_ faSplit sequence c_71.fa 1000 c_rt71_ faSplit sequence c_32.fa 1000 c_rt32_ faSplit sequence c_01.fa 1000 c_rt01_ popd # cd /cluster/data/cavPor3/bed/genscan ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt*.fa > genome_rt.list gensub2 genome_rt.list single template jobList_rt para create jobList_rt para try, check, push, check, ... # Checking finished jobs # crashed: 6 # ranOk: 71 # total jobs in batch: 77 ## missed one: pushd /san/sanvol1/scratch/cavPor3/hardChunks faSplit sequence c_152.fa 1000 c_rt152_ popd # cd /cluster/data/cavPor3/bed/genscan ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt152*.fa > genome_rt2.list gensub2 genome_rt2.list single template jobList_rt2 para create jobList_rt2 para try, check, push, check, ... # Checking finished jobs # crashed: 1 # ranOk: 10 # total jobs in batch: 11 ######## After breaking failed chunks by sequence, the 7 culprit scaffolds are known: # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt19_0010.fa # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt151_0010.fa # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt111_0010.fa # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt107_0010.fa # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt71_0025.fa # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt32_0005.fa # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt01_0000.fa #seq len A C G T N cpg #scaffold_115 4065651 736071 696850 700707 740890 1191133 60230 #scaffold_81 7649460 1502224 1297711 1285486 1482474 2081565 87359 #scaffold_45 15715546 2926604 2362405 2351615 2933678 5141244 126807 #scaffold_41 16993173 3764581 2421095 2432646 3805601 4569250 103547 #scaffold_21 33939536 7356221 4629107 4649855 7388452 9915901 190985 #scaffold_13 48363610 10226185 7754898 7730208 10253556 12398763 432844 #scaffold_1 81131790 17393288 12255795 12237937 17366862 21877908 561145 ### 
Punting. Hiram suggests that we just wait till the alignments fill us in. # At this point, all scaffolds have been successfully genscanned EXCEPT the 7 listed above # cat and lift the results into single files ssh kkstore05 cd /cluster/data/cavPor3/bed/genscan sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed cat pep/c_*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/cavPor3/bed/genscan ldHgGene cavPor3 -gtf genscan genscan.gtf # Read 64818 transcripts in 416957 lines in 1 files # 64818 groups 2179 seqs 1 sources 1 feature types # 64818 gene predictions hgPepPred cavPor3 generic genscanPep genscan.pep hgLoadBed cavPor3 genscanSubopt genscanSubopt.bed # Loaded 537376 elements of size 6 # let's check the numbers time nice -n +19 featureBits cavPor3 genscan # 76489707 bases of 2663369733 (2.872%) in intersection ######################################################################### ## genscan RErun (BEG: 2008-05-22 DONE: 2008-05-22 - Tim) ### ### QA found missing genscan data, so retrying the unlucky 7 by splitting on gaps ssh memk pushd /san/sanvol1/scratch/cavPor3/hardChunks faSplit gap c_rt01_0000.fa 1000000 c_rt_01_0000_gap_ faSplit gap c_rt32_0005.fa 1000000 c_rt_32_0005_gap_ faSplit gap c_rt71_0025.fa 1000000 c_rt_71_0025_gap_ faSplit gap c_rt107_0010.fa 1000000 c_rt_107_0010_gap_ faSplit gap c_rt111_0010.fa 1000000 c_rt_111_0010_gap_ faSplit gap c_rt151_0010.fa 1000000 c_rt_151_0010_gap_ faSplit gap c_rt19_0010.fa 1000000 c_rt_19_0010_gap_ popd # cd /cluster/data/cavPor3/bed/genscan ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt*_gap_*.fa > genome_gap.list gensub2 genome_gap.list single template jobList_gap para create jobList_gap para try, check, push, check, ... # 219 jobs in batch # 52852 jobs (including everybody's) in Parasol queue. # Checking finished jobs # ....... 
# ranOk: 219 # total jobs in batch: 219 # cat and lift the results into single files ssh kkstore05 cd /cluster/data/cavPor3/bed/genscan sort -k1,1 -k4.4n gtf/c_*.gtf > genscan_full.gtf sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt_full.bed cat pep/c_*.pep > genscan_full.pep # Load into the database as so: ssh hgwdev cd /cluster/data/cavPor3/bed/genscan ldHgGene cavPor3 -gtf genscan genscan_full.gtf # Read 70478 transcripts in 454271 lines in 1 files # 70478 groups 2398 seqs 1 sources 1 feature types # 70478 gene predictions hgPepPred cavPor3 generic genscanPep genscan_full.pep hgLoadBed cavPor3 genscanSubopt genscanSubopt_full.bed # Loaded 605836 elements of size 6 # let's check the numbers time nice -n +19 featureBits cavPor3 genscan # 76489707 bases of 2663369733 (2.872%) in intersection ### ### ### Total defeat. faSplit on gaps results in new sequence names that do not match to the known scaffold_nnnn sequences. # Clean up mess: cd /cluster/data/cavPor3/bed/genscan pushd /san/sanvol1/scratch/cavPor3/hardChunks rm c_rt_*_gap_*.fa popd # cd /cluster/data/cavPor3/bed/genscan rm gtf/c_rt_*_gap_*.gtf rm pep/c_rt_*_gap_*.pep rm subopt/c_rt_*_gap_*.bed # Try the seven culprits one more time. cat << \_EOF_ > genome_oneMoreTime.list /san/sanvol1/scratch/cavPor3/hardChunks/c_rt19_0010.fa /san/sanvol1/scratch/cavPor3/hardChunks/c_rt151_0010.fa /san/sanvol1/scratch/cavPor3/hardChunks/c_rt111_0010.fa /san/sanvol1/scratch/cavPor3/hardChunks/c_rt107_0010.fa /san/sanvol1/scratch/cavPor3/hardChunks/c_rt71_0025.fa /san/sanvol1/scratch/cavPor3/hardChunks/c_rt32_0005.fa /san/sanvol1/scratch/cavPor3/hardChunks/c_rt01_0000.fa _EOF_ ssh memk cd /cluster/data/cavPor3/bed/genscan gensub2 genome_oneMoreTime.list single template jobList_oneMoreTime para create jobList_oneMoreTime para push, check, ... # para status # 7 jobs in batch # 0 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # #state tries real cpu host jobid cmd # crash 1 58.00 54.08 mkr0u5 1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed # done 1 228.00 224.16 mkr0u6 1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed # crash 1 57.00 52.47 mkr0u3 1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed # crash 1 106.00 99.41 mkr0u7 1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed # crash 1 617.00 602.73 mkr0u5 1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed # crash 1 922.00 896.75 mkr0u3 1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed # crash 1 1307.00 1258.05 mkr0u7 1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed # One down, 6 to go. Try lowering window size on gsBig mv runGsBig runGsBig.24 cat << \_EOF_ > runGsBig #!/bin/csh -fe set runDir = `pwd` set srcDir = $1 set inFile = $2 set fileRoot = $inFile:r mkdir /scratch/tmp/$fileRoot cp -p $srcDir/$inFile /scratch/tmp/$fileRoot pushd /scratch/tmp/$fileRoot /cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=1200000 popd cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt rm -fr /scratch/tmp/$fileRoot _EOF_ # << happy emacs chmod +x runGsBig para push, check, ... 
# para status # 7 jobs in batch # 0 jobs (including everybody's) in Parasol queue. # Checking finished jobs # #state tries real cpu host jobid cmd # done 2 58.00 54.08 mkr0u5 1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed # done 1 228.00 224.16 mkr0u6 1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed # crash 2 57.00 52.47 mkr0u3 1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed # done 2 106.00 99.41 mkr0u7 1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed # done 2 617.00 602.73 mkr0u5 1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed # crash 2 922.00 896.75 mkr0u3 1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed # done 2 1307.00 1258.05 mkr0u7 1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed # All but two! para push, check, ... # para status # 7 jobs in batch # 0 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # #state tries real cpu host jobid cmd # done 2 58.00 54.08 mkr0u5 1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed # done 1 228.00 224.16 mkr0u6 1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed # done 3 57.00 52.47 mkr0u3 1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed # done 2 106.00 99.41 mkr0u7 1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed # done 2 617.00 602.73 mkr0u5 1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed # done 3 922.00 896.75 mkr0u3 1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed # done 2 1307.00 1258.05 mkr0u7 1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed ### All completed! 
# cat and lift the results into single files ssh kkstore05 cd /cluster/data/cavPor3/bed/genscan sort -k1,1 -k4.4n gtf/c_*.gtf > genscan_full.gtf sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt_full.bed cat pep/c_*.pep > genscan_full.pep # Load into the database as so: ssh hgwdev cd /cluster/data/cavPor3/bed/genscan ldHgGene cavPor3 -gtf genscan genscan_full.gtf # Read 70161 transcripts in 453022 lines in 1 files # 70161 groups 2185 seqs 1 sources 1 feature types # 70161 gene predictions hgPepPred cavPor3 generic genscanPep genscan_full.pep hgLoadBed cavPor3 genscanSubopt genscanSubopt_full.bed # Loaded 603786 elements of size 6 # let's check the numbers time nice -n +19 featureBits cavPor3 genscan # 82960328 bases of 2663369733 (3.115%) in intersection ######################################################################### # CPGISLANDS (DONE - 2008-04-10 - Tim) ssh hgwdev mkdir /cluster/data/cavPor3/bed/cpgIsland cd /cluster/data/cavPor3/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # There was a problem in here, in both cpg.c and cpg_lh.c: # cpg_lh.c:74: warning: conflicting types for built-in function 'malloc' # warning: conflicting types for built-in function 'malloc' # commented out line 74 ONLY to get this to build # gcc readseq.c cpg_lh.c -o cpglh.exe cd ../.. ln -s hg3rdParty/cpgIslands/cpglh.exe . # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
mkdir results echo ../../hardMasked/*.hard.fa | sed -e "s/ /\n/g" | while read F do FA=${F/*\/} C=${FA/.hard.fa/} echo "./cpglh.exe ${FA} > results/${C}.cpg" nice -n +19 ./cpglh.exe ${F} > results/${C}.cpg done > cpglh.out 2>&1 & # about 5 minutes # Transform cpglh output to bed + cat << \_EOF_ > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } _EOF_ # << happy emacs catDir results | awk -f filter.awk | sort -k1,1 -k2,2n > cpgIsland.bed ssh hgwdev cd /cluster/data/cavPor3/bed/cpgIsland hgLoadBed cavPor3 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Loaded 36373 elements of size 10 featureBits cavPor3 cpgIslandExt # 21746514 bases of 2663369733 (0.817%) in intersection ######################################################################### # READ before BLASTZ: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment ######################################################################### # BLASTZ/CHAIN/NET Hg18-cavPor3 (START - 2008-04-10 - Tim; DONE 2008-04-15) ssh kkstore02 # store11->kkstore02-10 screen # use a screen to manage this multi-day job mkdir /cluster/data/hg18/bed/blastzCavPor3.2008-04-10 cd /cluster/data/hg18/bed/ ln -s blastzCavPor3.2008-04-10 blastz.cavPor3 cd blastzCavPor3.2008-04-10 cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ # << this line keeps emacs coloring happy # NOTE: be sure to ls the data in above script on workhorse machine before 
starting script time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 # ps -ef | grep blastzCavPor3 # real 1320m35.523s # HOWEVER, doBlastzChainNet failed because the run.blastz/para.results were corrupt. cd /cluster/data/hg18/bed/blastzCavPor3.2008-04-10/run.blastz para recover jobList jobListRecovered # wc -l jobListRecovered -> 9 para create jobListRecovered para push # completed all jobs. Now: ssh kkstore02 screen -r -d time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -syntenicNet > do.log 2>&1 & #real 210m38.768s # low because of the restart that had to occur tail do.log # *** All done! # *** Make sure that goldenPath/hg18/vsCavPor3/README.txt is accurate. # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary. cat fb.hg18.chainCavPor3Link.txt # 1267036494 bases of 2881515245 (43.971%) in intersection ######### Change locations in DEF due to Hiram's new methods cd /cluster/data/hg18/bed/blastzCavPor3.2008-04-10 cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ mkdir /cluster/data/cavPor3/bed/blastz.hg18.swap ssh kkstore05 # where cavPor3 is located screen cd /cluster/data/cavPor3/bed/blastz.hg18.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/hg18/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > do.log 2>&1 # real 216m13.808s # chainSplit chain cavPor3.hg18.all.chain.gz # sh: pipe error: Too many open files # Can't 
open chain/scaffold_621.chain to append: Too many open files # broken down during netSynteny.csh due to too many open files on # a chainSplit # However, there is no need to split when we have scaffolds. # Kate fixes doBlastzChainNet.pl and retry: cd /cluster/data/cavPor3/bed/blastz.hg18.swap time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/hg18/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=syntenicNet -syntenicNet > do.log 2>&1 & # real 29m49.617s # *** All done! # *** Make sure that (/usr/local/apache/htdocs/goldenPath/)cavPor3/vsHg18/README.txt is accurate. # *** Add {chain,net}Hg18 tracks to trackDb.ra if necessary. cat fb.cavPor3.chainHg18Link.txt # 1281925834 bases of 2663369733 (48.132%) in intersection ######################################################################### # BLASTZ/CHAIN/NET mm9-cavPor3 (START - 2008-04-10 - DONE 2008-04-14 - Tim) ssh kkstore06 # store4->kkstore04-10 screen # use a screen to manage this multi-day job mkdir /cluster/data/mm9/bed/blastzCavPor3.2008-04-10 cd /cluster/data/mm9/bed/ ln -s blastzCavPor3.2008-04-10 blastz.cavPor3 cd blastzCavPor3.2008-04-10 cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/cluster/bluearc/scratch/data/mm9/nib SEQ1_LEN=/cluster/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ # << this line keeps emacs coloring happy # NOTE: be sure to ls the data in above script on workhorse machine before starting script time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 # ps -ef | grep 
blastzCavPor3 # real 1764m55.155s tail do.log # *** All done! # *** Make sure that goldenPath/mm9/vsCavPor3/README.txt is accurate. # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary. # blastz: ranOk: 52984 cat fb.mm9.chainCavPor3Link.txt # 757283793 bases of 2620346127 (28.900%) in intersection cd /cluster/data/mm9/bed/blastzCavPor3.2008-04-10 ######### Change locations in DEF due to Hiram's new methods cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ mkdir /cluster/data/cavPor3/bed/blastz.mm9.swap cd /cluster/data/cavPor3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > do.log 2>&1 & # real 166m53.671s # Exit 25 time nice -n +19 doBlastzChainNet.pl /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF -bigC # Can't open chain/scaffold_795.chain to append: Too many open files # gzip: stdout: Broken pipe # Command failed: # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastz.mm9.swap/axtChain/netSynteny.csh # broken down during netSynteny.csh due to too many open files on # a chainSplit # However, there is no need to split when we have scaffolds. # Kate fixes doBlastzChainNet.pl and retry: cd /cluster/data/cavPor3/bed/blastz.mm9.swap time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=syntenicNet -syntenicNet > syn.log 2>&1 & # real 24m37.561s # *** All done! 
# *** Make sure that goldenPath/cavPor3/vsMm9/README.txt is accurate. # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary. cat fb.cavPor3.chainMm9Link.txt # 781173609 bases of 2663369733 (29.330%) in intersection ######################################################################### # BLASTZ/CHAIN/NET galGal3-cavPor3 (START - 2008-04-10; DONE: 2008-04-15 - Tim) ssh kkstore03 # store6->kkstore03-10 screen # use a screen to manage this multi-day job mkdir /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10 cd /cluster/data/galGal3/bed/ ln -s blastzCavPor3.2008-04-10 blastz.cavPor3 cd blastzCavPor3.2008-04-10 cat << \_EOF_ > DEF BLASTZ_M=50 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/cluster/bluearc/scratch/data/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ # << this line keeps emacs coloring happy # NOTE: be sure to ls the data in above script on workhorse machine before starting script time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -syntenicNet > do.log 2>&1 # ps -ef | grep blastzCavPor3 # real 1507m5.132s tail do.log # *** All done! # *** Make sure that goldenPath/galGal3/vsCavPor3/README.txt is accurate. # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary. 
cat fb.galGal3.chainCavPor3Link.txt # 106239838 bases of 1042591351 (10.190%) in intersection cd /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10 ######### Change locations in DEF due to Hiram's new methods cat << \_EOF_ > DEF BLASTZ_M=50 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/scratch/data/galGal3/nib SEQ1_LEN=/scratch/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ mkdir /cluster/data/cavPor3/bed/blastz.galGal3.swap cd /cluster/data/cavPor3/bed/blastz.galGal3.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap > do.log 2>&1 # real 23m19.970s # *** All done! # *** Make sure that goldenPath/cavPor3/vsGalGal3/README.txt is accurate. # *** Add {chain,net}GalGal3 tracks to trackDb.ra if necessary. 
cat fb.cavPor3.chainGalGal3Link.txt # 144795360 bases of 2663369733 (5.437%) in intersection ######################################################################### # BLASTZ/CHAIN/NET cavPor3-oryCun1 (RESTART - 2008-04-11 - DONE - 2008-04-22 Tim) ssh kkstore05 # store12->kkstore05 screen # use a screen to manage this multi-day job mkdir /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11 cd /cluster/data/cavPor3/bed/ ln -s blastzOryCun1.2008-04-11 blastz.oryCun1 cd blastzOryCun1.2008-04-11 cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: GuineaPig cavPor3 SEQ1_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit SEQ1_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes SEQ1_LIMIT=300 SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rabbit oryCun1 SEQ2_DIR=/cluster/bluearc/scratch/data/oryCun1/oryCun1.2bit SEQ2_LEN=/cluster/bluearc/scratch/data/oryCun1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11 TMPDIR=/scratch/tmp _EOF_ # << this line keeps emacs coloring happy # NOTE: be sure to ls the data in above script on workhorse machine before starting script time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=partition > do.log 2>&1 & ## used stop, then `wc -l run.blastz/cavPor3.lst` * `wc -l run.blastz/oryCun1.lst` ## to find out size based upon DEF SEQn_CHUNK and SEQn_LIMIT then: time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=blastz > do.log 2>&1 & # ps -ef | grep blastzCavPor3 #real 2818m19.825s tail do.log # Loading 22976113 chains into cavPor3.chainOryCun1 # Can't start query: # load data local infile 'link.tab' into table chainOryCun1Link # mySQL error 1114: The table 'chainOryCun1Link' is full # Command failed: # ssh -x hgwdev nice /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain/loadUp.csh # mysql> 
select count(*) from chainOryCun1Link; > 119,306,828 mm9-oryCun1 has ~40m spread out among ~20 chr tbls # featureBits cavPor3 chainOryCun1Link # 439282179 bases of 2663369733 (16.493%) in intersection # Options: # 1) combine small scaffolds into a scaffoldUn # Have to restart all blastzs Awkward since not clear dividing line between scaffold/superscaffold # 2) Do not have the raw chains available at all. Just have reciprocal best # 3) Go back to the idea of greater masking. ## [tdreszer@hgwdev /cluster/data/cavPor3] textHistogram -col=2 -binSize=40000000 chrom.sizes ## 0 ************************************************************ 3126 ## 40000000 16 ## 80000000 2 ## [tdreszer@hgwdev /cluster/data/cavPor3] textHistogram -col=2 -binSize=10000000 chrom.sizes ## 0 ************************************************************ 3079 ## 10000000 * 26 ## 20000000 14 ## 30000000 7 ## 40000000 5 ## 50000000 4 ## 60000000 5 ## 70000000 2 ## 80000000 2 ##################### Hiram recommends loading all 210 mil anyway with special table create to override row limit ################################################################### # this failed during the load because the chainLink table became # too large. These were loaded manually with an sql statement to # start the table definition, then a load data local infile using # the .tab files left over from the failed load. 
Note the extra # definitions on the chainOryCun1Link table time nice -n +19 hgsql -e \ "DROP TABLE chainOryCun1Link;" cavPor3 & CREATE TABLE chainOryCun1Link ( bin smallint(5) unsigned NOT NULL default 0, tName varchar(255) NOT NULL default '', tStart int(10) unsigned NOT NULL default 0, tEnd int(10) unsigned NOT NULL default 0, qStart int(10) unsigned NOT NULL default 0, chainId int(10) unsigned NOT NULL default 0, KEY tName (tName(16),bin), KEY chainId (chainId) ) ENGINE=MyISAM max_rows=220000000 avg_row_length=55 pack_keys=1 CHARSET=latin1; time nice -n +19 hgsql -e \ "load data local infile \"link.tab\" into table chainOryCun1Link;" cavPor3 # this one took a number of hours # real 272m44.943s # finish the nets and load # Add gap/repeat stats to the net file using database tables: cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain netClass -verbose=0 -noAr noClass.net cavPor3 oryCun1 cavPor3.oryCun1.net # Load nets: time nice -n +19 netFilter -minGap=10 cavPor3.oryCun1.net \ | hgLoadNet -verbose=0 cavPor3 netOryCun1 stdin > netFilter.out 2>&1 & # real 6m27.050s cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11 time nice -n +19 featureBits cavPor3 chainOryCun1Link >&fb.cavPor3.chainOryCun1Link.txt & #real 61m59.509s cat fb.cavPor3.chainOryCun1Link.txt # 752079320 bases of 2663369733 (28.238%) in intersection ################################################################### ### doReciprocalbest.pl look in mm9.txt with oryCun1 # create reciprocal best chains/nets ssh hgwdev cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \ > rbest.log 2>&1 & # failed due to experiments dir being in the way cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain mv experiments old.experiments time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \ > rbestResume.log 2>&1 & # hung for some unknown reason. 
Starting again time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \ >> rbestAgain.log 2>&1 & # real 37m33.221s # HgStepManager: executing step 'download' Fri Apr 18 10:33:31 2008. # download: output of previous step recipBest, /usr/local/apache/htdocs/goldenPath/cavPor3/vsOryCun1 , is required but does not appear to exist. # If it actually does exist, then this error is probably due to network/filesystem delays -- wait a minute and restart with -continue download. # If it really doesn't exist, either fix things manually or try -continue recipBest time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \ -continue=recipBest >> rbestAgain.log 2>&1 & # Okay, the part that I am missing is that I didn't finish doBlastz with downloads! ssh kkstore05 screen cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download > doResumeAtDownload.log 2>&1 & # real 3m21.480s # *** All done! # *** Make sure that goldenPath/cavPor3/vsOryCun1/README.txt is accurate. # *** Add {chain,net}OryCun1 tracks to trackDb.ra if necessary. time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \ -continue=download >> rbestAgain.log 2>&1 & # real 0m0.418s # *** All done! 
# *** Steps were performed in /cluster/data/cavPor3/bed/blastz.oryCun1 # real 611m17.901s cat fb.cavPor3.chainOryCun1Link.txt # 752079320 bases of 2663369733 (28.238%) in intersection ######### Change locations in DEF due to Hiram's new methods cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11 cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: GuineaPig cavPor3 SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ1_LEN=/scratch/data/cavPor3/chrom.sizes SEQ1_LIMIT=300 SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rabbit oryCun1 SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11 TMPDIR=/scratch/tmp _EOF_ mkdir /cluster/data/oryCun1/bed/blastz.cavPor3.swap cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap screen time nice -n +19 doBlastzChainNet.pl \ /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap > do.log 2>&1 # # real 492m11.337s # load data local infile 'link.tab' into table chainCavPor3Link # mySQL error 1114: The table 'chainCavPor3Link' is full # Command failed: # ssh -x hgwdev nice /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain/loadUp.csh # wc -l link.tab 210745564 ################################################################### # Manual load again ssh hgwdev cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain hgsql oryCun1 DROP TABLE chainCavPor3Link; CREATE TABLE chainCavPor3Link ( bin smallint(5) unsigned NOT NULL default 0, tName varchar(255) NOT NULL default '', tStart int(10) unsigned NOT NULL default 0, tEnd int(10) unsigned NOT NULL default 0, qStart int(10) unsigned NOT NULL default 0, chainId int(10) unsigned NOT NULL default 0, KEY tName (tName(16),bin), KEY chainId (chainId) ) ENGINE=MyISAM max_rows=212000000 avg_row_length=55 pack_keys=1 CHARSET=latin1; # exit screen time nice 
-n +19 hgsql -e \ "load data local infile \"link.tab\" into table chainCavPor3Link;" oryCun1 # real 360m19.577s # mysql> select count(*) from chainCavPor3Link; | 210745564 | # finish the nets and load # Still on hgwdev because of sql screen time nice -n +19 /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain/loadUpResume.csh >> doFinishLoad.log 2>&1 # real 991m52.532s # cat fb.oryCun1.chainCavPor3Link.txt # 729628282 bases of 2076044328 (35.145%) in intersection # Now continue -swap at download ssh kkstore04 # oryCun1 -> store09 -> kkstore04 cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap screen time nice -n +19 doBlastzChainNet.pl \ /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download >> doResumeAtDownload.log 2>&1 # real 2m2.757s # *** All done! # *** Make sure that goldenPath/oryCun1/vsCavPor3/README.txt is accurate. # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary. ######################################################################### # BLASTZ/CHAIN/NET rn4-cavPor3 (STARTED - 2008-04-14; DONE 04-15-2008 - Tim) ssh kkstore06 # rat on store3->kkstore06 screen # use a screen to manage this multi-day job mkdir /cluster/data/rn4/bed/blastzCavPor3.2008-04-14 cd /cluster/data/rn4/bed/ ln -s blastzCavPor3.2008-04-14 blastz.cavPor3 cd blastzCavPor3.2008-04-14 cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Rat Rn4 SEQ1_DIR=/cluster/bluearc/scratch/data/rn4/nib SEQ1_LEN=/cluster/bluearc/scratch/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastzCavPor3.2008-04-14 TMPDIR=/scratch/tmp _EOF_ # << this line keeps emacs coloring happy # NOTE: be sure to ls the data in above script on 
workhorse machine before starting script time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 & # ps -ef | grep blastzCavPor3 # real 611m17.901s cat fb.rn4.chainCavPor3Link.txt # 716379861 bases of 2571531505 (27.858%) in intersection ######### Change locations in DEF due to Hiram's new methods cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Rat Rn4 SEQ1_DIR=/scratch/data/rn4/nib SEQ1_LEN=/scratch/data/rn4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/rn4/bed/blastzCavPor3.2008-04-14 TMPDIR=/scratch/tmp _EOF_ mkdir /cluster/data/cavPor3/bed/blastz.rn4.swap cd /cluster/data/cavPor3/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/rn4/bed/blastzCavPor3.2008-04-14/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > do.log 2>&1 # real 103m34.523s # broken down during netSynteny.csh due to too many open files on # a chainSplit # However, there is no need to split when we have scaffolds. # Kate fixes doBlastzChainNet.pl and retry: cd /cluster/data/cavPor3/bed/blastz.rn4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/rn4/bed/blastzCavPor3.2008-04-14/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=syntenicNet -syntenicNet > doSyn.log 2>&1 & # real 14m49.396s # *** All done! # *** Make sure that goldenPath/cavPor3/vsRn4/README.txt is accurate. # *** Add {chain,net}Rn4 tracks to trackDb.ra if necessary. 
cat fb.cavPor3.chainRn4Link.txt # 735147548 bases of 2663369733 (27.602%) in intersection ########################################################################### # BLASTZ/CHAIN/NET monDom4-cavPor3 (START - 2008-04-11 DONE 2008-04-16 - Tim) ssh kkstore04 # monDom on store9 -> kkstore04 screen # use screen to control this job mkdir /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11 cd /cluster/data/monDom4/bed/ ln -s blastzCavPor3.2008-04-11 blastz.cavPor3 cd blastzCavPor3.2008-04-11 cat << \_EOF_ > DEF # opossum vs guineaPigs # Use "mammal-fish" params even though this is mammal-mammal... # pretty distant, hopefully not too many shared undermasked repeats. #BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 -stop=partition \ -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 & ## used stop, then `wc -l run.blastz/cavPor3.lst` * `wc -l run.blastz/monDom4.lst` ## to find out size based upon DEF SEQn_CHUNK and SEQn_LIMIT then: time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 -continue=blastz \ -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 & # ps -ef | grep blastzCavPor3 # real 1715m0.868s # tail do.log # updated job database on disk # Batch failed after 4 tries on /cluster/bin/scripts/blastz-run-ucsc -outFormat psl /san/sanvol1/scratch/monDom4/nib/chr5.nib:chr5:20000000-30010000 qParts/part042.lst ../DEF 
../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl # Command failed: # ssh -x pk nice /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/run.blastz/doClusterRun.csh para-eta time # Completed: 65513 of 65514 jobs # Crashed: 1 jobs # CPU time in finished jobs: 9807564s 163459.40m 2724.32h 113.51d 0.311 y # IO & Wait Time: 373385s 6223.08m 103.72h 4.32d 0.012 y # Average job time: 155s 2.59m 0.04h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2413s 40.22m 0.67h 0.03d # Submission to last job: 103236s 1720.60m 28.68h 1.19d #### One for automation ???????????: para problems ## start time: Mon Apr 14 14:01:19 2008 ## return: 9 ## ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl does not exist ## stderr: ## ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl already exists ## ls ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl ## ls: ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl: No such file or directory ## ls ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl ## ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl ########### Case sensitive path name!!! "Psl" vs "psl" ssh kkstore04 screen # use screen to control this job cd /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11 ######### Change locations in DEF due to Hiram's new methods cat << \_EOF_ > DEF # opossum vs guineaPigs # Use "mammal-fish" params even though this is mammal-mammal... # pretty distant, hopefully not too many shared undermasked repeats. 
#BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib #SEQ1_DIR=/scratch/data/monDom4/monDom.2bit SEQ1_LEN=/scratch/data/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11 TMPDIR=/scratch/tmp _EOF_ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 -continue=cat \ -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 & # ps -ef | grep blastzCavPor3 # real 232m24.348s # Done time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 -verbose=2 -continue=cat - cat fb.monDom4.chainCavPor3Link.txt # 334067222 bases of 3501643220 (9.540%) in intersection mkdir /cluster/data/cavPor3/bed/blastz.monDom4.swap cd /cluster/data/cavPor3/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -swap -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 # real 238m31.843s # Loading 18041915 chains into cavPor3.chainMonDom4 # Can't start query: # load data local infile 'link.tab' into table chainMonDom4Link # mySQL error 1114: The table 'chainMonDom4Link' is full # select count(*) from chainMonDom4Link; => 119,311,098 # wc -l link.tab => 202,324,406 link.tab ################################################################### # this failed during the load because the chainLink table became # too large. These were loaded manually with an sql statement to # start the table definition, then a load data local infile using # the .tab files left over from the failed load. 
Note the extra # definitions on the chainMonDom4Link table time nice -n +19 hgsql -e \ "DROP TABLE chainMonDom4Link;" cavPor3 & CREATE TABLE chainMonDom4Link ( bin smallint(5) unsigned NOT NULL default 0, tName varchar(255) NOT NULL default '', tStart int(10) unsigned NOT NULL default 0, tEnd int(10) unsigned NOT NULL default 0, qStart int(10) unsigned NOT NULL default 0, chainId int(10) unsigned NOT NULL default 0, KEY tName (tName(16),bin), KEY chainId (chainId) ) ENGINE=MyISAM max_rows=210000000 avg_row_length=55 pack_keys=1 CHARSET=latin1; time nice -n +19 hgsql -e \ "load data local infile \"link.tab\" into table chainMonDom4Link;" cavPor3 # this one took a number of hours # real 272m44.943s # finish the nets and load # Add gap/repeat stats to the net file using database tables: cd /cluster/data/cavPor3/bed/blastz.monDom4.swap/axtChain # copied loadUp.csh to loadUpResume.csh and commented out lines already done time nice -n +19 ./loadUpResume.csh > loadUpResume.out 2>&1 #real 30m47.809s # real 106m17.385s cat fb.cavPor3.chainMonDom4Link.txt # 382771802 bases of 2663369733 (14.372%) in intersection ssh kkstore05 screen cd /cluster/data/cavPor3/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -swap -chainLinearGap=loose -bigClusterHub=pk -continue=download > doDowload.log 2>&1 # real 0m36.604s # *** All done! # *** Make sure that goldenPath/cavPor3/vsMonDom4/README.txt is accurate. # *** Add {chain,net}MonDom4 tracks to trackDb.ra if necessary. 
############################################################################ # FINAL BLASTZS STEPS: # Be sure to add chainCavPor3.html,netCavPor3.html to makeDb/trackDb and update makeDb/trackDb/trackDb.ra # Any orgs with non-standard priority update {org}/priority.ra # Any orgs with differnent BLASTZ_Q=HoxD55.q will need chain{DB}.html and trackDb.ra at assembly level #Modified (possibly merged) files: # trackDb/trackDb.ra #Files/directories not checked in to CVS that look like source: # doc/cavPor3.txt # trackDb/chainOryCun1.html # trackDb/netOryCun1.html ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-04-17 - Tim) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("cavPor3", "blat7", "17780", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("cavPor3", "blat7", "17781", "0", "1");' \ hgcentraltest # test it with some sequence # Can't open /gbdb/cavPor3/choHof1.2bit to read: No such file or directory ## Doh! I entered in the wrong blat server the first time! ############################################################################# # Downloads (DONE - 2008-04-23 - Tim) # Let's see if the downloads will work ssh hgwdev /cluster/data/cavPor3 # expecting to find repeat masker .out file here: ln -s bed/repeatMasker/cavPor3.fa.out . time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev cavPor3 > jkStuff/downloads.log 2>&1 # failed due to link to RepeatMasker instead of repeatMasker. Fixed and restarted. # *** All done! # *** Please take a look at the downloads for cavPor3 using a web browser. # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/cavPor3/goldenPath/database/README.txt # /cluster/data/cavPor3/goldenPath/bigZips/README.txt # (The htdocs/goldenPath/cavPor3/*/README.txt "files" are just links to those.) 
# *** If you have to make any edits that would always apply to future # assemblies from the same sequencing center, please edit them into # ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help). # the downloads are located at: http://hgwdev.soe.ucsc.edu/goldenPath/cavPor3/ mv goldenPath old.goldenPath mv jkStuff/downloads.log jkStuff/old.downloads.log time nice -n +19 /cluster/home/tdreszer/kent/src/hg/utils/automation/makeDownloads.pl \ -workhorse=hgwdev cavPor3 >> jkStuff/downloads.log 2>&1 # real 24m23.427s # *** Please take a look at the downloads for cavPor3 using a web browser. # *** The downloads directory is: /usr/local/apache/htdocs/goldenPath/cavPor3. # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/cavPor3/goldenPath/database/README.txt # /cluster/data/cavPor3/goldenPath/bigZips/README.txt # (The htdocs/goldenPath/cavPor3/*/README.txt "files" are just links to those.) # *** If you have to make any edits that would always apply to future # assemblies from the same sequencing center, please edit them into # ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help). ############################################################################# # PushQ entries (DONE - 2008-04-24 - Tim) ssh hgwdev /cluster/data/cavPor3 /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql # output warnings: # hgwdev does not have /usr/local/apache/htdocs/goldenPath/cavPor3/liftOver/cavPor3ToCavPor* ### is it worth it? NO: cavPor2 was never released. # cavPor3 does not have seq # cavPor3 does not have extFile ### Not needed because output is from multiz, visiGene, affyProbes, cloneend, stsMarker, nibbImageProbes ### Kate suggests updating makePushQsql to require gbSeq and gbExtFile instead, but it already does. # # *** All done! # *** Please edit the output to ensure correctness before using. # *** 1. Resolve any warnings output by this script. # *** 2. Remove any entries which should not be pushed. 
### All accounted for but: genscanSubopt which is not in pushQ.sql but is in # *** 3. Add tables associated with the main track table (e.g. *Pep tables # for gene prediction tracks). # *** 4. Add files associated with tracks. First, look at the results # of this query: # hgsql cavPor3 -e 'select distinct(path) from extFile' ### Doesn't exist # Then, look at file(s) named in each of the following wiggle tables: # hgsql cavPor3 -e 'select distinct(file) from gc5Base' ### | /gbdb/cavPor3/wib/gc5Base.wib | # hgsql cavPor3 -e 'select distinct(file) from quality' ### | /gbdb/cavPor3/wib/qual.wib | # Files go in the second field after tables (it's tables, cgis, files). # *** 5. This script currently does not recognize composite tracks. If cavPor3 # has any composite tracks, you should manually merge the separate # per-table entries into one entry. ### Doesn't apply to cavPor3 at this time. # *** 6. Just before executing the sql, note the ID of the most recent entry # in the Main Push Queue. If the ID (first column) of the last # INSERT statement is not 1 greater than the most recent entry's, # make it so to avoid an ID clash with an existing entry. ### Updated to 4283 # *** When everything looks complete and correct, use hgsql on the qapushq # machine (currently hgwbeta) to execute the sql file. (Make sure that # qapushq does not already have a table named cavPor3.) Then use the Push # Queue web interface to check the contents of all entries. ssh hgwbeta cd /cluster/data/cavPor3/jkStuff hgsql qapushq < pushQ.sql ### All is there # *** If you haven't already, please add cavPor3 to makeDb/schema/all.joiner ! # It should be in both $gbd and $chainDest. ### Already there # *** When cavPor3 is on the RR (congrats!), please doBlastz -swap if you haven't # already. 
########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-09) ssh kkstore05 # bash if not using bash shell already mkdir /cluster/data/cavPor3/blastDb cd /cluster/data/cavPor3 awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst cavPor3.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst cavPor3.unmasked.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 3401 mkdir -p /san/sanvol1/scratch/cavPor3/blastDb cd /cluster/data/cavPor3/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/cavPor3/blastDb done mkdir -p /cluster/data/cavPor3/bed/tblastn.hg18KG cd /cluster/data/cavPor3/bed/tblastn.hg18KG echo /san/sanvol1/scratch/cavPor3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3401 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/3401) = 356.881506 mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa split -l 357 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/cavPor3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/cavPor3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. 
para time # Completed: 350303 of 350303 jobs # CPU time in finished jobs: 25825855s 430430.91m 7173.85h 298.91d 0.819 y # IO & Wait Time: 2396715s 39945.25m 665.75h 27.74d 0.076 y # Average job time: 81s 1.34m 0.02h 0.00d # Longest finished job: 339s 5.65m 0.09h 0.00d # Submission to last job: 80595s 1343.25m 22.39h 0.93d ssh kkstore05 cd /cluster/data/cavPor3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 99 of 103 jobs # Crashed: 4 jobs # CPU time in finished jobs: 15940s 265.67m 4.43h 0.18d 0.001 y # IO & Wait Time: 397633s 6627.21m 110.45h 4.60d 0.013 y # Average job time: 4178s 69.63m 1.16h 0.05d # Longest finished job: 5502s 91.70m 1.53h 0.06d # Submission to last job: 7032s 117.20m 1.95h 0.08d # ran 4 crashed jobs on memk. Completed: 4 of 4 jobs CPU time in finished jobs: 1673s 27.88m 0.46h 0.02d 0.000 y IO & Wait Time: 529s 8.81m 0.15h 0.01d 0.000 y Average job time: 550s 9.17m 0.15h 0.01d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 712s 11.87m 0.20h 0.01d Submission to last job: 1299s 21.65m 0.36h 0.02d ssh kkstore05 cd /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut for i in kg?? 
do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/cavPor3/bed/tblastn.hg18KG/blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/cavPor3/bed/tblastn.hg18KG hgLoadPsl cavPor3 blastHg18KG.psl # check coverage featureBits cavPor3 blastHg18KG # 35569442 bases of 2663369733 (1.336%) in intersection featureBits cavPor3 all_mrna blastHg18KG -enrichment # all_mrna 0.023%, blastHg18KG 1.336%, both 0.015%, cover 66.85%, enrich 50.05x ssh kkstore05 rm -rf /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut #end tblastn ############################### Make editor happy: _EOF_ ######################################################################### ## 7-Way Multiz (STARTED - 2008-04-25 - Tim) ## # From Jim: A minimal set would be human/mouse/rabbit/chicken. # Add rat and possum if you feel like turning the crank a little more. # the all.chain.gz files were split up via kluster jobs on memk # in order to get mafSynNet files. 
Example above in ornAna1 blastz ssh hgwdev mkdir /cluster/data/cavPor3/bed/multiz7way cd /cluster/data/cavPor3/bed/multiz7way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get cavPor3 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram # select the 7 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Rat_rn4,GuineaPig_cavPor2,Rabbit_oryCun1,Opossum_monDom4,Chicken_galGal3 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ | sed -e "s/cavPor2/cavPor3/g" > 7-way.fullNames.nh # looks something like this: # ((((((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,GuineaPig_cavPor3:0.202990):0.034350,Rabbit_oryCun1:0.208548):0.014587 # ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); # rearrange to get guineaPig at the top: # this leaves us with: cat << _EOF_ > cavPor3.7-way.nh (((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); _EOF_ # << happy emacs # verify all blastz's exists cat << \_EOF_ > listMafs.csh #!/bin/csh -fe cd /cluster/data/cavPor3/bed/multiz7way foreach db (`grep -v cavPor3 species.list`) set bdir = /cluster/data/cavPor3/bed/blastz.$db if (-e $bdir/mafRBestNet/cavPor3.$db.rbest.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/cavPor3.$db.syn.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/cavPor3.$db.net.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end _EOF_ # << happy emacs chmod +x ./listMafs.csh # see what it says: ./listMafs.csh # galGal3 mafNet # hg18 mafNet # mm9 mafNet # monDom4 mafNet # oryCun1 mafRBestNet # rn4 mafNet /cluster/bin/phast/all_dists cavPor3.7-way.nh > 
7way.distances.txt # ERROR: Can't parse distance in tree ("0.014587 #"). #[hgwdev:tdreszer multiz7way> m cavPor3.7-way.nh # (((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587 # ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); # Problem is that tree should not contain 'n' so re-echo and retry m 7way.distances.txt # GuineaPig_cavPor3 Mouse_mm9 0.479871 # GuineaPig_cavPor3 Rat_rn4 0.487980 # GuineaPig_cavPor3 Rabbit_oryCun1 0.445888 # GuineaPig_cavPor3 Human_hg18 0.378828 # GuineaPig_cavPor3 Opossum_monDom4 0.835961 # GuineaPig_cavPor3 Chicken_galGal3 1.211508 # Mouse_mm9 Rat_rn4 0.160657 # Mouse_mm9 Rabbit_oryCun1 0.519779 # Mouse_mm9 Human_hg18 0.452719 # Mouse_mm9 Opossum_monDom4 0.909852 # Mouse_mm9 Chicken_galGal3 1.285399 # Rat_rn4 Rabbit_oryCun1 0.527888 # Rat_rn4 Human_hg18 0.460828 # Rat_rn4 Opossum_monDom4 0.917961 # Rat_rn4 Chicken_galGal3 1.293508 # Rabbit_oryCun1 Human_hg18 0.350036 # Rabbit_oryCun1 Opossum_monDom4 0.807169 # Rabbit_oryCun1 Chicken_galGal3 1.182716 # Human_hg18 Opossum_monDom4 0.710935 # Human_hg18 Chicken_galGal3 1.086482 # Opossum_monDom4 Chicken_galGal3 1.016989 # (total) - 2.228942 grep -i cavPor 7way.distances.txt | sort -k3,3n # GuineaPig_cavPor3 Human_hg18 0.378828 # GuineaPig_cavPor3 Rabbit_oryCun1 0.445888 # GuineaPig_cavPor3 Mouse_mm9 0.479871 # GuineaPig_cavPor3 Rat_rn4 0.487980 # GuineaPig_cavPor3 Opossum_monDom4 0.835961 # GuineaPig_cavPor3 Chicken_galGal3 1.211508 # Note that guineaPig is closer to human than to other rodents. # This may be reasonable, since the speed of evolution (length of tree limbs) # in rodents is greater than for primates. For instance mm9 is closer to hg18 # than it is to either cavPor2 or oryCun1 # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. 
# And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainCavPor3Link chain linearGap # distance on cavPor3 on other minScore # 0.479871 Mouse_mm9 (28.900%) (29.330%) 3000 medium # 0.487980 Rat_rn4 (27.858%) (27.602%) 3000 medium # 0.445888 Rabbit_oryCun1 (35.145%) (28.238%) 3000 medium # 0.378828 Human_hg18 (43.971%) (48.132%) 3000 medium # 0.835961 Opossum_monDom4 (9.540%) (14.372%) 5000 loose # 1.211508 Chicken_galGal3 (10.190%) (5.437%) 5000 loose ### ### ### ### Be sure to use calJac as an example because of scaffolds ### ### ### ### will require making maf files by scaffold name # create a coherent set of all the mafs involved in this run mkdir mafLinks cd mafLinks ln -s ../../blastz.hg18/mafNet ./hg18 ln -s ../../blastz.mm9/mafNet ./mm9 ln -s ../../blastz.rn4/mafNet ./rn4 ln -s ../../blastz.monDom4/mafNet ./monDom4 ln -s ../../blastz.galGal3/mafNet ./galGal3 ln -s ../../blastz.oryCun1/mafRBestNet ./oryCun1 # check data size: du -hscL * # 100M galGal3 # 930M hg18 # 577M mm9 # 268M monDom4 # 495M oryCun1 # 543M rn4 # 2.9G total # need to split these things up by Contig number for efficient kluster run ssh kkstore06 mkdir -p /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf cd /scratch/tmp # the 16201 is from petMar/chrom.sizes echo "chrM 0 16801" > chrM.bed for D in `grep -v cavPor3 /cluster/data/cavPor3/bed/multiz7way/species.list` do echo ${D} zcat \ /cluster/data/cavPor3/bed/multiz7way/mafLinks/cavPor3.${D}.*.maf.gz \ > ${D}.maf mkdir /scratch/tmp/${D} cd /scratch/tmp/${D} mafSplit -verbose=2 /dev/null -byTarget -useHashedName=10 Contig \ ../${D}.maf -outDirDepth=2 mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf rsync -a --progress ./ \ /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf/${D} cd /scratch/tmp rm -fr ${D} ${D}.maf done # create a run-time list of contigs to operate on, not all contigs # exist in all alignments, but we want all contig names used 
in any # alignment: ssh kkstore05 # cavPor3 -> store12 -> kkstore05 cd /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf for D in * do cd "${D}" find . -type f cd .. done | sort -u > /tmp/7-way.contig.list wc -l /tmp/7-way.contig.list mkdir /cluster/data/cavPor3/bed/multiz7way/splitRun cp -p /tmp/7-way.contig.list \ /cluster/data/cavPor3/bed/multiz7way/splitRun # 296 /tmp/7-way.contig.list # ready for the multiz run ssh pk cd /cluster/data/cavPor3/bed/multiz7way/splitRun mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat > autoMultiz.csh << \_EOF_ #!/bin/csh -ef set db = cavPor3 set subdir = $1 set c = $2 set result = $3 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz7way/contigMaf rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.7.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$subdir/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.7.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db _EOF_ # << happy emacs chmod +x autoMultiz.csh cat << \_EOF_ > template #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz7way/splitRun/maf/$(dir1)/$(root1).maf} #ENDLOOP _EOF_ # << emacs sed -e "s/^\.\///" ../7-way.contig.list \ | gensub2 stdin single template jobList para create jobList para try ... check ... 
# Checking finished jobs # Completed: 10 of 296 jobs # CPU time in finished jobs: 4355s 72.59m 1.21h 0.05d 0.000 y # IO & Wait Time: 104s 1.73m 0.03h 0.00d 0.000 y # Average job time: 446s 7.43m 0.12h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1672s 27.87m 0.46h 0.02d # Submission to last job: 1681s 28.02m 0.47h 0.02d # Estimated complete: 0s 0.00m 0.00h 0.00d # [pk:tdreszer run> pc # 296 jobs in batch # 92505 jobs (including everybody's) in Parasol queue. # Checking finished jobs # unsubmitted jobs: 286 # ranOk: 10 # total jobs in batch: 296 para push ... check ... etc # Completed: 296 of 296 jobs # CPU time in finished jobs: 53659s 894.32m 14.91h 0.62d 0.002 y # IO & Wait Time: 1675s 27.92m 0.47h 0.02d 0.000 y # Average job time: 187s 3.12m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1969s 32.82m 0.55h 0.02d # Submission to last job: 4023s 67.05m 1.12h 0.05d # Estimated complete: 0s 0.00m 0.00h 0.00d # [pk:tdreszer run> pc # 296 jobs in batch # 84971 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # ranOk: 296 # total jobs in batch: 296 # put the split maf results back together into a single maf file # eliminate duplicate comments ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz7way mkdir togetherMaf grep "^##maf version" splitRun/maf/0/0/Contig00700.maf \ | sort -u > togetherMaf/cavPor3.7way.maf ##maf version=1 scoring=autoMZ.v1 ##maf version=1 scoring=maf_project.v12 ##maf version=1 scoring=multiz for F in `find ./splitRun/maf -type f -depth` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> togetherMaf/cavPor3.7way.maf for F in `find ./splitRun/maf -type f -depth` do grep -v -h "^#" "${F}" done >> togetherMaf/cavPor3.7way.maf grep "^##eof maf" splitRun/maf/0/0/Contig00700.maf \ | sort -u >> togetherMaf/cavPor3.7way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/cavPor3/multiz7way/maf ln -s /cluster/data/cavPor3/bed/multiz7way/togetherMaf/*.maf \ /gbdb/cavPor3/multiz7way/maf/multiz7way.maf # this generates an immense multiz7way.tab file in the directory # where it is running. Best to run this over in scratch. 
cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/cavPor3/multiz7way/maf cavPor3 multiz7way # Advisory lock created # Indexing and tabulating /gbdb/cavPor3/multiz7way/maf/multiz7way.maf # Loading multiz7way into database # Loaded 6939217 mafs in 1 files from /gbdb/cavPor3/multiz7way/maf # Advisory lock has been released # # real 7m7.421s # user 1m50.625s # sys 0m34.191s ### ### ################################################################### ### ### #### Abandoning 7-Way because I am adding Squirrel 2008-05-27 ##### ### ### ################################################################### ######################################################################### ## 8-Way Multiz (STARTED - 2008-05-27 - Tim) ## 8-Way Multiz (RESTARTED - 2008-07-02 - Tim Done: 7-11-2008) ## # the all.chain.gz files were split up via kluster jobs on memk # in order to get mafSynNet files. Example above in ornAna1 blastz ssh hgwdev mkdir /cluster/data/cavPor3/bed/multiz8way cd /cluster/data/cavPor3/bed/multiz8way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get cavPor3 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram ### Note the ground squirrel (Spermophilus tridecemlineatus) is not in the mm9 30-way ### Add it a same distance from other rodents as rabbit # select the 7 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Rat_rn4,GuineaPig_cavPor2,Rabbit_oryCun1,Opossum_monDom4,Chicken_galGal3 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ | sed -e "s/cavPor2/cavPor3/g" > 7-way.fullNames.nh # looks something like this: # ((((((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,GuineaPig_cavPor3:0.202990):0.034350,Rabbit_oryCun1:0.208548):0.014587 # 
,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); # rearrange to get guineaPig at the top: # this leaves us with: cat << _EOF_ > cavPor3.7-way.nh (((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); _EOF_ # << happy emacs # Add squirrel at same distance to guineaPig and Mouse as rabbits: # Alternative tree: ((((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.017175,Squirrel_speTri0:0.208548):0.017175,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); cat << _EOF_ > cavPor3.8-way.nh (((((GuineaPig_cavPor3:0.202990,((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.18602,Squirrel_speTri0:0.208548):0.014587):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); _EOF_ # << happy emacs cd /cluster/data/cavPor3/bed/multiz8way cp ../multiz7way/species.list . 
echo speTri0 >> species.list # verify all blastz's exists cat << \_EOF_ > listMafs.csh #!/bin/csh -fe cd /cluster/data/cavPor3/bed/multiz8way foreach db (`grep -v cavPor3 species.list`) set bdir = /cluster/data/cavPor3/bed/blastz.$db if (-e $bdir/mafRBestNet/cavPor3.$db.rbest.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/cavPor3.$db.syn.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/cavPor3.$db.net.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end _EOF_ # << happy emacs chmod +x ./listMafs.csh # see what it says: ./listMafs.csh # galGal3 mafNet # hg18 mafNet # mm9 mafNet # monDom4 mafNet # oryCun1 mafRBestNet # rn4 mafNet # speTri0 mafs not found ### After some back and forth on my own, Kate finally got the speTri0 Mafs together: # galGal3 mafNet # hg18 mafNet # mm9 mafNet # monDom4 mafNet # oryCun1 mafRBestNet # rn4 mafNet # speTri0 mafRBestNet /cluster/bin/phast/all_dists cavPor3.8-way.nh > 8way.distances.txt # GuineaPig_cavPor3 Mouse_mm9 0.479871 # GuineaPig_cavPor3 Rat_rn4 0.487980 # GuineaPig_cavPor3 Squirrel_speTri0 0.426125 # GuineaPig_cavPor3 Rabbit_oryCun1 0.445888 # GuineaPig_cavPor3 Human_hg18 0.378828 # GuineaPig_cavPor3 Opossum_monDom4 0.835961 # GuineaPig_cavPor3 Chicken_galGal3 1.211508 # Mouse_mm9 Rat_rn4 0.160657 # Mouse_mm9 Squirrel_speTri0 0.470842 # Mouse_mm9 Rabbit_oryCun1 0.519779 # Mouse_mm9 Human_hg18 0.452719 # Mouse_mm9 Opossum_monDom4 0.909852 # Mouse_mm9 Chicken_galGal3 1.285399 # Rat_rn4 Squirrel_speTri0 0.478951 # Rat_rn4 Rabbit_oryCun1 0.527888 # Rat_rn4 Human_hg18 0.460828 # Rat_rn4 Opossum_monDom4 0.917961 # Rat_rn4 Chicken_galGal3 1.293508 # Squirrel_speTri0 Rabbit_oryCun1 0.466033 # Squirrel_speTri0 Human_hg18 0.398973 # Squirrel_speTri0 Opossum_monDom4 0.856106 # Squirrel_speTri0 Chicken_galGal3 1.231653 # Rabbit_oryCun1 Human_hg18 0.350036 # Rabbit_oryCun1 Opossum_monDom4 0.807169 # Rabbit_oryCun1 Chicken_galGal3 1.182716 # Human_hg18 Opossum_monDom4 0.710935 # Human_hg18 
Chicken_galGal3 1.086482 # Opossum_monDom4 Chicken_galGal3 1.016989 # (total) - 2.437490 grep -i cavPor 8way.distances.txt | sort -k3,3n # GuineaPig_cavPor3 Human_hg18 0.378828 # GuineaPig_cavPor3 Squirrel_speTri0 0.426125 # GuineaPig_cavPor3 Rabbit_oryCun1 0.445888 # GuineaPig_cavPor3 Mouse_mm9 0.479871 # GuineaPig_cavPor3 Rat_rn4 0.487980 # GuineaPig_cavPor3 Opossum_monDom4 0.835961 # GuineaPig_cavPor3 Chicken_galGal3 1.211508 # Note that guineaPig is closer to human than to other rodents. # This may be reasonable, since the speed of evolution (length of tree limbs) # in rodents is greater than for primates. For instance mm9 is closer to hg18 # than it is to either cavPor2 or oryCun1 ## ??? time nice -n +19 featureBits cavPor3 axtChain/cavPor3.speTri0.rbest.chain.psl >& fb.cavPor3.speTri0.rbest.chain.psl.txt & # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. 
# # featureBits chainLink measures # chainCavPor3Link chain linearGap # distance on cavPor3 on other minScore # 0.479871 Mouse_mm9 (28.900%) (29.330%) 3000 medium # 0.487980 Rat_rn4 (27.858%) (27.602%) 3000 medium # 0.479871 Squirrel_speTri0 3000 medium # 0.445888 Rabbit_oryCun1 (35.145%) (28.238%) 3000 medium # 0.378828 Human_hg18 (43.971%) (48.132%) 3000 medium # 0.835961 Opossum_monDom4 (9.540%) (14.372%) 5000 loose # 1.211508 Chicken_galGal3 (10.190%) (5.437%) 5000 loose ### ### ### ### Be sure to use calJac as an example because of scaffolds ### ### ### ### will require making maf files by scaffold name # create a coherent set of all the mafs involved in this run mkdir mafLinks cd mafLinks ln -s ../../blastz.hg18/mafNet ./hg18 ln -s ../../blastz.mm9/mafNet ./mm9 ln -s ../../blastz.rn4/mafNet ./rn4 ln -s ../../blastz.monDom4/mafNet ./monDom4 ln -s ../../blastz.galGal3/mafNet ./galGal3 ln -s ../../blastz.oryCun1/mafRBestNet ./oryCun1 ln -s ../../blastz.speTri0/mafRBestNet ./speTri0 # check data size: du -hscL * # 100M galGal3 # 930M hg18 # 577M mm9 # 268M monDom4 # 495M oryCun1 # 543M rn4 # 680M speTri0 # 3.6G total # need to split these things up by Contig number for efficient kluster run ssh kkstore05 mkdir -p /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf cd /iscratch/tmp # the 16201 is from petMar/chrom.sizes echo "chrM 0 16801" > chrM.bed for D in `grep -v cavPor3 /cluster/data/cavPor3/bed/multiz8way/species.list` do echo ${D} zcat \ /cluster/data/cavPor3/bed/multiz8way/mafLinks/${D}/cavPor3.${D}.*.maf.gz \ > ${D}.maf mkdir /iscratch/tmp/${D} cd /iscratch/tmp/${D} mafSplit -verbose=2 /dev/null -byTarget -useHashedName=10 Contig \ ../${D}.maf -outDirDepth=2 mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf rsync -a --progress ./ \ /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf/${D} cd /iscratch/tmp rm -fr ${D} ${D}.maf done # create a run-time list of contigs to operate on, not all contigs # exist in all alignments, but we want all contig names used in 
any # alignment: ssh kkstore05 # cavPor3 -> store12 -> kkstore05 cd /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf for D in * do cd "${D}" find . -type f cd .. done | sort -u > /tmp/8-way.contig.list wc -l /tmp/8-way.contig.list mkdir /cluster/data/cavPor3/bed/multiz8way/splitRun cp -p /tmp/8-way.contig.list \ /cluster/data/cavPor3/bed/multiz8way/splitRun # wc -l 8-way.contig.list # 296 8-way.contig.list # ready for the multiz run ssh pk cd /cluster/data/cavPor3/bed/multiz8way/splitRun mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat > autoMultiz.csh << \_EOF_ #!/bin/csh -ef set db = cavPor3 set subdir = $1 set c = $2 set result = $3 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz8way/contigMaf rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.8.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$subdir/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.8.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db _EOF_ # << happy emacs chmod +x autoMultiz.csh cat << \_EOF_ > template #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz8way/splitRun/maf/$(dir1)/$(root1).maf} #ENDLOOP _EOF_ # << emacs sed -e "s/^\.\///" ../8-way.contig.list \ | gensub2 stdin single template jobList para create jobList para try ... check ... 
## first 20 failed because of tree.8.nh: ### Hand edited tree.8.nh from cavPor3.8-way.nh to: # (((((cavPor3 ((mm9 rn4) speTri0)) oryCun1) hg18) monDom4) galGal3) # Next 10 succeeded # 296 jobs in batch # 15118 jobs (including everybody's) in Parasol queue. # Checking finished jobs # Completed: 10 of 296 jobs # Crashed: 20 jobs # CPU time in finished jobs: 5742s 95.69m 1.59h 0.07d 0.000 y # IO & Wait Time: 76s 1.27m 0.02h 0.00d 0.000 y # Average job time: 582s 9.70m 0.16h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1482s 24.70m 0.41h 0.02d # Submission to last job: 2712s 45.20m 0.75h 0.03d # Estimated complete: 0s 0.00m 0.00h 0.00d para push; para check # 296 jobs in batch # 13908 jobs (including everybody's) in Parasol queue. # Checking finished jobs # running: 25 # ranOk: 271 # total jobs in batch: 296 # [pk:tdreszer run> pc # 296 jobs in batch # 10574 jobs (including everybody's) in Parasol queue. # Checking finished jobs # . # ranOk: 296 # total jobs in batch: 296 para -eta time # 296 jobs in batch # 10542 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # Completed: 296 of 296 jobs # CPU time in finished jobs: 81235s 1353.92m 22.57h 0.94d 0.003 y # IO & Wait Time: 11960s 199.33m 3.32h 0.14d 0.000 y # Average job time: 315s 5.25m 0.09h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3262s 54.37m 0.91h 0.04d # Submission to last job: 59090s 984.83m 16.41h 0.68d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz8way mkdir togetherMaf grep "^##maf version" splitRun/maf/0/0/Contig00700.maf \ | sort -u > togetherMaf/cavPor3.8way.maf ##maf version=1 scoring=autoMZ.v1 ##maf version=1 scoring=maf_project.v12 ##maf version=1 scoring=multiz for F in `find ./splitRun/maf -type f -depth` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> togetherMaf/cavPor3.8way.maf for F in `find ./splitRun/maf -type f -depth` do grep -v -h "^#" "${F}" done >> togetherMaf/cavPor3.8way.maf grep "^##eof maf" splitRun/maf/0/0/Contig00700.maf \ | sort -u >> togetherMaf/cavPor3.8way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/cavPor3/multiz8way/maf ln -s /cluster/data/cavPor3/bed/multiz8way/togetherMaf/*.maf \ /gbdb/cavPor3/multiz8way/maf/multiz8way.maf # this generates an immense multiz8way.tab file in the directory # where it is running. Best to run this over in scratch. 
cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/cavPor3/multiz8way/maf cavPor3 multiz8way & # Advisory lock created # Indexing and tabulating /gbdb/cavPor3/multiz8way/maf/multiz8way.maf # Loading multiz8way into database # Loaded 8976211 mafs in 1 files from /gbdb/cavPor3/multiz8way/maf # Advisory lock has been released # real 4m26.559s # user 2m35.837s # sys 0m47.660s # load summary table time nice -n +19 cat /gbdb/cavPor3/multiz8way/maf/*.maf \ | hgLoadMafSummary cavPor3 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz8waySummary stdin # Indexing and tabulating stdin # Created 1650940 summary blocks from 26384188 components and 8747391 mafs from stdin # Loading into cavPor3 table multiz8waySummary... # Loading completeAdvisory lock has been released # real 5m31.133s # user 4m55.729s # sys 0m21.743s # Gap Annotation # prepare bed files with gap info ssh kkstore05 mkdir /cluster/data/cavPor3/bed/multiz8way/anno cd /cluster/data/cavPor3/bed/multiz8way/anno mkdir maf run # these actually already all exist from previous multiple alignments # remove the echo in front of the twoBitInfo to actually make it work for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! 
-f ${CDIR}/${DB}.N.bed ]; then
        echo "creating ${DB}.N.bed"
        echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
        ls -og ${CDIR}/${DB}.N.bed
    fi
done
# creating cavPor3.N.bed
# twoBitInfo -nBed /cluster/data/cavPor3/cavPor3.2bit /cluster/data/cavPor3/cavPor3.N.bed
# -rw-rw-r-- 1 2164385 Jul 18 2006 /cluster/data/galGal3/galGal3.N.bed
# -rw-rw-r-- 1 232970 Feb 6 2006 /cluster/data/hg18/hg18.N.bed
# -rw-rw-r-- 1 27838 Oct 15 2007 /cluster/data/mm9/mm9.N.bed
# -rw-rw-r-- 1 1788138 Feb 28 2006 /cluster/data/monDom4/monDom4.N.bed
# -rw-rw-r-- 1 13782261 Nov 13 2005 /cluster/data/oryCun1/oryCun1.N.bed
# -rw-rw-r-- 1 20910683 Feb 28 2006 /cluster/data/rn4/rn4.N.bed
# creating speTri0.N.bed
# twoBitInfo -nBed /cluster/data/speTri0/speTri0.2bit /cluster/data/speTri0/speTri0.N.bed

# link the per-species N.bed and chrom.sizes files into the run dir and
# record their names in the nBeds / sizes list files used by mafAddIRows
cd run
rm -f nBeds sizes
for DB in `grep -v cavPor3 ../../species.list`
do
    echo "${DB} "
    ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed >> nBeds
    ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len >> sizes
done
# galGal3
# hg18
# mm9
# monDom4
# oryCun1
# rn4
# speTri0

ssh memk
# temporarily copy the cavPor3.8way.maf file onto the memk
# nodes /scratch/data/cavPor3/maf/ directory
for R in 0 1 2 3 4 5 6 7
do
    ssh mkr0u${R} rsync -a --progress \
        /cluster/data/cavPor3/bed/multiz8way/togetherMaf/cavPor3.8way.maf \
        /scratch/data/cavPor3/maf/
done
mkdir /cluster/data/cavPor3/bed/multiz8way/anno/splitMaf
# need to split up the single maf file into individual
# per-scaffold maf files to run annotation on
cd /cluster/data/cavPor3/bed/multiz8way/anno/splitMaf
# create bed files to list approximately 394 scaffolds in
# a single list, approximately 8 lists
cat << \_EOF_ > mkBedLists.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $bedCount = 0;
my $i = 0;
my $bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
# NOTE(review): the following open/while lines were garbled in the source
# doc ('open (FH,") {'); reconstructed to read the assembly chrom.sizes
# four directories up (bed/multiz8way/anno/splitMaf) -- verify the path
open (FH,"<../../../../chrom.sizes") or die "can not read chrom.sizes $!";
while (my $line = <FH>) {
    chomp $line;
    # every 394th scaffold closes the current bed file and starts a new one
    if ( (($i + 1) % 394) == 0 ) {
        printf "%s\n", $line;
        close (BF);
        ++$bedCount;
$bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; } ++$i; my ($chr, $size) = split('\s+',$line); printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr; } close (FH); # close (BH); _EOF_ # << happy emacs chmod +x mkBedLists.pl ./mkBedLists.pl # -rw-rw-r-- 1 tdreszer protein 13805 Jul 10 16:23 file_0.bed # -rw-rw-r-- 1 tdreszer protein 13602 Jul 10 16:23 file_1.bed # -rw-rw-r-- 1 tdreszer protein 13756 Jul 10 16:23 file_2.bed # -rw-rw-r-- 1 tdreszer protein 14166 Jul 10 16:23 file_3.bed # -rw-rw-r-- 1 tdreszer protein 14090 Jul 10 16:23 file_4.bed # -rw-rw-r-- 1 tdreszer protein 13790 Jul 10 16:23 file_5.bed # -rw-rw-r-- 1 tdreszer protein 13790 Jul 10 16:23 file_6.bed # -rw-rw-r-- 1 tdreszer protein 13545 Jul 10 16:23 file_7.bed # now, run a mafsInRegion on each one of those lists cat << \_EOF_ > runOne #!/bin/csh -fe set runDir = "/cluster/data/cavPor3/bed/multiz8way/anno/splitMaf" set resultDir = $1 set bedFile = $resultDir.bed mkdir -p $resultDir mkdir -p /scratch/tmp/cavPor3/$resultDir pushd /scratch/tmp/cavPor3/$resultDir mafsInRegion $runDir/$bedFile -outDir . \ /scratch/data/cavPor3/maf/cavPor3.8way.maf popd rsync -q -a /scratch/tmp/cavPor3/$resultDir/ ./$resultDir/ rm -fr /scratch/tmp/cavPor3/$resultDir rmdir --ignore-fail-on-non-empty /scratch/tmp/cavPor3 _EOF_ # << happy emacs chmod +x runOne cat << \_EOF_ > template #LOOP ./runOne $(root1) #ENDLOOP _EOF_ # << happy emacs ls file*.bed > runList gensub2 runList single template jobList para create jobList para try ... check ... push ... etc # 8 jobs in batch # 0 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # Completed: 8 of 8 jobs # CPU time in finished jobs: 1810s 30.16m 0.50h 0.02d 0.000 y # IO & Wait Time: 1218s 20.30m 0.34h 0.01d 0.000 y # Average job time: 379s 6.31m 0.11h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 614s 10.23m 0.17h 0.01d # Submission to last job: 614s 10.23m 0.17h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d cd /cluster/data/cavPor3/bed/multiz8way/anno/run cat << \_EOF_ > doAnno.csh #!/bin/csh -ef set outDir = ../maf/$2 set result = $3 set input = $1 mkdir -p $outDir cat $input | \ nice mafAddIRows -nBeds=nBeds stdin /scratch/data/cavPor3/cavPor3.2bit $result _EOF_ # << happy emacs chmod +x doAnno.csh cat << \_EOF_ > template #LOOP ./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf} #ENDLOOP _EOF_ # << happy emacs find ../splitMaf -type f -name "*.maf" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc. 1849 jobs in batch # 0 jobs (including everybody's) in Parasol queue. # Checking finished jobs # unsubmitted jobs: 1839 # crashed: 10 # total jobs in batch: 1849 # Couldn't open speTri0.bed , No such file or directory # lrwxrwxrwx 1 tdreszer protein 35 Jul 10 14:13 speTri0.bed -> /cluster/data/speTri0/speTri0.N.bed ### /cluster/data/speTri0/speTri0.N.bed Does not exist... twoBitInfo -nBed /cluster/data/speTri0/speTri0.2bit /cluster/data/speTri0/speTri0.N.bed & para try; para push... # 1849 jobs in batch # 0 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # Completed: 1849 of 1849 jobs # CPU time in finished jobs: 6313s 105.22m 1.75h 0.07d 0.000 y # IO & Wait Time: 5708s 95.13m 1.59h 0.07d 0.000 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 171s 2.85m 0.05h 0.00d # Submission to last job: 651s 10.85m 0.18h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the results back together into a single file ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz8way/anno grep "^##maf version" maf/file_0//scaffold_0.maf \ | sort -u > cavPor3.anno.8way.maf ##maf version=1 scoring=autoMZ.v1 find ./maf -type f -depth -name "*.maf" | while read F do grep -v -h "^#" "${F}" done >> cavPor3.anno.8way.maf echo "##eof maf" >> cavPor3.anno.8way.maf # -rw-rw-r-- 1 tdreszer protein 15080110322 Jul 11 14:14 cavPor3.anno.8way.maf ssh hgwdev cd /cluster/data/cavPor3/bed/multiz8way/anno mkdir -p /gbdb/cavPor3/multiz8way/anno ln -s `pwd`/cavPor3.anno.8way.maf \ /gbdb/cavPor3/multiz8way/anno/multiz8way.maf # by loading this into the table multiz8way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/cavPor3/multiz8way/anno \ cavPor3 multiz8way # Loaded 10486949 mafs in 1 files from /gbdb/cavPor3/multiz8way/anno # Advisory lock has been released # real 6m19.376s # normally filter this for chrom size > 1,000,000 and only load # those chroms. 
But this is a scaffold assembly, load everything: time nice -n +19 hgLoadMafSummary cavPor3 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz8waySummary \ /gbdb/cavPor3/multiz8way/anno/multiz8way.maf # Created 1650940 summary blocks from 26384188 components # and 10209337 mafs from /gbdb/cavPor3/multiz8way/anno/multiz8way.maf # real 6m29.261s # by loading this into the table multiz8waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz8way*.tab files in this /scratch/tmp directory rm multiz8way*.tab # And, you can remove the previously loaded non-annotated maf file link: rm /gbdb/cavPor3/multiz8way/maf/multiz8way.maf rmdir /gbdb/cavPor3/multiz8way/maf # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far ################## # Effort to move to hive ################## ssh hgwdev mv /hive/archive/store12/cavPor3 /hive/data/genomes/cavPor3 ln -sf /hive/data/genomes/cavPor3 /cluster/data/cavPor3 find /gbdb/cavPor3 /usr/local/apache/htdocs/goldenPath/cavPor3 -type l -ls | grep /cluster/store ################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ################## EVERYTHING THAT 
FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ########################################################################### ## Annotate 8-way multiple alignment with gene annotations ## (START - 2008-07-14 - Tim) # Gene frames ## given previous survey done for 9-way alignment on Marmoset ## and the appearance of new ensGene tables on everything # use knownGene for hg18, mm9 # use ensGene for canFam2, ornAna1 # and refGene for cavPor3 ssh hgwdev mkdir /cluster/data/cavPor3/bed/multiz8way/frames cd /cluster/data/cavPor3/bed/multiz8way/frames mkdir genes # knownGene for DB in hg18 mm9 rn4 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in oryCun1 ornAna1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # refGene for DB in cavPor3 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 861210 Mar 18 15:40 cavPor3.gp.gz # -rw-rw-r-- 1 1865308 Mar 18 15:40 canFam2.gp.gz # -rw-rw-r-- 1 2008806 Mar 18 15:39 hg18.gp.gz # -rw-rw-r-- 1 1965274 Mar 18 15:39 mm9.gp.gz # -rw-rw-r-- 1 1347532 Mar 18 15:40 ornAna1.gp.gz ssh kkstore06 cd /cluster/data/cavPor3/bed/multiz5way/frames # anything to annotate is in a pair, e.g.: cavPor3 genes/cavPor3.gp.gz time (cat ../anno/cavPor3.anno.5way.maf | nice -n +19 genePredToMafFrames cavPor3 stdin stdout cavPor3 genes/cavPor3.gp.gz hg18 
genes/hg18.gp.gz mm9 genes/mm9.gp.gz canFam2 genes/canFam2.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 77526 cavPor3 # 110768 ornAna1 # 221524 mm9 # 230010 hg18 # 243396 canFam2 # load the resulting file ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames cavPor3 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m21.968s # enable the trackDb entries: # frames multiz5wayFrames # irows on ############################################################################# # phastCons 5-way (DONE - 2008-03-19 - Hiram) # split 5way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir /cluster/data/cavPor3/bed/multiz5way/msa.split cd /cluster/data/cavPor3/bed/multiz5way/msa.split mkdir -p /san/sanvol1/scratch/cavPor3/multiz5way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/cavPor3/bed/multiz5way/anno/maf set WINDOWS = /san/sanvol1/scratch/cavPor3/multiz5way/cons/ss pushd $WINDOWS set resultDir = $1 set c = $2 rm -fr $resultDir/$c mkdir -p $resultDir twoBitToFa -seq=$c /scratch/data/cavPor3/cavPor3.2bit /scratch/tmp/cavPor3.$c.fa # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set TMP = /scratch/tmp/$c.clean.maf.$$ perl -wpe 's/^s ([^.]+\.[^. 
]+)\.\S+/s $1/' $MAFS/$resultDir/$c.maf > $TMP /cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \ -M /scratch/tmp/cavPor3.$c.fa \ -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000 rm -f /scratch/tmp/cavPor3.$c.fa rm -f $TMP popd mkdir -p $resultDir date > $resultDir/$c.out '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs # create list of maf files: (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 2320 of 2320 jobs # CPU time in finished jobs: 1710s 28.50m 0.47h 0.02d 0.000 y # IO & Wait Time: 6951s 115.85m 1.93h 0.08d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 128s 2.13m 0.04h 0.00d # Submission to last job: 1048s 17.47m 0.29h 0.01d # take the cons and noncons trees from the mouse 30-way # Estimates are not easy to make, probably more correctly, # take the 30-way .mod file, and re-use it here. ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh memk mkdir -p /cluster/data/cavPor3/bed/multiz5way/cons/run.cons cd /cluster/data/cavPor3/bed/multiz5way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. 
It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast/bin set subDir = $1 set f = $2 set c = $2:r set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/cavPor3/bed/multiz5way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/cavPor3/multiz5way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir sleep 4 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir rm -f $san/$grp/pp/$subDir/$f.pp rm -f $san/$grp/bed/$subDir/$f.bed mv $tmp/$f.pp $san/$grp/pp/$subDir mv $tmp/$f.bed $san/$grp/bed/$subDir rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # Create parasol batch and run it pushd /san/sanvol1/scratch/cavPor3/multiz5way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \ > /cluster/data/cavPor3/bed/multiz5way/cons/ss.list popd # run for all species cd .. 
mkdir -p all run.cons/all cd all /cluster/bin/phast.cz/tree_doctor ../../mm9.30way.mod \ --prune-all-but=bosTau3,hg18,mm9,canFam2,ornAna1 \ | sed -e "s/bosTau3/cavPor3/" > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/cavPor3/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 2569 of 2569 jobs # CPU time in finished jobs: 8636s 143.93m 2.40h 0.10d 0.000 y # IO & Wait Time: 17371s 289.52m 4.83h 0.20d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest finished job: 44s 0.73m 0.01h 0.00d # Submission to last job: 1008s 16.80m 0.28h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all find ./bed -type f -name "chr*.bed" | xargs cat \ | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 3 minutes cp -p mostConserved.bed /cluster/data/cavPor3/bed/multiz5way/cons/all # load into database ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/cons/all time nice -n +19 hgLoadBed cavPor3 phastConsElements5way mostConserved.bed # Loaded 1005876 elements of size 5 # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits cavPor3 phastConsElements5way # 132010504 bases of 2731830700 (4.832%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. 
# sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all mkdir -p phastCons5wayScores for D in `ls -1d pp/file* | sort -t_ -k2n` do TOP=`pwd` F=${D/pp\/} out=${TOP}/phastCons5wayScores/${F}.data.gz echo "${D} > ${F}.data.gz" cd ${D} find . -name "*.pp" -type f \ | sed -e "s#^./##; s/chrUn.004./chrUn_004_/; s/-/.-./" \ | sort -t '.' -k1,1 -k3.3n \ | sed -e "s/.-./-/; s/chrUn_004_/chrUn.004./" | xargs cat \ | gzip > ${out} cd "${TOP}" done # copy those files to the downloads area: # /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all ls -1 phastCons5wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \ | wigEncode -noOverlap stdin phastCons5way.wig phastCons5way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 time nice -n +19 cp -p *.wi? /cluster/data/cavPor3/bed/multiz5way/cons/all # real 0m40.875s # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/cons/all ln -s `pwd`/phastCons5way.wib /gbdb/cavPor3/multiz5way/phastCons5way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz5way cavPor3 \ phastCons5way phastCons5way.wig # real 1m5.667s # remove garbage rm wiggle.tab # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=cavPor3 phastCons5way > histogram.data 2>&1 # real 3m37.316s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Cow BosTau4 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # These trackDb entries turn on the wiggle phastCons data track: # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons5way # spanList 1 # autoScale Off # windowingFunction mean # pairwiseHeight 12 # yLineOnOff Off ############################################################################# # Downloads (DONE - 2008-01-11 - Hiram) # Let's see if the downloads will work ssh hgwdev /cluster/data/cavPor3 # expecting to find repeat masker .out file here: ln -s bed/RepeatMasker/cavPor3.fa.out . time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev cavPor3 > jkStuff/downloads.log 2>&1 # real 24m3.210s # failed making upstream sequences: # featureBits cavPor3 mgcGenes:upstream:1000 -fa=stdout # setpriority: Permission denied. 
# the 'nice' from my bash shell causes trouble inside the csh # script which uses nice. Finish off the install step manually # with the mgcGenes upstreams ... ############################################################################# # PushQ entries (DONE - 2008-01-11 - Hiram) ssh hgwdev /cluster/data/cavPor3 /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql # output warnings: # cavPor3 does not have seq # cavPor3 does not have gbMiscDiff # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # genscanPep ############################################################################# # create download files (DONE - 2008-03-19 - Hiram) ssh hgwdev cd /cluster/data/cavPor3 ln -s /cluster/data/cavPor3/bed/repeatMasker/cavPor3.fa.out . makeDownloads.pl cavPor3 > makeDownloads.log 2>&1 # *EDIT* the README files and ensure they are correct ############################################################################# # PushQ entries (DONE - 2008-03-19 - Hiram) ssh hgwdev /cluster/data/cavPor3 /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql # output warnings: # hgwdev does not have /usr/local/apache/htdocs/goldenPath/cavPor3/liftOver/cavPor3ToBosTau* # cavPor3 does not have seq # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # genscanPep # looks like there should be a bosTau3 to cavPor3 liftOver run ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-03-28) ssh kkstore06 # bash if not using bash shell already mkdir /cluster/data/cavPor3/blastDb cd /cluster/data/cavPor3 grep -v chrUn chrom.sizes | awk '{print $1}' > chr.lst for i in `cat chr.lst`; do twoBitToFa cavPor3.unmasked.2bit -seq=$i stdout; done > temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft grep chrUn chrom.sizes | awk 
'{print $1}' > chr.lst for i in `cat chr.lst`; do twoBitToFa cavPor3.unmasked.2bit -seq=$i stdout; done > temp.fa faSplit sequence temp.fa 150 blastDb/y rm temp.fa chr.lst cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 3440 mkdir -p /san/sanvol1/scratch/cavPor3/blastDb cd /cluster/data/cavPor3/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/cavPor3/blastDb done mkdir -p /cluster/data/cavPor3/bed/tblastn.hg18KG cd /cluster/data/cavPor3/bed/tblastn.hg18KG echo /san/sanvol1/scratch/cavPor3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3440 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/3440) = 360.973943 mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa split -l 361 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/cavPor3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/cavPor3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. 
para time Completed: 350880 of 350880 jobs CPU time in finished jobs: 27082816s 451380.27m 7523.00h 313.46d 0.859 y IO & Wait Time: 2334990s 38916.50m 648.61h 27.03d 0.074 y Average job time: 84s 1.40m 0.02h 0.00d Longest finished job: 578s 9.63m 0.16h 0.01d Submission to last job: 96125s 1602.08m 26.70h 1.11d ssh kkstore06 cd /cluster/data/cavPor3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 99 of 102 jobs # Crashed: 3 jobs # CPU time in finished jobs: 113248s 1887.47m 31.46h 1.31d 0.004 y # IO & Wait Time: 86043s 1434.04m 23.90h 1.00d 0.003 y # Average job time: 2013s 33.55m 0.56h 0.02d # Longest finished job: 6139s 102.32m 1.71h 0.07d # Submission to last job: 10416s 173.60m 2.89h 0.12d # ran three crashed jobs on kolossus ssh kkstore06 cd /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/cavPor3/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/cavPor3/bed/tblastn.hg18KG hgLoadPsl cavPor3 blastHg18KG.psl # check coverage featureBits cavPor3 blastHg18KG # 40254923 bases of 2731830700 (1.474%) in intersection featureBits cavPor3 refGene:cds blastHg18KG -enrichment # refGene:cds 0.429%, blastHg18KG 1.474%, both 0.379%, cover 88.39%, enrich 59.98x ssh kkstore06 rm -rf /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## # Create 5-way downloads (DONE - 2008-03-28 - Hiram) ssh hgwdev mkdir -p /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way cd /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way cp -p \ /san/sanvol1/scratch/cavPor3/multiz5way/cons/all/phastCons5wayScores/* . ln -s ../../cons/all/all.mod ./5way.mod cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt . # edit that README.txt to be correct for this 5-way alignment cd .. mkdir multiz5way cd multiz5way cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt . 
# edit that README.txt to be correct for this 5-way alignment ssh kkstore06 cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way ln -s ../../cavPor3.5-way.nh 5way.nh time gzip -c ../../anno/cavPor3.anno.5way.maf > cavPor3.5way.maf.gz # real 34m59.295s ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way # creating upstream files from refGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=cavPor3 GENE=refGene NWAY=multiz5way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time nice -n +19 ./mkUpstream.sh -rw-rw-r-- 1 9883443 Mar 28 13:02 upstream1000.maf.gz -rw-rw-r-- 1 17938570 Mar 28 13:06 upstream2000.maf.gz -rw-rw-r-- 1 40384656 Mar 28 13:10 upstream5000.maf.gz # # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 4 species with a high count, # then refGene names, e.g.: # 8806 ornAna1 # 8806 mm9 # 8806 hg18 # 8806 canFam2 # 7 NM_001077006 # 3 NM_001113231 # 3 NM_001105381 # 3 NM_001102527 # 3 NM_001102322 # ... 
ssh kkstore06 cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way md5sum *.maf.gz > md5sum.txt cd ../phastCons5way md5sum *.data.gz *.mod > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/multiz5way mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/phastCons5way cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/cavPor3/multiz5way cd ../phastCons5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/cavPor3/phastCons5way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ############################################################################# # BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate) ssh kkstore05 cd /cluster/data/cavPor3/bed mkdir blastzSpeTri0.2008-05-16 cd blastzSpeTri0.2008-05-16 cat << '_EOF_' > DEF # Guinea pig vs. Ground squirrel BLASTZ_M=50 # TARGET: Guinea pig cavPor3 SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ1_LEN=/cluster/data/cavPor3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Ground squirrel speTri0 SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit SEQ2_LEN=/cluster/data/speTri0/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & # got here ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastzSpeTri0 # failures... have to rescue manually # make axt's.
Note -- too many scaffolds to do in a single pass # (exhausts mem) so we split ssh kkstore05 cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain netSplit noClass.net noClass ~kate/bin/x86_64/netSplit noClass.net noClass -lump=100 # can't just split to chains (exhausts open files) chainSplit chain cavPor3.speTri0.all.chain.gz -lump=100 cd noClass mkdir ../../axtNetChunks cat > netToAxt.csh << 'EOF' foreach i (*.net) set p = $i:r echo $i netToAxt $p.net ../chain/$p.chain /cluster/data/cavPor3/cavPor3.2bit /cluster/data/speTri0/speTri0.2bit ../../axtNetChunks/$i:r.axt end 'EOF' csh netToAxt.csh >&! netToAxt.log & # create unfiltered mafNet to get better squirrel coverage (2008-11-3 kate) cd ../../axtNetChunks cat *.axt | axtSort stdin stdout > ../axtChain/cavPor3.speTri0.net.axt mkdir ../mafNet axtToMaf -tPrefix=cavPor3. -qPrefix=speTri0. ../axtChain/cavPor3.speTri0.net.axt \ /cluster/data/cavPor3/chrom.sizes /cluster/data/speTri0/chrom.sizes \ ../mafNet/cavPor3.speTri0.net.maf gzip stdin > ../mafNet/cavPor3.speTri0.net.maf.gz # low cov genome, so use reciprocal best for multiple alignment # need to generate liftover chains, which failed in automation cat > over.csh << 'EOF' foreach i (*.net) set p = $i:r echo $i netChainSubset -verbose=0 $p.net ../chain/$p.chain stdout | \ chainStitchId stdin $i.over.chain end cat *.over.chain | gzip -c > cavPor3.speTri0.over.chain.gz 'EOF' csh over.csh >&! over.log & ssh hgwdev cd /cluster/data/cavPor3/bed/blastz.speTri0 mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/vsSpeTri0 /cluster/bin/scripts/doRecipBest.pl cavPor3 speTri0 >&! rbest.log & # TODO: # load chains in database and check coverage # NOTE: exhausts mem on hgwdev -- need to try this on kolossus ? cd axtChain cat > loadChains.csh << 'EOF' hgLoadChain -tIndex cavPor3 chainSpeTri0 cavPor3.speTri0.all.chain.gz featureBits cavPor3 chainSpeTri0Link > fb.cavPor3.chainSpeTri0Link.txt cat fb.cavPor3.chainSpeTri0Link.txt 'EOF' csh loadChains.csh >&!
loadChains.log & ############################################################################ # SWAP SpeTri0 Blastz (2008-11-01 kate) mkdir -p /cluster/data/speTri0/bed cd /cluster/data/speTri0/bed mkdir blastz.cavPor3.swap cd blastz.cavPor3.swap cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain/chain mkdir ../chain.swap cat > swap.csh << 'EOF' foreach c (*.chain) echo $c chainSwap $c ../chain.swap/$c end 'EOF' csh swap.csh >&! swap.log & doBlastzChainNet.pl -swap -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/cavPor3/bed/blastz.speTri0/DEF >& swap.log & ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################# # N-SCAN gene predictions (nscanGene) - (2008-04-03 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/cavPor3/bed/nscan/ wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/cavPor3.gtf wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/cavPor3.prot.fa wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/readme.html bzip2 cavPor3.* chmod a-w * # load track gtfToGenePred -genePredExt cavPor3.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt cavPor3 nscanGene stdin hgPepPred cavPor3 generic nscanPep cavPor3.prot.fa.bz2 rm *.tab # update trackDb; need a cavPor3-specific page to describe informants marmoset/cavPor3/nscanGene.html (copy from readme.html) marmoset/cavPor3/trackDb.ra # set search regex to termRegex scaffold_[0-9]+\.[0-9]+\.[0-9]+ ############################################################################ # ### Found this in Kate's unchecked in 
cavPor3.txt. I need it to continue the blastz to get the maf for multiz !! # ### ############################################################################# # ### # BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate) # ### # ### ssh kkstore05 # ### cd /cluster/data/cavPor3/bed # ### mkdir blastzSpeTri0.2008-05-16 # ### cd blastzSpeTri0.2008-05-16 # ### # ### cat << '_EOF_' > DEF # ### # Mouse vs. Ground squirrel # ### # ### BLASTZ_M=50 # ### # ### # TARGET: Mouse MM9 # ### SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit # ### SEQ1_LEN=/cluster/data/cavPor3/chrom.sizes # ### SEQ1_CHUNK=10000000 # ### SEQ1_LAP=10000 # ### # ### # QUERY: Ground squirrel speTri0 # ### SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit # ### SEQ2_LEN=/cluster/data/speTri0/chrom.sizes # ### SEQ2_CHUNK=30000000 # ### SEQ2_LIMIT=500 # ### SEQ2_LAP=0 # ### # ### BASE=/cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16 # ### TMPDIR=/scratch/tmp # ### '_EOF_' # ### # << happy emacs # ### # ### doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ # ### -chainMinScore=3000 -chainLinearGap=medium >& do.log & # ### # ### # got here # ### ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastzSpeTri0 ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastz.speTri0 # ### 1 crashed job in axtChain: # ssh mkr0u7 zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & # invalid unsigned number: "2" ### divide and conquer ll ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz -rw-rw-r-- 1 kate protein 25560954 May 18 08:12 ../../pslParts/cavPor3.2bit:scaffold_7:0-10010000.psl.gz -rw-rw-r-- 1 kate protein 28831900 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:10000000-20010000.psl.gz -rw-rw-r-- 1 kate protein 25466248 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:20000000-30010000.psl.gz -rw-rw-r-- 1 kate protein 
26433148 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:30000000-40010000.psl.gz -rw-rw-r-- 1 kate protein 38048425 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:40000000-50010000.psl.gz -rw-rw-r-- 1 kate protein 24145256 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:50000000-60010000.psl.gz -rw-rw-r-- 1 kate protein 7480033 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:60000000-61004451.psl.gz zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:0-*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & # Error reading 4 bytes: Success zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:40*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & # invalid unsigned number: "2" mv ../../pslParts/cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz ../../pslParts/cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz.bad zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & #Error reading 4 bytes: Success ssh pk; para shove -retries=6 # 233 jobs in batch # 503 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # Completed: 233 of 233 jobs # CPU time in finished jobs: 116206s 1936.77m 32.28h 1.34d 0.004 y # IO & Wait Time: 3377s 56.28m 0.94h 0.04d 0.000 y # Average job time: 513s 8.55m 0.14h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8500s 141.67m 2.36h 0.10d # Submission to last job: 967976s 16132.93m 268.88h 11.20d # Estimated complete: 0s 0.00m 0.00h 0.00d ### ############## Now need to restart the blastz: tail do.log # Command failed: # ssh -x kki nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/run/doChainRun.csh ssh kkstore05 # store12->kkstore05 cd /cluster/data/cavPor3/bed/blastz.speTri0 screen time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1 # real 220m10.699s # tail do.log # writing /dev/null # memory usage 2419843072, utime 5416 s/100, stime 389 # netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz stdout # chainStitchId stdin stdout # gzip -c # Killed # # gzip: stdout: Broken pipe # Command failed: # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/netChains.csh cp netChain.csh netChain.restart.csh # edit to restart in correct place ./netChain.restart.csh ### After some searching I was able to figure out that  = octal "\031" # Now I can find the original problem file and try to rebuild it: grep -n -P "\031" cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl # 4253:62 40 0 0 1 1 1 6 + scaffold_6393.1-308827 308827 149765 149868 scaffold_' 61004451 49604348 49604456 3 2,38,35, 149765,149794,149833, 49604348,49604383,49604421, # For comparison: tail -55 cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl | head -5 # 91 37 0 0 1 8 3 16 + scaffold_6393.1-308827 308827 244021 244157 scaffold_7 61004451 48951564 48951708 5 18,25,14,15,56, 244021,244039,244072,244086,244101, 48951564,48951583,48951608,48951629,48951652, # 62 32 0 0 0 0 0 0 
+ scaffold_6393.1-08827 308827 54194 54288 scaffold_7 61004451 49114670 49114764 1 94, 54194, 49114670, # 62 40 0 0 1 1 1 6 + scaffold_6393.1-308827 308827 149765 149868 scaffold_' 61004451 49604348 49604456 3 2,38,35, 149765,149794,149833, 49604348,49604383,49604421, # 127 63 0 0 1 18 2 5 + scaffold_6393.1-308827 308827 159033 159241 scaffold_7 61004451 49613715 49613910 4 59,84,20,27, 159033,159092,159176,159214, 49613715,49613775,49613863,49613883, # 29 8 0 0 0 0 0 0 + scaffold_6393.1-308827 308827 160121 160158 scaffold_7 61004451 49620825 49620862 1 37, 160121, 49620825, ### ### ### Yikes! the prior line as a different strange char:  = octal "\023" ## Run on same machine: ssh kkr10u47 /cluster/bin/scripts/blastz-run-ucsc -outFormat psl ../psl/cavPor3.2bit:scaffold_7:40000000-50010000 qParts/part007.lst ../DEF ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.new.psl ll ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007* # -rw-rw-r-- 1 tdreszer protein 785511 Jun 3 13:11 ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.new.psl # -rw-rw-r-- 1 kate protein 785511 May 16 19:40 ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.psl grep -n -P "\031" ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl tail -55 ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl | head -5 # 91 37 0 0 1 8 3 16 + scaffold_6393.1-308827 308827 244021 244157 scaffold_7 61004451 48951564 48951708 518,25,14,15,56, 244021,244039,244072,244086,244101, 48951564,48951583,48951608,48951629,48951652, # 62 32 0 0 0 0 0 0 + scaffold_6393.1-308827 308827 54194 54288 scaffold_7 61004451 49114670 49114764 194, 54194, 49114670, # 62 40 0 0 1 1 1 6 + 
scaffold_6393.1-308827 308827 149765 149868 scaffold_7 61004451 49604348 49604456 329,38,35, 149765,149794,149833, 49604348,49604383,49604421, # 127 63 0 0 1 18 2 5 + scaffold_6393.1-308827 308827 159033 159241 scaffold_7 61004451 49613715 49613910 459,84,20,27, 159033,159092,159176,159214, 49613715,49613775,49613863,49613883, # 29 8 0 0 0 0 0 0 + scaffold_6393.1-308827 308827 160121 160158 scaffold_7 61004451 49620825 49620862 137, 160121, 49620825, mv ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl.old mv ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl # Okay, we are NOW done with run.blastz so it is on to run.cat ssh kkstore05 cd ../run.cat ./cat.csh cavPor3.2bit:scaffold_7:40000000-50010000 cavPor3.2bit:scaffold_7:40000000-50010000.psl.new.gz ll cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.* # -rw-rw-r-- 1 tdreszer protein 38048414 Jun 3 13:28 cavPor3.2bit:scaffold_7:40000000-50010000.psl.new.gz # -rw-rw-r-- 1 tdreszer protein 38048475 May 18 08:13 cavPor3.2bit:scaffold_7:40000000-50010000.psl.old.gz mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.new.gz cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.old.gz cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz.old mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz* ../pslParts/ # Okay, we are now done with run.cat. What can be done for axtChain? 
# run on same cluster machine ssh kkr4u00 cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain/run ./chain.csh cavPor3.2bit:scaffold_7: chain/cavPor3.2bit:scaffold_7:.chain ll chain/cavPor3.2bit\:scaffold_7\:.chain # -rw-rw-r-- 1 tdreszer protein 304948646 Jun 3 15:05 chain/cavPor3.2bit:scaffold_7:.chain # Okay, NOW we are ready to consider restarting doBlastzChainNet.pl # Though I assume that some of what was already done, will be in the way. time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1 # postProcessChains: looks like this was run successfully already (cavPor3.speTri0.all.chain.gz exists). # Either run with -continue net or some later stage, or move aside/remove /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/cavPor3.speTri0.all.chain.gz and run again. mv axtChain/cavPor3.speTri0.all.chain.gz axtChain/cavPor3.speTri0.all.chain.gz.old time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1 # real 54m1.762s tail do.log # HgStepManager: executing step 'net' Thu Jun 5 12:46:09 2008. # netChains: looks like we are not starting with a clean slate. Please move aside or remove /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net and run again. 
mv /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net.old screen time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue net >> do.log 2>&1 real 204m20.325s tail do.log # writing /dev/null # memory usage 2427514880, utime 5453 s/100, stime 405 # netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz stdout # chainStitchId stdin stdout # gzip -c # # gzip: stdout: Broken pipe # Killed # Command failed: # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/netChains.csh ### Can't seem to get ahead!!! ## Doing each step in netChains.csh manually... cd /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain # Make nets ("noClass", i.e. without rmsk/class stats which are added later): chainPreNet cavPor3.speTri0.all.chain.gz /cluster/data/cavPor3/chrom.sizes /cluster/data/ speTri0/chrom.sizes stdout \ | chainNet stdin -minSpace=1 /cluster/data/cavPor3/chrom.sizes /cluster/data/speTri0/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # Completed successfully # Make liftOver chains: netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz netChainSubset.out ll netChainSubset.out # -rw-rw-r-- 1 tdreszer protein 73728 Jun 6 17:12 netChainSubset.out chainStitchId netChainSubset.out chainStitchId.out ll chainStitchId.out # -rw-rw-r-- 1 tdreszer protein 0 Jun 9 09:13 chainStitchId.out # gzip -c chainStitchId.out > cavPor3.speTri0.over.chain.gz # Duh: gzip: stdout: Broken pipe # Okay, continuing with next step, ignoring the empty cavPor3.speTri0.over.chain.gz # Make axtNet for download: one .axt for all of cavPor3. 
mkdir ../axtNet netToAxt -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz \ /scratch/data/cavPor3/cavPor3.2bit /scratch/data/speTri0/speTri0.2bit stdout \ | axtSort stdin stdout \ | gzip -c > ../axtNet/cavPor3.speTri0.net.axt.gz ### Okay Kate rescued this by adjusting file sizes to handle memory ssh kkstore05 cd /cluster/data/cavPor3/bed/blastz.speTri0 time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue net >> doAferChain.log 2>&1 # NOTE: be sure to ls the data in above script on workhorse machine before starting script time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 # ps -ef | grep blastzCavPor3 # real 1764m55.155s tail do.log # *** All done! # *** Make sure that goldenPath/mm9/vsCavPor3/README.txt is accurate. # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary. # blastz: ranOk: 52984 cat fb.mm9.chainCavPor3Link.txt # 757283793 bases of 2620346127 (28.900%) in intersection cd /cluster/data/mm9/bed/blastzCavPor3.2008-04-10 ######### Change locations in DEF due to Hiram's new methods cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ mkdir /cluster/data/cavPor3/bed/blastz.mm9.swap cd /cluster/data/cavPor3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > do.log 2>&1 & # real 166m53.671s # Exit 25 time nice -n +19 doBlastzChainNet.pl 
/cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF -bigC # Can't open chain/scaffold_795.chain to append: Too many open files # gzip: stdout: Broken pipe # Command failed: # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastz.mm9.swap/axtChain/netSynteny.csh # broken down during netSynteny.csh due to too many open files on # a chainSplit # However, there is no need to split when we have scaffolds. # Kate fixes doBlastzChainNet.pl and retry: cd /cluster/data/cavPor3/bed/blastz.mm9.swap time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=syntenicNet -syntenicNet > syn.log 2>&1 & # real 24m37.561s # *** All done! # *** Make sure that goldenPath/cavPor3/vsMm9/README.txt is accurate. # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary. cat fb.cavPor3.chainMm9Link.txt # 781173609 bases of 2663369733 (29.330%) in intersection ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # 3way Rodent Multiz (special for Jurgens (Schmitz & co.) in Muenster # Previously, Robert Baertsch handled these. I committed to these # many months ago... 
# 2008-07-26 kate # Redo with unfiltered net mafs to maximize squirrel sequence (2008-11-08 kate) ssh kkstore05 cd /cluster/data/cavPor3/bed mkdir multiz3way cd multiz3way mkdir -p /san/sanvol1/scratch/cavPor3/multiz3way/ cd /san/sanvol1/scratch/cavPor3/multiz3way # copy mafs to cluster-friendly disk # mm9 - high quality mammalian genome, so use syntenic net mafSplit /dev/null -byTarget mm9/ /cluster/data/cavPor3/bed/blastz.mm9/axtChain/cavPor3.mm9.synNet.maf.gz cd mm9 # rename to reflect chrom name foreach f (*.maf) set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'` echo $c mv $f $c.maf end # speTri0 - low quality mammalian genome, so use reciprocal best net # mafSplit /dev/null -byTarget speTri0/ /cluster/data/cavPor3/bed/blastz.speTri0/mafRBestNet/cavPor3.speTri0.rbest.maf.gz # redo with full net, to get more sequence mafSplit /dev/null -byTarget speTri0/ /cluster/data/cavPor3/bed/blastz.speTri0/mafNet/cavPor3.speTri0.net.maf.gz cd speTri0 # rename to reflect chrom name foreach f (*.maf) set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'` echo $c mv $f $c.maf end cd /san/sanvol1/scratch/cavPor3/multiz3way # get latest PSU utilities mkdir penn set p=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $p/{autoMZ,multiz,maf_project} penn # the autoMultiz cluster run ssh pk cd /cluster/data/cavPor3/bed/multiz3way # create species list and stripped down tree for autoMZ cat > tree.nh << 'EOF' ((cavPor3 mm9) speTri0) 'EOF' cat > species.lst << 'EOF' cavPor3 mm9 speTri0 'EOF' mkdir run maf cd run cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = cavPor3 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz3way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz3way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) 
then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz3way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs mkdir /cluster/data/cavPor3/bed/multiz3way/maf/ awk '{print $1}' /cluster/data/cavPor3/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # ~3000 jobs para try para check # ~2 hours of run-time # remove empty alignment files and package up ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz3way/maf mkdir empty cat > finish.csh << 'EOF' foreach f (*.maf) head -11 $f | grep -q 's cavPor3' if ($status == 1) then echo "$f empty" mv $f empty else echo " $f NOT empty" endif end 'EOF' # oops, forgot to mkdir, so 'empty' files were lost ls *.maf | wc -l # 1491 tar cvfz rodent3way.tar.gz *.maf ######### # Also do a multiz with non-filtered mouse mafs: # mm9 syntenic net mafs are sparse (cover only ~300 scaffolds), so check net ssh hgwdev cd /cluster/data/cavPor3/bed/blastz.mm9/axtChain netFilter -minGap=10 cavPor3.mm9.syn.net.gz | hgLoadNet -verbose=0 cavPor3 synNetMm9 stdin # select distinct(tName) shows 302 rows mkdir -p /san/sanvol1/scratch/cavPor3/multiz3way.full/ cd /san/sanvol1/scratch/cavPor3/multiz3way.full mafSplit /dev/null -byTarget mm9/ /cluster/data/cavPor3/bed/blastz.mm9//cavPor3.mm9.net.maf.gz cd mm9 # rename to reflect chrom name foreach f (*.maf) set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'` echo $c mv $f $c.maf end # copy and link various files from first run ln -s ../multiz3way/{speTri0,penn} . cp ../multiz3way/{species.lst,tree.nh} . 
mkdir run maf cp ../multiz3way/run/chrom.lst run sed 's/multiz3way/multiz3way.full/' ../multiz3way/run/autoMultiz > run/autoMultiz sed 's/multiz3way/multiz3way.full/' ../multiz3way/run/template > run/template # check it chmod +x run/autoMultiz mkdir -p /cluster/data/cavPor3/bed/multiz3way.full/maf ssh pk cd /san/sanvol1/scratch/cavPor3/multiz3way.full/run gensub2 chrom.lst single template jobList para create jobList para try para check # ~2 hours of run-time ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz3way.full/maf mkdir empty csh ../../multiz3way/maf/finish.csh >&! finish.log & # post to hgwdev downloads area ssh hgwdev cd /cluster/data/cavPor3/bed/multiz3way mv *.gz /usr/local/apache/htdocs/goldenPath/cavPor3/multizRodent3way/syntenic cd /cluster/data/cavPor3/bed/multiz3way.full mv *.gz /usr/local/apache/htdocs/goldenPath/cavPor3/multizRodent3way/full ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: cavPor3.upstreamGeneTbl = xenoRefGene cavPor3.upstreamMaf = multiz8way /hive/data/genomes/cavPor3/bed/multiz8way/species.list ############################################################################ # cavPor3 - Guinea Pig - Ensembl Genes version 51 (DONE - 2008-12-02 - hiram) ssh hgwdev cd /hive/data/genomes/cavPor3 cat << '_EOF_' > cavPor3.ensGene.ra # required db variable db cavPor3 # do we need to translate geneScaffold coordinates # geneScaffolds yes nameTranslation "s/^MT/chrM/;" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 cavPor3.ensGene.ra ssh hgwdev cd /hive/data/genomes/cavPor3/bed/ensGene.51 featureBits cavPor3 ensGene # 30699872 bases of 2663369733 (1.153%) in intersection *** All done! 
(through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/cavPor3/bed/ensGene.51 ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. 
############################################################################ # lastz Rabbit oryCun2 swap (DONE - 2010-01-22 - Hiram) # original alignment cd /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21 cat fb.oryCun2.chainCavPor3Link.txt # 964546600 bases of 2604023284 (37.041%) in intersection # and for the swap mkdir /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap cd /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21/DEF \ -swap -noLoadChainSplit -workhorse=hgwdev -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 186m1.649s cat fb.cavPor3.chainOryCun2Link.txt # 1003499831 bases of 2663369733 (37.678%) in intersection ############################################################################ # enable native RefSeq Genes track (2010-06-15) # in etc/genbank.conf: cavPor3.refseq.mrna.native.load = yes ssh genbank cd /cluster/data/genbank ./bin/gbAlignStep -orgCat=native -initial -srcDb=refseq cavPor3 ssh hgwdev cd /cluster/data/genbank ./bin/gbDbLoadStep -byPassGbLoaded -reload -srcDb=refseq cavPor3 ############################################################################ # liftOver for hg19 (DONE - 2011-07-14 - Hiram) cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain ln -s `pwd`/cavPor3.hg19.over.chain.gz \ /gbdb/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz hgAddLiftOverChain -minMatch=0.1 -multiple \ -path=/gbdb/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz \ cavPor3 hg19 ln -s `pwd`/cavPor3.hg19.over.chain.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz cd /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/liftOver md5sum cavPor3ToHg19.over.chain.gz >> md5sum.txt ############################################################################ # SWAP lastz mm10 (DONE - 2012-03-19 - Hiram) # original alignment to mm10 cat 
/hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/fb.mm10.chainCavPor3Link.txt # 754642254 bases of 2652783500 (28.447%) in intersection # and this swap mkdir /hive/data/genomes/cavPor3/bed/blastz.mm10.swap cd /hive/data/genomes/cavPor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 80m23.870s cat fb.cavPor3.chainMm10Link.txt # 775452752 bases of 2663369733 (29.115%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cavPor3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # create ucscToINSDC name mapping (DONE - 2013-08-16 - Hiram) mkdir /hive/data/genomes/cavPor3/bed/ucscToINSDC cd /hive/data/genomes/cavPor3/bed/ucscToINSDC XXX - this one has problems, needs to be fixed # copying these scripts from the previous load and improving them # with each instance ./translateNames.sh ./verifyAll.sh ./join.sh # verify the track link to INSDC functions ############################################################################## ############################################################################## # TransMap V3 tracks. 
see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## ############################################################################## # LASTZ Guinea pig/cavPor3 - Malayan flying lemur/galVar1 # (DONE - 2016-04-15 - Hiram) mkdir /hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15 cd /hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15 printf '# Malayan flying lemur vs Guinea pig BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.54/bin/lastz # TARGET: Guinea pig cavPor3 SEQ1_DIR=/hive/data/genomes/cavPor3/cavPor3.wmTrf.2bit SEQ1_LEN=/hive/data/genomes/cavPor3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Malayan flying lemur galVar1 SEQ2_DIR=/hive/data/genomes/galVar1/galVar1.2bit SEQ2_LEN=/hive/data/genomes/galVar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # cat fb.cavPor3.chainTupBel1Link.txt cat fb.cavPor3.chainGalVar1Link.txt # 1300265220 bases of 2663369733 (48.820%) in intersection time (doRecipBest.pl -buildDir=`pwd` cavPor3 galVar1) > rbest.log 2>&1 & # real 161m43.865s # and for the swap: mkdir /hive/data/genomes/galVar1/bed/blastz.cavPor3.swap cd /hive/data/genomes/galVar1/bed/blastz.cavPor3.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 355m25.890s cat fb.galVar1.chainCavPor3Link.txt # 1357961680 bases of 2802917674 (48.448%) in intersection time (doRecipBest.pl -buildDir=`pwd` galVar1 cavPor3) > rbest.log 2>&1 # real 231m8.085s 
############################################################################## # LASTZ Guinea pig/cavPor3 - Tree shrew/tupBel1 # (DONE - 2016-04-15 - Hiram) mkdir /hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15 cd /hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15 printf '# Tree shrew vs Guinea pig BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Guinea pig cavPor3 SEQ1_DIR=/hive/data/genomes/cavPor3/cavPor3.wmTrf.2bit SEQ1_LEN=/hive/data/genomes/cavPor3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Tree shrew tupBel1 SEQ2_DIR=/hive/data/genomes/tupBel1/tupBel1.2bit SEQ2_LEN=/hive/data/genomes/tupBel1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 597m13.526s cat fb.cavPor3.chainTupBel1Link.txt # 745589338 bases of 2663369733 (27.994%) in intersection time (doRecipBest.pl -buildDir=`pwd` cavPor3 tupBel1) > rbest.log 2>&1 & # real 187m21.483s # and for the swap: mkdir /hive/data/genomes/tupBel1/bed/blastz.cavPor3.swap cd /hive/data/genomes/tupBel1/bed/blastz.cavPor3.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 256m16.390s cat fb.tupBel1.chainCavPor3Link.txt # 747820632 bases of 2137225476 (34.990%) in intersection time (doRecipBest.pl -buildDir=`pwd` tupBel1 cavPor3) > rbest.log 2>&1 # real 262m6.256s ############################################################################# ## 5-Way Multiz (DONE - 2015-05-03 - Hiram) ssh hgwdev mkdir /hive/data/genomes/cavPor3/bed/multiz5way cd 
/hive/data/genomes/cavPor3/bed/multiz5way # from the 191-way in the source tree, select out the 5 used here: /cluster/bin/phast/tree_doctor \ --prune-all-but hg38,cavPor3,galVar1,mm10,tupBel1 \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/191way.nh \ > cavPor3.5way.nh.0 cat cavPor3.5way.nh.0 # ((hg38:0.143908,tupBel1:0.191140):0.002000, # (mm10:0.315424,cavPor3:0.175779):0.041059); # using TreeGraph2 tree editor on the Mac, rearrange to get cavPor3 # at the top: # what that looks like: ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.5way.nh | sed -e 's/^/# /;' # ((cavPor3:0.175779, # mm10:0.315424):0.041059, # (hg38:0.143908, # (tupBel1:0.136203, # galVar1:0.08):0.054937):0.0020); # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ cavPor3.5way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ | sed -e 's/-nosed/_nosed/; s/-eating/_eating/;' > db.to.name.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.name.txt`" cavPor3.5way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.commonNames.nh cat cavPor3.5way.commonNames.nh | sed -e 's/^/# /;' # ((Guinea_pig:0.175779, # Mouse:0.315424):0.041059, # (Human:0.143908, # (Tree_shrew:0.136203, # Malayan_flying_lemur:0.08):0.054937):0.002); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/cavPor3_5way.png ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.5way.nh > t.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ | 
$HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.scientificNames.nh rm -f t.nh cat cavPor3.5way.scientificNames.nh | sed -e 's/^/# /;' # ((Cavia_porcellus:0.175779, # Mus_musculus:0.315424):0.041059, # (Homo_sapiens:0.143908, # (Tupaia_belangeri:0.136203, # Galeopterus_variegatus:0.08):0.054937):0.002); /cluster/bin/phast/all_dists cavPor3.5way.nh | grep cavPor3 \ | sed -e "s/cavPor3.//" | sort -k2n > 5way.distances.txt # Use this output to create the table below cat 5way.distances.txt | sed -e 's/^/# /;' # galVar1 0.353775 # hg38 0.362746 # tupBel1 0.409978 # mm10 0.491203 printf '#!/usr/bin/env perl use strict; use warnings; open (FH, "<5way.distances.txt") or die "can not read 5way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($D, $dist) = split('"'"'\\s+'"'"', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/cavPor3/bed/lastz.$D/fb.cavPor3." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\\%%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.cavPor3/fb.${D}.chainCavPor3Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\\%%//; } my $orgName= `hgsql -N -e '"'"'select organism from dbDb where name="$D";'"'"' hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %%02d %%.4f (%%%% %%06.3f) (%%%% %%06.3f) - %%s %%s\\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); ' > sizeStats.pl chmod +x ./sizeStats.pl ./sizeStats.pl # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # 
chainLink # N distance on cavPor3 on other other species # 01 0.3538 (% 48.820) (% 48.448) - Malayan flying lemur galVar1 # 02 0.3627 (% 48.000) (% 42.371) - Human hg38 # 03 0.4100 (% 27.994) (% 34.990) - Tree shrew tupBel1 # 04 0.4912 (% 29.115) (% 28.447) - Mouse mm10 # None of this concern for distances matters in building the first step, the # maf files. The distances will be better calibrated later. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ cavPor3.5way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # cavPor3 mm10 hg38 tupBel1 galVar1 # survey N50 for each for db in `cat species.list` do n50.pl /hive/data/genomes/$db/chrom.sizes done # reading: /hive/data/genomes/cavPor3/chrom.sizes # contig count: 3144, total size: 2723219641, one half size: 1361609820 # cumulative N50 count contig contig size 1356838683 27 scaffold_25 28222655 1361609820 one half size 1384780737 28 scaffold_27 27942054 # reading: /hive/data/genomes/mm10/chrom.sizes # contig count: 66, total size: 2730871774, one half size: 1365435887 # cumulative N50 count contig contig size 1312176979 8 chr7 145441459 1365435887 one half size 1442871972 9 chr10 130694993 # reading: /hive/data/genomes/hg38/chrom.sizes # contig count: 455, total size: 3209286105, one half size: 1604643052 # cumulative N50 count contig contig size 1547391171 8 chrX 156040895 1604643052 one half size 1692529807 9 chr8 145138636 # reading: /hive/data/genomes/tupBel1/chrom.sizes # contig count: 150851, total size: 3660774957, one half size: # 1830387478 # cumulative N50 count contig contig size 1830345737 7910 scaffold_129972.1-127906 127906 1830387478 one half size 1830473625 7911 scaffold_147844.1-127888 127888 # reading: /hive/data/genomes/galVar1/chrom.sizes # contig count: 179514, total size: 3187660572, one half size: # 1593830286 # cumulative N50 count contig contig size 1593691350 3422 
NW_007730159v1 245222 1593830286 one half size 1593936539 3423 NW_007729331v1 245189 # bash shell syntax here ... cd /hive/data/genomes/cavPor3/bed/multiz5way export H=/hive/data/genomes/cavPor3/bed mkdir mafLinks # good assemblies can use syntenic net: # hg38 mm10 cavPor3 for G in hg38 mm10 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/axtChain/cavPor3.${G}.synNet.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/axtChain/cavPor3.${G}.synNet.maf.gz ./mafLinks/$G done # other assemblies using recip best net: # tupBel1 for G in tupBel1 galVar1 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/mafRBestNet/cavPor3.${G}.rbest.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/mafRBestNet/cavPor3.${G}.rbest.maf.gz ./mafLinks/$G done # verify the symLinks are good: ls -ogrtL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' # 549742600 Mar 19 2012 mafLinks/mm10/cavPor3.mm10.synNet.maf.gz # 930458998 Apr 29 2015 mafLinks/hg38/cavPor3.hg38.synNet.maf.gz # 914003647 Apr 18 11:56 mafLinks/galVar1/cavPor3.galVar1.rbest.maf.gz # 535098561 Apr 18 12:25 mafLinks/tupBel1/cavPor3.tupBel1.rbest.maf.gz # split the maf files into a set of hashed named files # this hash named split keeps the same chr/contig names in the same # named hash file. mkdir /hive/data/genomes/cavPor3/bed/multiz5way/mafSplit cd /hive/data/genomes/cavPor3/bed/multiz5way/mafSplit time for D in `sed -e "s/cavPor3 //" ../species.list` do echo "${D}" mkdir $D cd $D echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz" mafSplit -byTarget -useHashedName=8 /dev/null . \ ../../mafLinks/${D}/*.maf.gz cd .. done # real 2m20.137s # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | wc -l # 798 find . 
-type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 246 maf.list mkdir /hive/data/genomes/cavPor3/bed/multiz5way/splitRun cd /hive/data/genomes/cavPor3/bed/multiz5way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # verify the db and pairs settings are correct printf '#!/bin/csh -ef set db = cavPor3 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/cavPor3/bed/multiz5way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp ' > autoMultiz.csh chmod +x autoMultiz.csh printf '#LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/cavPor3/bed/multiz5way/splitRun/maf/$(root1).maf} #ENDLOOP ' > template ln -s ../../mafSplit/maf.list maf.list ssh ku cd /hive/data/genomes/cavPor3/bed/multiz5way/splitRun/run gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc... 
# Completed: 246 of 246 jobs # CPU time in finished jobs: 63385s 1056.41m 17.61h 0.73d 0.002 y # IO & Wait Time: 674s 11.24m 0.19h 0.01d 0.000 y # Average job time: 260s 4.34m 0.07h 0.00d # Longest finished job: 2448s 40.80m 0.68h 0.03d # Submission to last job: 2479s 41.32m 0.69h 0.03d # combine into one file (the 1>&2 redirect sends the echo to stderr) cd /hive/data/genomes/cavPor3/bed/multiz5way head -1 splitRun/maf/020.maf > multiz5way.maf time for F in splitRun/maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} done >> multiz5way.maf # real 0m37.210s tail -1 splitRun/maf/020.maf >> multiz5way.maf # -rw-rw-r-- 1 8260142703 May 3 15:40 multiz5way.maf # Load into database ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz5way mkdir /gbdb/cavPor3/multiz5way ln -s `pwd`/multiz5way.maf /gbdb/cavPor3/multiz5way cd /dev/shm time hgLoadMaf cavPor3 multiz5way # Loaded 8610698 mafs in 1 files from /gbdb/cavPor3/multiz5way # real 2m37.904s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 cavPor3 multiz5waySummary \ /gbdb/cavPor3/multiz5way/multiz5way.maf # Created 844437 summary blocks from 19057881 components and 8610698 mafs from /gbdb/cavPor3/multiz5way/multiz5way.maf # real 3m1.052s # -rw-rw-r-- 1 479126012 May 3 15:42 multiz5way.tab # -rw-rw-r-- 1 42429421 May 3 15:47 multiz5waySummary.tab wc -l multiz5way*.tab # 8610698 multiz5way.tab # 844437 multiz5waySummary.tab rm multiz5way*.tab ############################################################################## # GAP ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2015-05-03 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file.
Need to split of the maf file into individual # maf files mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/anno/mafSplit cd /hive/data/genomes/cavPor3/bed/multiz5way/anno/mafSplit time mafSplit -outDirDepth=2 -byTarget -useFullSequenceName \ /dev/null . ../../multiz5way.maf # real 2m41.134s find . -type f | wc -l # 1082 # check for N.bed files everywhere: cd /hive/data/genomes/cavPor3/bed/multiz5way/anno for DB in `cat ../species.list` do if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" # cd /hive/data/genomes/${DB} # twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/cavPor3/bed/multiz5way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S gapAnno # use a screen to control this longish job ssh ku cd /hive/data/genomes/cavPor3/bed/multiz5way/anno mkdir result find ./mafSplit -type d | sed -e 's#./mafSplit/##' | while read D do echo mkdir -p result/${D} mkdir -p result/${D} done printf '#LOOP mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/cavPor3/cavPor3.2bit {check out exists+ result/$(path1)} #ENDLOOP ' > template find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list gensub2 maf.list single template jobList # limit jobs on a node with the ram=32g requirement because they go fast para -ram=32g create jobList para try ... check ... push ...
# Completed: 1082 of 1082 jobs # CPU time in finished jobs: 1795s 29.92m 0.50h 0.02d 0.000 y # IO & Wait Time: 2812s 46.87m 0.78h 0.03d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 42s 0.70m 0.01h 0.00d # Submission to last job: 124s 2.07m 0.03h 0.00d # verify all result files have some content, look for 0 size files: find ./result -type f -size 0 # should see none # or in this manner: find ./result -type f | xargs ls -og | sort -k3nr | tail # combine into one file (the 1>&2 redirect sends the echo to stderr) head -q -n 1 result/1/1/scaffold_326.maf > cavPor3.5way.maf time find ./result -type f | while read F do echo "${F}" 1>&2 grep -h -v "^#" ${F} done >> cavPor3.5way.maf # real 29m25.870s # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/1/1/scaffold_326.maf >> cavPor3.5way.maf # How about an official end marker: echo "##eof maf" >> cavPor3.5way.maf ls -og # -rw-rw-r-- 1 11434713982 May 4 10:01 cavPor3.5way.maf du -hsc cavPor3.5way.maf # 11G cavPor3.5way.maf # construct symlinks to get the individual maf files into gbdb: rm /gbdb/cavPor3/multiz5way/multiz5way.maf # remove previous results ln -s `pwd`/cavPor3.5way.maf /gbdb/cavPor3/multiz5way/multiz5way.maf # Load into database cd /dev/shm time hgLoadMaf -pathPrefix=/gbdb/cavPor3/multiz5way cavPor3 multiz5way # Loaded 9842095 mafs in 1 files from /gbdb/cavPor3/multiz5way # real 3m21.378s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 cavPor3 multiz5waySummary \ /gbdb/cavPor3/multiz5way/multiz5way.maf # Created 844437 summary blocks from 19057881 components and 9842095 mafs from /gbdb/cavPor3/multiz5way/multiz5way.maf # real 3m41.604s # -rw-rw-r-- 1 549110793 May 4 10:05 multiz5way.tab # -rw-rw-r-- 1 44118295 May 4 10:11 multiz5waySummary.tab rm multiz5way*.tab ###################################################################### # MULTIZ5WAY MAF FRAMES (DONE - 2016-05-05 - Hiram) ssh hgwdev mkdir
/hive/data/genomes/cavPor3/bed/multiz5way/frames cd /hive/data/genomes/cavPor3/bed/multiz5way/frames # survey all the genomes to find out what kinds of gene tracks they have printf '#!/bin/csh -fe foreach db (`cat ../species.list`) printf "# ${db}: " set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "mgcGenes" || $table == "knownGene" || \ $table == "ncbiRefSeq" || $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='"'"'$db'"'"'"` set orgId = `hgsql $db -N -e \ "select id from organism where name='"'"'$orgName'"'"'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql $db -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end ' > showGenes.csh # << happy emacs chmod +x ./showGenes.csh time ./showGenes.csh # cavPor3: ensGene: 26129, refGene: 488, xenoRefGene: 296142, Mrnas: 21235 # mm10: ensGene: 103734, knownGene: 63759, mgcGenes: 26768, ncbiRefSeq: # 116846, refGene: 35838, xenoRefGene: 172783, Mrnas: 5250157 # hg38: ensGene: 208239, knownGene: 197782, mgcGenes: 35390, ncbiRefSeq: # 161913, refGene: 63332, xenoRefGene: 181602, Mrnas: 11351569 # tupBel1: ensGene: 34727, xenoRefGene: 688682, Mrnas: 2509 # galVar1: ncbiRefSeq: 41547, xenoRefGene: 473670, Mrnas: 0 # real 2m47.018s # from that summary, use these gene sets: # knownGene - hg38 mm10 # ensGene - tupBel1 cavPor3 # ncbiRefSeq - galVar1 mkdir genes # 1. 
knownGene: hg38 mm10 for DB in hg38 mm10 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/${DB}.gp.gz printf "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # checked: 21375 failed: 0 # checked: 21100 failed: 0 # 2. ensGene: tupBel1 cavPor3 for DB in tupBel1 cavPor3 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz printf "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # tupBel1: checked: 29256 failed: 0 # cavPor3: checked: 18631 failed: 0 # 3. ncbiRefSeq for galVar1 for DB in galVar1 do hgsql -N -e "select * from ncbiRefSeq" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz printf "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # galVar1: checked: 23389 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do printf "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/cavPor3.gp.gz: 18631 # genes/galVar1.gp.gz: 23054 # genes/hg38.gp.gz: 21375 # genes/mm10.gp.gz: 21100 # genes/tupBel1.gp.gz: 15407 time (cat ../anno/cavPor3.5way.maf \ | genePredToMafFrames cavPor3 stdin stdout \ `cat ../species.list.txt | xargs echo \ |sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g;"` \ | gzip > multiz5wayFrames.bed.gz) # real 2m38.685s # verify there are frames on everything, should be 5 species: zcat multiz5wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' # 185620 cavPor3 # 246289 galVar1 # 245505 hg38 # 234762 mm10 # 201249 tupBel1 # load the resulting file ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz5way/frames time hgLoadMafFrames cavPor3 multiz5wayFrames multiz5wayFrames.bed.gz # real 
0m12.730s time featureBits -countGaps cavPor3 multiz5wayFrames # 36929844 bases of 2723219641 (1.356%) in intersection # real 0m10.136s # enable the trackDb entries: # frames multiz5wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 5-way (DONE - 2016-05-06 - Hiram) mkdir /hive/data/genomes/cavPor3/bed/multiz5way/4d cd /hive/data/genomes/cavPor3/bed/multiz5way/4d # using the ensGene hgsql -N -e "select * from ensGene;" cavPor3 \ | cut -f2- | genePredSingleCover stdin stdout > /dev/shm/cavPor3.tmp.gp mv /dev/shm/cavPor3.tmp.gp cavPor3.ensGene.gp genePredCheck -db=cavPor3 cavPor3.ensGene.gp # checked: 18631 failed: 0 genePredSingleCover cavPor3.ensGene.gp stdout \ | sort > cavPor3.ensGeneNR.gp genePredCheck -db=cavPor3 cavPor3.ensGeneNR.gp # checked: 18631 failed: 0 # the annotated maf is: og ../anno/cavPor3.5way.maf # -rw-rw-r-- 1 11434713982 May 4 10:01 ../anno/cavPor3.5way.maf mkdir annoSplit cd annoSplit time mafSplit -verbose=2 -outDirDepth=2 -byTarget -useFullSequenceName \ /dev/null . ../../anno/cavPor3.5way.maf # real 3m39.018s find . 
-type f | wc -l # 1082 ssh ku mkdir /hive/data/genomes/cavPor3/bed/multiz5way/4d/run cd /hive/data/genomes/cavPor3/bed/multiz5way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fex set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set GP = cavPor3.ensGeneNR.gp set r = "/hive/data/genomes/cavPor3/bed/multiz5way" set c = $1 set infile = $r/4d/$2 set outDir = $r/4d/$3 set outfile = $r/4d/run/$4 /bin/mkdir -p $outDir cd /dev/shm /bin/awk -v C=$c '$2 == C {print}' $r/4d/$GP | sed -e "s/\t$c\t/\tcavPor3.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` echo $NL if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile else echo "" > $outfile endif /bin/rm -f /dev/shm/$c.gp /dev/shm/$c.ss _EOF_ # << happy emacs chmod +x 4d.csh find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list wc -l maf.list # 933 maf.list printf '#LOOP 4d.csh $(root1) annoSplit/$(dir1)/$(file1) mfa/$(dir1) {check out line+ ../mfa/$(dir1)/$(root1).mfa} #ENDLOOP ' > template gensub2 maf.list single template jobList para create jobList para try ...
check para time # Completed: 917 of 933 jobs # Crashed: 16 jobs # CPU time in finished jobs: 622s 10.37m 0.17h 0.01d 0.000 y # IO & Wait Time: 2571s 42.84m 0.71h 0.03d 0.000 y # Average job time: 3s 0.06m 0.00h 0.00d # Longest finished job: 23s 0.38m 0.01h 0.00d # Submission to last job: 41s 0.68m 0.01h 0.00d # the crashed jobs appear to be single genes on a single scaffold and they # do not make a .ss output # Not all results have contents, that is OK # combine mfa files ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz5way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # sometimes this doesn't work, don't know why find ./mfa -type f -size 1 | xargs rm -f # when it doesn't, use this empty list procedure find ./mfa -type f | xargs ls -og | awk '$3 < 2' | awk '{print $NF}' \ > empty.list cat empty.list | xargs rm -f # see what is left: ls -ogrt mfa/*/*/*.mfa | sort -k3nr | wc # 298 2086 17178 # want comma-less species.list time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*/*/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # real 0m1.411s # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 6 grep "^>" 4d.all.mfa | sed -e 's/^/# /;' # >cavPor3 # >mm10 # >hg38 # >tupChi1 # >tupBel1 # >galVar1 sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../cavPor3.5way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh # tree_commas.nh looks like: # ((cavPor3,mm10),(hg38,((tupChi1,tupBel1),galVar1))) # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree_commas.nh 4d.all.mfa # real 0m1.991s mv phyloFit.mod all.mod grep TREE all.mod # TREE: # ((cavPor3:0.216074,mm10:0.289999):0.0256454,(hg38:0.119014, # ((tupChi1:0.00562878,tupBel1:0.00977264):0.178737, # galVar1:0.104398):0.00894324):0.0256454); # compare 
these calculated lengths to the tree extracted from 183way: grep TREE all.mod | sed -e 's/TREE: //' \ | /cluster/bin/phast/all_dists /dev/stdin | grep cavPor3 \ | sed -e "s/cavPor3.//;" | sort > new.dists /cluster/bin/phast/all_dists ../cavPor3.5way.nh | grep cavPor3 \ | sed -e "s/cavPor3.//;" | sort > old.dists # printing out the 'new', the 'old' the 'difference' and percent difference join new.dists old.dists | awk '{ printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' \ | sort -k3n # galVar1 0.383228 0.353775 0.029453 8.325348 # hg38 0.390633 0.362746 0.027887 7.687748 # tupBel1 0.464419 0.409978 0.054441 13.279005 # mm10 0.509725 0.491203 0.018522 3.770742 # compare this tree with the one generated from the galVar1 5-way grep TREE all.mod | sed -e 's/TREE: //' \ | /cluster/bin/phast/all_dists /dev/stdin | grep galVar1 \ | sed -e "s/galVar1.//" | sort > cavPor3.galVar1.dists XXX - waiting for galVar1 to fini - Thu May 5 16:22:43 PDT 2016 grep TREE /hive/data/genomes/galVar1/bed/multiz5way/4d/all.mod \ | sed -e 's/TREE: //' \ | /cluster/bin/phast/all_dists /dev/stdin | grep galVar1 \ | sed -e "s/galVar1.//;" | sort > galVar1.galVar1.dists # appears to be systematically longer branch lengths with this cavPor3 # reference, all by a similar percentage: # table headings: # species cavPor3 galVar1 cavPor3-galVar1 percent # dist est. dist est. 
difference difference join cavPor3.galVar1.dists galVar1.galVar1.dists | awk '{ printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' \ | sort -k3n # hg38 0.231963 0.227001 0.004962 2.185893 # tupBel1 0.287901 0.280636 0.007265 2.588763 # cavPor3 0.383228 0.370042 0.013186 3.563379 # mm10 0.455383 0.442507 0.012876 2.909784 ######################################################################### # phastCons 5-way (DONE - 2016-05-06 - Hiram) # split 5way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh ku mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS mkdir result done printf '#!/bin/csh -ef set d = $1 set c = $2 set doneDir = done/$d set MAF = /hive/data/genomes/cavPor3/bed/multiz5way/anno/result/$d/$c.maf set WINDOWS = /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS/result/$d/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $3 ) then exit 0 endif if ( -s $3.running ) then exit 0 endif /bin/mkdir -p $doneDir /bin/date >> $3.running /bin/rm -fr $WINDOWS /bin/mkdir -p $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \\ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null /bin/date >> $3 /bin/rm -f $3.running ' > mkSS.csh chmod +x mkSS.csh printf '#LOOP mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)} #ENDLOOP ' > template find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list wc -l maf.list # 1082 maf.list ssh ku cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS gensub2 maf.list single template jobList # beware overwhelming the cluster with these quick high I/O jobs para create jobList para try ... check ... 
etc para -maxJob=64 push # Completed: 1082 of 1082 jobs # CPU time in finished jobs: 964s 16.06m 0.27h 0.01d 0.000 y # IO & Wait Time: 2888s 48.14m 0.80h 0.03d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 33s 0.55m 0.01h 0.00d # Submission to last job: 89s 1.48m 0.02h 0.00d find ./result -type f | wc -l # 800 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/cons/run.cons cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. Running: # all and vertebrates printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set d = $2 set f = $3 set len = $4 set cov = $5 set rho = $6 set grp = $cwd:t set cons = /hive/data/genomes/cavPor3/bed/multiz5way/cons set tmp = $cons/tmp/${d}_${c} mkdir -p $tmp set ssSrc = $cons/SS/result set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$d/$f $tmp else ln -s $ssSrc/$d/$f $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f $useGrp \\ --rho $rho --expected-length $len --target-coverage $cov --quiet \\ --not-informative `cat $grp.non-inf` \\ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp else $PHASTBIN/phastCons $f $useGrp \\ --rho $rho --expected-length $len --target-coverage $cov --quiet \\ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp endif popd > /dev/null mkdir -p pp/$d bed/$d sleep 4 touch pp/$d bed/$d rm -f pp/$d/$c.pp rm -f bed/$d/$c.bed mv $tmp/$c.pp pp/$d mv $tmp/$c.bed bed/$d rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h ' > 
doPhast.csh chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix printf '#LOOP ../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp} #ENDLOOP ' > template find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list wc -l ss.list # 800 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/cavPor3/bed/multiz5way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=32g create jobList para try ... check ... para push # Completed: 800 of 800 jobs # CPU time in finished jobs: 4961s 82.68m 1.38h 0.06d 0.000 y # IO & Wait Time: 5314s 88.57m 1.48h 0.06d 0.000 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest finished job: 32s 0.53m 0.01h 0.00d # Submission to last job: 105s 1.75m 0.03h 0.00d # create Most Conserved track cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/all time cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/?/?/${C} 2> /dev/null | while read D do echo ${D}/${C}*.bed 1>&2 cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 1m8.280s time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ > mostConserved.bed # real 0m13.098s # -rw-rw-r-- 1 40630220 May 6 10:26 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/all time hgLoadBed cavPor3 phastConsElements5way mostConserved.bed # Read 1019383 elements of size 5 from mostConserved.bed # real 0m10.697s # on human we often try for 5% overall cov, and 70% CDS cov # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits cavPor3 -enrichment ensGene:cds phastConsElements5way # ensGene:cds 1.119%, phastConsElements5way 4.394%, both 0.818%, # cover 73.11%, enrich 16.64x # real 
0m14.013s # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/all mkdir downloads # the third sed fixes the chrom names, removing the partition extensions time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ | gzip -c > downloads/phastCons5way.wigFix.gz) # real 20m14.674s # -rw-rw-r-- 1 1844583276 May 6 10:49 phastCons5way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons5way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \ phastCons5way.bw) > bigWig.log 2>&1 tail bigWig.log # pid=54646: VmPeak: 16771468 kB # real 25m26.987s bigWigInfo phastCons5way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 2,832,765,911 # primaryIndexSize: 75,227,328 # zoomLevels: 10 # chromCount: 625 # basesCovered: 1,531,042,459 # mean: 0.139598 # min: 0.000000 # max: 1.000000 # std: 0.262845 # encode those files into wiggle data time (zcat downloads/phastCons5way.wigFix.gz \ | wigEncode stdin phastCons5way.wig phastCons5way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 8m33.871s du -hsc *.wi? # 1.5G phastCons5way.wib # 244M phastCons5way.wig # Load gbdb and database with wiggle. 
ln -s `pwd`/phastCons5way.wib /gbdb/cavPor3/multiz5way/phastCons5way.wib time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz5way \ cavPor3 phastCons5way phastCons5way.wig # real 0m29.220s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh cavPor3 phastCons5way # db.table min max mean count sumData # cavPor3.phastCons5way 0 1 0.139598 1531042459 2.1373e+08 # stdDev viewLimits # 0.262845 viewLimits=0:1 # Create histogram to get an overview of all the data time hgWiggle -doHistogram -db=cavPor3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons5way > histogram.data 2>&1 # real 1m45.784s # create plot of histogram: printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Guinea pig cavPor3 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \\ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ' | gnuplot > histo.png display histo.png & ######################################################################### # phyloP for 5-way (DONE - 2016-05-09 - Hiram) # run phyloP with score=LRT ssh ku mkdir /cluster/data/cavPor3/bed/multiz5way/consPhyloP cd /cluster/data/cavPor3/bed/multiz5way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.569 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../4d/all.mod 0.569 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.215500 0.284500 0.284500 0.215500 
printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set d = $f:h set file1 = $f:t set out = $2 set cName = $f:t:r set grp = $cwd:t set cons = /hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "/hive/data/genomes/cavPor3/bed/multiz5way/cons/SS/result/$f" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \\ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$file1.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp ' > doPhyloP.csh chmod +x doPhyloP.csh # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list # make sure the list looks good wc -l ss.list # 800 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix printf '#LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP ' > template ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP/all cd /hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overwhelming the cluster with these fast running high I/O jobs para create jobList para try ... check ... push ... etc ... 
para -maxJob=53 push para time > run.time # Completed: 798 of 800 jobs # Crashed: 2 jobs # CPU time in finished jobs: 1334s 22.23m 0.37h 0.02d 0.000 y # IO & Wait Time: 5599s 93.32m 1.56h 0.06d 0.000 y # Average job time: 9s 0.14m 0.00h 0.00d # Longest finished job: 15s 0.25m 0.00h 0.00d # Submission to last job: 88s 1.47m 0.02h 0.00d # the two failed jobs actually finished, their error was at the end # of the script: # /bin/rmdir: failed to remove `/hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP/tmp/all/7/4/scaffold_13': No such file or directory mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP5way.wigFix.gz) # real 19m35.007s # check integrity of data with wigToBigWig time (zcat downloads/phyloP5way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \ phyloP5way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=24836: VmPeak: 16771468 kB # real 25m9.670s bigWigInfo phyloP5way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 2,473,672,935 # primaryIndexSize: 75,227,328 # zoomLevels: 10 # chromCount: 625 # basesCovered: 1,531,042,459 # mean: 0.092304 # min: -2.562000 # max: 0.863000 # std: 0.610474 # encode those files into wiggle data time (zcat downloads/phyloP5way.wigFix.gz \ | wigEncode stdin phyloP5way.wig phyloP5way.wib) # Converted stdin, upper limit 0.86, lower limit -2.56 # real 8m56.205s du -hsc *.wi? # 1.5G phyloP5way.wib # 249M phyloP5way.wig # Load gbdb and database with wiggle. 
ln -s `pwd`/phyloP5way.wib /gbdb/cavPor3/multiz5way/phyloP5way.wib time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz5way cavPor3 \ phyloP5way phyloP5way.wig # real 0m30.072s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh cavPor3 phyloP5way # db.table min max mean count sumData # cavPor3.phyloP5way -2.562 0.863 0.0923044 1531042459 1.41322e+08 # stdDev viewLimits # 0.610474 viewLimits=-2.562:0.863 # that range is: 0.863+2.562 = 3.425 for hBinSize=0.003425 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.003425 -hBinCount=1000 -hMinVal=-2.562 -verbose=2 \ -db=cavPor3 phyloP5way > histogram.data 2>&1 XXX - running - Fri May 6 21:25:03 PDT 2016 # real 1m49.237s # find the Y range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ | sed -e 's/^/# /;' # Q1 0.000034 # median 0.000279 # Q3 0.001310 # average 0.001110 # min 0.000000 # max 0.017827 # count 901 # total 0.999993 # standard deviation 0.002086 # find the X range for the 2:5 graph grep "^[0-9]" histogram.data | ave -col=2 stdin \ | sed -e 's/^/# /;' # Q1 -1.755415 # median -0.986500 # Q3 -0.210737 # average -0.945154 # min -2.562000 # max 0.801350 # count 901 # total -851.583895 # standard deviation 0.953370 # create plot of histogram: printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Guinea pig cavPor3 Histogram phyloP5way track" set xlabel " phyloP5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set xtics set xrange [-2.6:0.9] set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ' | gnuplot > histo.png display histo.png & # appears to have an 
odd hole in the data near X=0 ? ############################################################################# # construct download files for 5-way (DONE - 2016-05-09 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way mkdir /hive/data/genomes/cavPor3/bed/multiz5way/downloads cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads mkdir multiz5way phastCons5way phyloP5way cd multiz5way time cp -p ../../anno/cavPor3.5way.maf . # real 2m22.776s # -rw-rw-r-- 1 84345995664 Apr 14 09:27 cavPor3.5way.maf du -hsc * # 79G cavPor3.5way.maf time gzip *.maf # real 87m37.865s # -rw-rw-r-- 1 10893506431 Apr 14 09:27 cavPor3.5way.maf.gz grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh cavPor3.5way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh cavPor3.5way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.scientificNames.nh time md5sum *.nh *.maf.gz > md5sum.txt # real 0m35.144s ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way du -hsc *.maf.gz ../../anno/cavPor3.5way.maf # 11G cavPor3.5way.maf.gz # 79G ../../anno/cavPor3.5way.maf # obtain the README.txt from cavPor3/multiz17way and update for this # situation ##################################################################### cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phastCons5way ln -s ../../cons/all/downloads/phastCons5way.wigFix.gz \ ./cavPor3.phastCons5way.wigFix.gz ln -s ../../cons/all/phastCons5way.bw ./cavPor3.phastCons5way.bw ln -s ../../cons/all/all.mod ./cavPor3.phastCons5way.mod time md5sum *.gz *.mod *.bw > md5sum.txt # real 0m20.354s # obtain the 
README.txt from hg38/phastCons5way and update for this # situation ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way ##################################################################### cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phyloP5way ln -s ../../consPhyloP/all/downloads/phyloP5way.wigFix.gz \ ./cavPor3.phyloP5way.wigFix.gz ln -s ../../consPhyloP/run.phyloP/all.mod cavPor3.phyloP5way.mod ln -s ../../consPhyloP/all/phyloP5way.bw cavPor3.phyloP5way.bw time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m29.662s # obtain the README.txt from cavPor3/phyloP17way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way ############################################################################# # hgPal downloads (DONE - 2016-05-09 - Hiram) # FASTA from 5-way for knownGene, refGene and knownCanonical ssh hgwdev screen -S cavPor3HgPal mkdir /hive/data/genomes/cavPor3/bed/multiz5way/pal cd /hive/data/genomes/cavPor3/bed/multiz5way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list # this for loop takes about 2.5 hours on this large count contig assembly export mz=multiz5way export gp=ensGene export db=cavPor3 export I=0 export D=0 mkdir exonAA exonNuc printf '#!/bin/sh\n' > $gp.jobs time for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` D=`echo $D | awk '{print $1+1}'` dNum=`echo $D | awk '{printf "%03d", int($1/1000)}'` mkdir -p exonNuc/${dNum} > /dev/null mkdir -p exonAA/${dNum} > /dev/null echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &" if [ $I -gt 16 ]; then echo "date" echo "wait" I=0 fi done >> $gp.jobs # real 1m25.411s echo "date" >> $gp.jobs echo "wait" >> $gp.jobs chmod +x $gp.jobs time (./$gp.jobs) > 
$gp.jobs.log 2>&1 # real 7m28.075s export mz=multiz5way export gp=ensGene time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m14.448s time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonNuc.fa.gz # real 0m42.669s # -rw-rw-r-- 1 30006064 May 9 10:12 ensGene.multiz5way.exonAA.fa.gz # -rw-rw-r-- 1 47566626 May 9 10:13 ensGene.multiz5way.exonNuc.fa.gz export mz=multiz5way export gp=ensGene export db=cavPor3 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # wiki page for 5-way (DONE - 2016-05-09 - Hiram) mkdir /hive/users/hiram/bigWays/cavPor3.5way cd /hive/users/hiram/bigWays echo "cavPor3" > cavPor3.5way/ordered.list awk '{print $1}' /hive/data/genomes/cavPor3/bed/multiz5way/5way.distances.txt \ >> cavPor3.5way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. 
./sizeStats.sh cavPor3.5way/ordered.list # dbDb.sh constructs cavPor3.5way/CavPor3_5-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list ./dbDb.sh cavPor3 5way # sizeStats.pl constructs cavPor3.5way/CavPor3_5-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl cavPor3 5way # defCheck.pl constructs CavPor3_5-way_conservation_lastz_parameters.html ./defCheck.pl cavPor3 5way # this constructs the html pages in cavPor3.5way/: # -rw-rw-r-- 1 3287 May 9 10:24 CavPor3_5-way_conservation_alignment.html # -rw-rw-r-- 1 4805 May 9 10:25 CavPor3_5-way_Genome_size_statistics.html # -rw-rw-r-- 1 3293 May 9 10:25 CavPor3_5-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # CavPor3_5-way_conservation_alignment # CavPor3_5-way_Genome_size_statistics # CavPor3_5-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. 
############################################################################ ## create upstream refGene maf files (DONE - 2016-05-09 - Hiram) mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way # bash script #!/bin/sh export geneTbl="ensGene" for S in 1000 2000 5000 do printf "making upstream${S}.maf\n" 1>&2 featureBits cavPor3 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags cavPor3 multiz5way \ stdin stdout \ -orgs=/hive/data/genomes/cavPor3/bed/multiz5way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz printf "done upstream${S}.${geneTbl}.maf.gz\n" 1>&2 done time (./mkUpstream.sh ) > upstream.log 2>&1 # real 0m33.868s md5sum upstream*.gz >> md5sum.txt # script to help create document information: for db in `cat ../species.list` do orgName=`hgsql -N -e "select organism from dbDb where name=\"${db}\";" hgcentraltest` sciName=`hgsql -N -e "select scientificName from dbDb where name=\"${db}\";" hgcentraltest` genome=`hgsql -N -e "select genome from dbDb where name=\"${db}\";" hgcentraltest` descr=`hgsql -N -e "select description from dbDb where name=\"${db}\";" hgcentraltest` printf "%-18s %-24s %-20s\n" "$genome" "$sciName" "$descr" done # Guinea pig Cavia porcellus Feb. 2008 (Broad/cavPor3) # Mouse Mus musculus Dec. 2011 (GRCm38/mm10) # Human Homo sapiens Dec. 2013 (GRCh38/hg38) # Tree shrew Tupaia belangeri Dec. 2006 (Broad/tupBel1) # Malayan flying lemur Galeopterus variegatus Jun. 
2014 (G_variegatus-3.0.2/galVar1) # obtain the README.txt from ce11/multiz26way and update for this # situation mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way ln -s `pwd`/upstream*.gz README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way ############################################################################# # construct download files for 5-way (DONE - 2016-05-09 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way mkdir /hive/data/genomes/cavPor3/bed/multiz5way/downloads cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads mkdir multiz5way phastCons5way phyloP5way cd multiz5way time cp -p ../../anno/cavPor3.5way.maf . # real 0m18.941s # -rw-rw-r-- 1 11434713982 May 4 10:01 cavPor3.5way.maf du -hsc * # 11G cavPor3.5way.maf time gzip *.maf & # real 36m27.668s # -rw-rw-r-- 1 2787306473 May 4 10:01 cavPor3.5way.maf.gz grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh cavPor3.5way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh cavPor3.5way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.5way.scientificNames.nh time md5sum *.nh *.maf.gz >> md5sum.txt # real 0m35.144s ln -s `pwd`/*.maf.gz `pwd`/*.txt `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way du -hsc *.maf.gz ../../anno/cavPor3.5way.maf # 11G cavPor3.5way.maf.gz # 79G ../../anno/cavPor3.5way.maf # obtain the README.txt from ce11/multiz26way and update for this # situation ##################################################################### cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phastCons5way 
ln -s ../../cons/all/downloads/phastCons5way.wigFix.gz \ ./cavPor3.phastCons5way.wigFix.gz ln -s ../../cons/all/phastCons5way.bw ./cavPor3.phastCons5way.bw ln -s ../../cons/all/all.mod ./cavPor3.phastCons5way.mod md5sum *.mod *.gz *.bw README.txt > md5sum.txt # about 15 to 20 seconds # obtain the README.txt from ce11/phastCons26way and update for this # situation ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way ##################################################################### cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phyloP5way ln -s ../../consPhyloP/all/downloads/phyloP5way.wigFix.gz \ ./cavPor3.phyloP5way.wigFix.gz ln -s ../../consPhyloP/run.phyloP/all.mod cavPor3.phyloP5way.mod ln -s ../../consPhyloP/all/phyloP5way.bw cavPor3.phyloP5way.bw md5sum *.mod *.gz *.bw README.txt > md5sum.txt # about 15 to 20 seconds # obtain the README.txt from ce11/phyloP26way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way ############################################################################# # hgPal downloads (DONE - 2016-05-09 - Hiram) # FASTA from 5-way for knownGene, refGene and knownCanonical ssh hgwdev screen -S cavPor3HgPal mkdir /hive/data/genomes/cavPor3/bed/multiz5way/pal cd /hive/data/genomes/cavPor3/bed/multiz5way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list # this for loop takes about 2.5 hours on this large count contig assembly export mz=multiz5way export gp=ensGene export db=cavPor3 export I=0 export D=0 mkdir exonAA exonNuc printf '#!/bin/sh\n' > $gp.jobs time for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` D=`echo $D | awk '{print $1+1}'` dNum=`echo $D | awk '{printf "%03d", int($1/1000)}'` mkdir -p exonNuc/${dNum} > /dev/null mkdir -p exonAA/${dNum} > /dev/null echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > 
exonNuc/${dNum}/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &" if [ $I -gt 16 ]; then echo "date" echo "wait" I=0 fi done >> $gp.jobs # real 1m25.411s echo "date" >> $gp.jobs echo "wait" >> $gp.jobs chmod +x $gp.jobs time (./$gp.jobs) > $gp.jobs.log 2>&1 # real 7m28.075s export mz=multiz5way export gp=ensGene time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m14.448s time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonNuc.fa.gz # real 0m42.669s # -rw-rw-r-- 1 30006064 May 9 10:12 ensGene.multiz5way.exonAA.fa.gz # -rw-rw-r-- 1 47566626 May 9 10:13 ensGene.multiz5way.exonNuc.fa.gz export mz=multiz5way export gp=ensGene export db=cavPor3 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # wiki page for 5-way (DONE - 2016-05-09 - Hiram) mkdir /hive/users/hiram/bigWays/cavPor3.5way cd /hive/users/hiram/bigWays echo "cavPor3" > cavPor3.5way/ordered.list awk '{print $1}' /hive/data/genomes/cavPor3/bed/multiz5way/5way.distances.txt \ >> cavPor3.5way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. 
./sizeStats.sh cavPor3.5way/ordered.list # dbDb.sh constructs cavPor3.5way/CavPor3_5-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list ./dbDb.sh cavPor3 5way # sizeStats.pl constructs cavPor3.5way/CavPor3_5-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl cavPor3 5way # defCheck.pl constructs CavPor3_5-way_conservation_lastz_parameters.html ./defCheck.pl cavPor3 5way # this constructs the html pages in cavPor3.5way/: # -rw-rw-r-- 1 3287 May 9 10:24 CavPor3_5-way_conservation_alignment.html # -rw-rw-r-- 1 4805 May 9 10:25 CavPor3_5-way_Genome_size_statistics.html # -rw-rw-r-- 1 3293 May 9 10:25 CavPor3_5-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # CavPor3_5-way_conservation_alignment # CavPor3_5-way_Genome_size_statistics # CavPor3_5-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. 
############################################################################ ## create upstream refGene maf files (DONE - 2016-05-09 - Hiram) mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way # bash script #!/bin/sh export geneTbl="ensGene" for S in 1000 2000 5000 do printf "making upstream${S}.maf\n" 1>&2 featureBits cavPor3 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags cavPor3 multiz5way \ stdin stdout \ -orgs=/hive/data/genomes/cavPor3/bed/multiz5way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz printf "done upstream${S}.${geneTbl}.maf.gz\n" 1>&2 done time (./mkUpstream.sh ) > upstream.log 2>&1 # real 0m33.868s md5sum upstream*.gz >> md5sum.txt # script to help create document information: for db in `cat ../species.list` do orgName=`hgsql -N -e "select organism from dbDb where name=\"${db}\";" hgcentraltest` sciName=`hgsql -N -e "select scientificName from dbDb where name=\"${db}\";" hgcentraltest` genome=`hgsql -N -e "select genome from dbDb where name=\"${db}\";" hgcentraltest` descr=`hgsql -N -e "select description from dbDb where name=\"${db}\";" hgcentraltest` printf "%-18s %-24s %-20s\n" "$genome" "$sciName" "$descr" done # Guinea pig Cavia porcellus Feb. 2008 (Broad/cavPor3) # Mouse Mus musculus Dec. 2011 (GRCm38/mm10) # Human Homo sapiens Dec. 2013 (GRCh38/hg38) # Tree shrew Tupaia belangeri Dec. 2006 (Broad/tupBel1) # Malayan flying lemur Galeopterus variegatus Jun. 
2014 (G_variegatus-3.0.2/galVar1) # obtain the README.txt from ce11/multiz26way and update for this # situation mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way ln -s `pwd`/upstream*.gz README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way ############################################################################## # LASTZ Guinea pig/cavPor3 - Chinese tree shrew/tupChi1 # (DONE - 2017-11-17 - Hiram) mkdir /hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17 cd /hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17 printf '# Guinea pig vs. Chinese tree shrew BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Guinea pig cavPor3 SEQ1_DIR=/hive/data/genomes/cavPor3/cavPor3.wmTrf.2bit SEQ1_LEN=/hive/data/genomes/cavPor3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Chinese tree shrew SEQ2_DIR=/hive/data/genomes/tupChi1/tupChi1.2bit SEQ2_LEN=/hive/data/genomes/tupChi1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -stop=partition -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 201m20.581s cat fb.cavPor3.chainTupChi1Link.txt # 1105585848 bases of 2663369733 (41.511%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` \ cavPor3 tupChi1) > rbest.log 2>&1 & # real 357m30.385s cat fb.cavPor3.chainRBestTupChi1Link.txt # 965471669 bases of 2663369733 (36.250%) in intersection # and for the swap: mkdir /hive/data/genomes/tupChi1/bed/blastz.cavPor3.swap cd /hive/data/genomes/tupChi1/bed/blastz.cavPor3.swap time ($HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ 
-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 120m19.543s cat fb.tupChi1.chainCavPor3Link.txt # 1115700960 bases of 2706389135 (41.225%) in intersection cat fb.tupChi1.chainSynCavPor3Link.txt # 936793671 bases of 2706389135 (34.614%) in intersection time (~/kent/src/hg/utils/automation/doRecipBest.pl \ -buildDir=`pwd` tupChi1 cavPor3) > rbest.log 2>&1 # real 245m56.588s ############################################################################# ## 6-Way Multiz (DONE - 2017-12-16 - Hiram) ssh hgwdev mkdir /hive/data/genomes/cavPor3/bed/multiz6way cd /hive/data/genomes/cavPor3/bed/multiz6way # from the 218-way in the source tree, select out the 6 used here: /cluster/bin/phast/tree_doctor \ --prune-all-but hg38,galVar1,cavPor3,mm10,tupBel1,tupChi1 \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/218way.nh \ > cavPor3.6way.nh.0 cat cavPor3.6way.nh.0 # ((hg38:0.143908,((tupChi1:0.070000,tupBel1:0.086203):0.050000, # galVar1:0.080000):0.054937):0.002000,(mm10:0.315424, # cavPor3:0.175779):0.041059); # using TreeGraph2 tree editor on the Mac, rearrange to get cavPor3 # at the top: # what that looks like: ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.6way.nh | sed -e 's/^/# /;' # ((cavPor3:0.175779, # mm10:0.315424):0.041059, # (hg38:0.143908, # ((tupChi1:0.07, # tupBel1:0.086203):0.05, # galVar1:0.08):0.054937):0.002); # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ cavPor3.6way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ | sed -e 's/-nosed/_nosed/; s/-eating/_eating/;' > db.to.name.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ 
"`cat db.to.name.txt`" cavPor3.6way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.6way.commonNames.nh cat cavPor3.6way.commonNames.nh | sed -e 's/^/# /;' # ((Guinea_pig:0.175779, # Mouse:0.315424):0.041059, # (Human:0.143908, # ((Chinese_tree_shrew:0.07, # Tree_shrew:0.086203):0.05, # Malayan_flying_lemur:0.08):0.054937):0.002); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/cavPor3_6way.png ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.6way.nh > t.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > cavPor3.6way.scientificNames.nh cat cavPor3.6way.scientificNames.nh | sed -e 's/^/# /;' # ((Cavia_porcellus:0.175779, # Mus_musculus:0.315424):0.041059, # (Homo_sapiens:0.143908, # ((Tupaia_chinensis:0.07, # Tupaia_belangeri:0.086203):0.05, # Galeopterus_variegatus:0.08):0.054937):0.002); /cluster/bin/phast/all_dists cavPor3.6way.nh | grep cavPor3 \ | sed -e "s/cavPor3.//" | sort -k2n > 6way.distances.txt # Use this output to create the table below cat 6way.distances.txt | sed -e 's/^/# /;' # galVar1 0.353775 # hg38 0.362746 # tupChi1 0.393775 # tupBel1 0.409978 # mm10 0.491203 printf '#!/usr/bin/env perl use strict; use warnings; open (FH, "<6way.distances.txt") or die "can not read 6way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($D, $dist) = split('"'"'\\s+'"'"', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/cavPor3/bed/lastz.$D/fb.cavPor3." . $chain . 
"Link.txt"; my $chainLinkMeasure = `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\\%%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.cavPor3/fb.${D}.chainCavPor3Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\\%%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='"'\$D'"';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %%02d %%.4f (%%%% %%05.3f) (%%%% %%05.3f) - %%s %%s\\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); ' > sizeStats.pl chmod +x ./sizeStats.pl ./sizeStats.pl # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainLink # N distance on cavPor3 on other other species # 01 0.3538 (% 48.820) (% 48.448) - Malayan flying lemur galVar1 # 02 0.3627 (% 48.000) (% 42.371) - Human hg38 # 03 0.3938 (% 41.511) (% 41.225) - Chinese tree shrew tupChi1 # 04 0.4100 (% 27.994) (% 34.990) - Tree shrew tupBel1 # 05 0.4912 (% 29.115) (% 28.447) - Mouse mm10 # None of this concern for distances matters in building the first step, the # maf files. The distances will be better calibrated later. 
# create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ cavPor3.6way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # cavPor3 mm10 hg38 tupChi1 tupBel1 galVar1 # survey N60 for each for db in `cat species.list` do n50.pl /hive/data/genomes/$db/chrom.sizes done # reading: /hive/data/genomes/cavPor3/chrom.sizes # contig count: 3144, total size: 2723219641, one half size: 1361609820 # cumulative N50 count contig contig size 1356838683 27 scaffold_25 28222655 1361609820 one half size 1384780737 28 scaffold_27 27942054 # reading: /hive/data/genomes/mm10/chrom.sizes # contig count: 66, total size: 2730871774, one half size: 1365435887 # cumulative N50 count contig contig size 1312176979 8 chr7 145441459 1365435887 one half size 1442871972 9 chr10 130694993 # reading: /hive/data/genomes/hg38/chrom.sizes # contig count: 455, total size: 3209286105, one half size: 1604643052 # cumulative N50 count contig contig size 1547391171 8 chrX 156040895 1604643052 one half size 1692529807 9 chr8 145138636 # reading: /hive/data/genomes/tupChi1/chrom.sizes # contig count: 50750, total size: 2846580235, one half size: 1423290117 # cumulative N50 count contig contig size 1419920836 231 KB321095 3691413 1423290117 one half size 1423590960 232 KB321106 3670124 # reading: /hive/data/genomes/tupBel1/chrom.sizes # contig count: 150851, total size: 3660774957, one half size: 1830387478 # cumulative N50 count contig contig size 1830345737 7910 scaffold_129972.1-127906 127906 1830387478 one half size 1830473625 7911 scaffold_147844.1-127888 127888 # reading: /hive/data/genomes/galVar1/chrom.sizes # contig count: 179514, total size: 3187660572, one half size: 1593830286 # cumulative N50 count contig contig size 1593691350 3422 NW_007730159v1 245222 1593830286 one half size 1593936539 3423 NW_007729331v1 245189 # bash shell syntax here ... 
cd /hive/data/genomes/cavPor3/bed/multiz6way export H=/hive/data/genomes/cavPor3/bed mkdir mafLinks # good assemblies can use syntenic net: # hg38 mm10 galVar1 for G in hg38 mm10 galVar1 tupChi1 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/axtChain/cavPor3.${G}.synNet.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/axtChain/cavPor3.${G}.synNet.maf.gz ./mafLinks/$G done # other assemblies using recip best net: # tupBel1 for G in tupBel1 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/mafRBestNet/cavPor3.${G}.rbest.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/mafRBestNet/cavPor3.${G}.rbest.maf.gz ./mafLinks/$G done # verify the symLinks are good: ls -ogrtL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' # 549742600 Mar 19 2012 mafLinks/mm10/cavPor3.mm10.synNet.maf.gz # 930458998 Apr 29 2015 mafLinks/hg38/cavPor3.hg38.synNet.maf.gz # 881164370 Apr 15 2016 mafLinks/galVar1/cavPor3.galVar1.synNet.maf.gz # 535098561 Apr 18 2016 mafLinks/tupBel1/cavPor3.tupBel1.rbest.maf.gz # 741020498 Nov 17 16:35 mafLinks/tupChi1/cavPor3.tupChi1.synNet.maf.gz # split the maf files into a set of hashed named files # this hash named split keeps the same chr/contig names in the same # named hash file. mkdir /hive/data/genomes/cavPor3/bed/multiz6way/mafSplit cd /hive/data/genomes/cavPor3/bed/multiz6way/mafSplit time for D in `sed -e "s/cavPor3 //" ../species.list` do echo "${D}" mkdir $D cd $D echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz" mafSplit -byTarget -useHashedName=8 /dev/null . \ ../../mafLinks/${D}/*.maf.gz cd .. done # real 2m53.479s # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | wc -l # 873 find . 
-type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 239 maf.list mkdir /hive/data/genomes/cavPor3/bed/multiz6way/splitRun cd /hive/data/genomes/cavPor3/bed/multiz6way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # verify the db and pairs settings are correct printf '#!/bin/csh -ef set db = cavPor3 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/cavPor3/bed/multiz6way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp ' > autoMultiz.csh chmod +x autoMultiz.csh printf '#LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/cavPor3/bed/multiz6way/splitRun/maf/$(root1).maf} #ENDLOOP ' > template ln -s ../../mafSplit/maf.list maf.list ssh ku cd /hive/data/genomes/cavPor3/bed/multiz6way/splitRun/run gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc... 
# Completed: 239 of 239 jobs # CPU time in finished jobs: 88973s 1482.88m 24.71h 1.03d 0.003 y # IO & Wait Time: 713s 11.88m 0.20h 0.01d 0.000 y # Average job time: 375s 6.25m 0.10h 0.00d # Longest finished job: 3383s 56.38m 0.94h 0.04d # Submission to last job: 5424s 90.40m 1.51h 0.06d # combine into one file (the 1>&2 redirect sends the echo to stderr) cd /hive/data/genomes/cavPor3/bed/multiz6way head -1 splitRun/maf/020.maf > multiz6way.maf time for F in splitRun/maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} done >> multiz6way.maf # real 0m37.947s tail -1 splitRun/maf/020.maf >> multiz6way.maf # -rw-rw-r-- 1 9866798945 Dec 16 22:59 multiz6way.maf # Load into database ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz6way mkdir /gbdb/cavPor3/multiz6way ln -s `pwd`/multiz6way.maf /gbdb/cavPor3/multiz6way cd /dev/shm time hgLoadMaf cavPor3 multiz6way # Loaded 9177087 mafs in 1 files from /gbdb/cavPor3/multiz6way # real 2m51.288s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 cavPor3 multiz6waySummary \ /gbdb/cavPor3/multiz6way/multiz6way.maf # Created 1021373 summary blocks from 25418473 components and 9177087 mafs from /gbdb/cavPor3/multiz6way/multiz6way.maf # real 3m27.813s # -rw-rw-r-- 1 511006120 Dec 16 23:01 multiz6way.tab # -rw-rw-r-- 1 51570723 Dec 16 23:06 multiz6waySummary.tab wc -l multiz6way*.tab # 9177087 multiz6way.tab # 1021373 multiz6waySummary.tab rm multiz6way*.tab ############################################################################## # GAP ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2017-12-16 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. 
Need to split of the maf file into individual # maf files mkdir -p /hive/data/genomes/cavPor3/bed/multiz6way/anno/mafSplit cd /hive/data/genomes/cavPor3/bed/multiz6way/anno/mafSplit time mafSplit -outDirDepth=2 -byTarget -useFullSequenceName \ /dev/null . ../../multiz6way.maf # real 2m49.707s find . -type f | wc -l # 933 # check for N.bed files everywhere: cd /hive/data/genomes/cavPor3/bed/multiz6way/anno for DB in `cat ../species.list` do if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" # cd /hive/data/genomes/${DB} # twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/cavPor3/bed/multiz6way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S gapAnno # use a screen to control this longish job ssh ku cd /hive/data/genomes/cavPor3/bed/multiz6way/anno mkdir result find ./mafSplit -type d | sed -e 's#./mafSplit/##' | while read D do echo mkdir -p result/${D} mkdir -p result/${D} done printf '#LOOP mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/cavPor3/cavPor3.2bit {check out exists+ result/$(path1)} #ENDLOOP ' > template find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list gensub2 maf.list single template jobList # limit jobs on a node with the ram=32g requirement because they go fast para -ram=32g create jobList para try ... check ... push ... 
# Completed: 933 of 933 jobs # CPU time in finished jobs: 1328s 22.14m 0.37h 0.02d 0.000 y # IO & Wait Time: 2690s 44.83m 0.75h 0.03d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 32s 0.53m 0.01h 0.00d # Submission to last job: 91s 1.52m 0.03h 0.00d # verify all result files have some content, look for 0 size files: find ./result -type f -size 0 # should see none # or in this manner: find ./result -type f | xargs ls -og | sort -k3nr | tail # combine into one file (the 1>&2 redirect sends the echo to stderr) head -q -n 1 result/0/0/scaffold_1476.maf > cavPor3.6way.maf time find ./result -type f | while read F do echo "${F}" 1>&2 grep -h -v "^#" ${F} done >> cavPor3.6way.maf # real 1m7.444s # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/0/0/scaffold_1476.maf >> cavPor3.6way.maf # How about an official end marker: echo "##eof maf" >> cavPor3.6way.maf ls -og # -rw-rw-r-- 1 13600527773 Dec 16 23:23 cavPor3.6way.maf du -hsc cavPor3.6way.maf ../*.maf # 13G cavPor3.6way.maf # 9.2G ../multiz6way.maf # construct symlinks to get the individual maf files into gbdb: rm /gbdb/cavPor3/multiz6way/multiz6way.maf # remove previous results ln -s `pwd`/cavPor3.6way.maf /gbdb/cavPor3/multiz6way/multiz6way.maf # Load into database cd /dev/shm time hgLoadMaf -pathPrefix=/gbdb/cavPor3/multiz6way cavPor3 multiz6way # Loaded 10188349 mafs in 1 files from /gbdb/cavPor3/multiz6way # real 3m38.228s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 cavPor3 multiz6waySummary \ /gbdb/cavPor3/multiz6way/multiz6way.maf # Created 1021373 summary blocks from 25418473 components and 10188349 mafs from /gbdb/cavPor3/multiz6way/multiz6way.maf # real 4m14.633s # -rw-rw-r-- 1 570229240 Dec 16 23:27 multiz6way.tab # -rw-rw-r-- 1 53613469 Dec 16 23:33 multiz6waySummary.tab rm multiz6way*.tab ###################################################################### # MULTIZ6WAY MAF FRAMES (DONE - 2017-12-17 - 
Hiram) ssh hgwdev mkdir /hive/data/genomes/cavPor3/bed/multiz6way/frames cd /hive/data/genomes/cavPor3/bed/multiz6way/frames # survey all the genomes to find out what kinds of gene tracks they have printf '#!/bin/csh -fe foreach db (`cat ../species.list`) printf "# ${db}: " set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "ncbiRefSeq" || $table == "mgcGenes" || \ $table == "knownGene" || $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='"'"'$db'"'"'"` set orgId = `hgsql hgFixed -N -e \ "select id from organism where name='"'"'$orgName'"'"'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hgFixed -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end ' > showGenes.csh chmod +x ./showGenes.csh time ./showGenes.csh # cavPor3: ensGene: 34846, refGene: 488, xenoRefGene: 316945, Mrnas: 21241 # mm10: ensGene: 103734, knownGene: 63759, mgcGenes: 27612, ncbiRefSeq: 106520, refGene: 39240, xenoRefGene: 183459, Mrnas: 5371140 # hg38: ensGene: 208239, knownGene: 196838, mgcGenes: 35312, ncbiRefSeq: 159322, refGene: 74453, xenoRefGene: 187376, Mrnas: 11508577 # tupChi1: refGene: 206, xenoRefGene: 353563, Mrnas: 50709 # tupBel1: ensGene: 34727, xenoRefGene: 751689, Mrnas: 2543 # galVar1: ncbiRefSeq: 41547, xenoRefGene: 516902, Mrnas: 0 # from that summary, use these gene sets: # knownGene - hg38 mm10 # ensGene - cavPor3 tupBel1 # none - tupChi1 galVar1 mkdir genes # 1. 
knownGene: hg38 mm10 for DB in hg38 mm10 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/${DB}.gp.gz printf "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # hg38: checked: 21554 failed: 0 # mm10: checked: 21100 failed: 0 # 2. ensGene: cavPor3 tupBel1 for DB in cavPor3 tupBel1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz printf "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # cavPor3: checked: 18034 failed: 0 # tupBel1: checked: 29256 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/cavPor3.gp.gz: 18034 # genes/hg38.gp.gz: 21554 # genes/mm10.gp.gz: 21100 # genes/tupBel1.gp.gz: 15407 time (cat ../anno/cavPor3.6way.maf \ | genePredToMafFrames cavPor3 stdin stdout \ `sed -e 's/tupChi1//; s/galVar1//;' ../species.list.txt | xargs echo \ | sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g;"` \ | gzip > multiz6wayFrames.bed.gz) # real 2m42.982s # verify there are frames on everything, should be 4 species # (tupChi1 and galVar1 have no gene set and were excluded above): zcat multiz6wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' # 178463 cavPor3 # 246612 hg38 # 234751 mm10 # 201885 tupBel1 # load the resulting file ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz6way/frames time hgLoadMafFrames cavPor3 multiz6wayFrames multiz6wayFrames.bed.gz # real 0m9.457s time featureBits -countGaps cavPor3 multiz6wayFrames # 35046322 bases of 2723219641 (1.287%) in intersection # real 0m6.884s # enable the trackDb entries: # frames multiz6wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 6-way 
(DONE - 2017-12-17 - Hiram) mkdir /hive/data/genomes/cavPor3/bed/multiz6way/4d cd /hive/data/genomes/cavPor3/bed/multiz6way/4d # using the ensGene hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" cavPor3 \ | genePredSingleCover stdin stdout > cavPor3.ensGeneNR.gp genePredCheck -db=cavPor3 cavPor3.ensGeneNR.gp # checked: 18034 failed: 0 # the annotated maf is: og ../anno/cavPor3.6way.maf # -rw-rw-r-- 1 13063660432 May 4 10:33 ../anno/cavPor3.6way.maf mkdir annoSplit cd annoSplit time mafSplit -verbose=2 -outDirDepth=2 -byTarget -useFullSequenceName \ /dev/null . ../../anno/cavPor3.6way.maf # real 4m19.176s find . -type f | wc -l # 933 ssh ku mkdir /hive/data/genomes/cavPor3/bed/multiz6way/4d/run cd /hive/data/genomes/cavPor3/bed/multiz6way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set GP = cavPor3.ensGeneNR.gp set r = "/hive/data/genomes/cavPor3/bed/multiz6way" set c = $1:r set infile = $r/4d/annoSplit/$2 set outDir = $r/4d/mfa/$3:h set outfile = $r/4d/mfa/$3 /bin/mkdir -p $outDir cd /dev/shm /bin/awk -v C=$c '"'"'$2 == C {print}'"'"' $r/4d/$GP | sed -e "s/\\t$c\\t/\\tcavPor3.$c\\t/" > $c.gp set NL=`wc -l $c.gp| gawk '"'"'{print $1}'"'"'` echo $NL if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile else echo "" > $outfile endif /bin/rm -f /dev/shm/$c.gp /dev/shm/$c.ss ' > 4d.csh chmod +x 4d.csh find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list wc -l maf.list # 76237 maf.list printf '#LOOP 4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(dir2)$(root1).mfa} #ENDLOOP ' > template gensub2 maf.list single template jobList para create jobList para try ... 
check para time # Completed: 76896 of 76237 jobs # Crashed: 342 jobs # CPU time in finished jobs: 2831s 47.19m 0.79h 0.03d 0.000 y # IO & Wait Time: 192714s 3211.90m 63.63h 2.23d 0.006 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 6s 0.10m 0.00h 0.00d # Submission to last job: 1616s 26.92m 0.46h 0.02d # Not all results have contents, or finish successfully, that is OK # it is because not all contigs have genes, only gene sequences are measured # combine mfa files ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz6way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # sometimes this doesn't work, don't know why find ./mfa -type f -size 1 | xargs rm -f # when it doesn't, use this empty list procedure find ./mfa -type f | xargs ls -og | awk '$3 < 2' | awk '{print $NF}' \ > empty.list cat empty.list | xargs rm -f # see what is left: ls -ogrt mfa/*/*/*.mfa | sort -k3nr | wc # 8027 66189 473683 # want comma-less species.list time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*/*/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # real 1m10.731s # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 6 grep "^>" 4d.all.mfa | sed -e 's/^/# /;' # >cavPor3 # >tupBel1 # >hg38 # >mm10 # >galVar1 sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../cavPor3.6way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh # tree_commas.nh looks like: # (((cavPor3,tupBel1),hg38),(mm10,galVar1)) # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree_commas.nh 4d.all.mfa # real 0m1.209s mv phyloFit.mod all.mod grep TREE all.mod # TREE: # (((cavPor3:0.101018,tupBel1:0.179618):0.00922862,hg38:0.116764):0.0263261, # (mm10:0.281608,galVar1:0.209143):0.0263261); # compare these calculated lengths to the tree 
extracted from 218way: grep TREE all.mod | sed -e 's/TREE: //' \ | /cluster/bin/phast/all_dists /dev/stdin | grep cavPor3 \ | sed -e "s/cavPor3.//;" | sort > new.dists /cluster/bin/phast/all_dists ../cavPor3.6way.nh | grep cavPor3 \ | sed -e "s/cavPor3.//;" | sort > old.dists # printing out the 'new', the 'old' the 'difference' and percent difference join new.dists old.dists | awk '{ printf "#\t%s\t%8.5f\t%8.5f\t%8.5f\t%8.5f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' \ | sort -k3n # galVar1 0.38071 0.35378 0.02693 7.61247 # hg38 0.38638 0.36275 0.02363 6.51503 # tupChi1 0.46067 0.39377 0.06690 16.98914 # tupBel1 0.46482 0.40998 0.05484 13.37633 # mm10 0.50607 0.49120 0.01487 3.02726 XXX - ready to continue - Sun Dec 17 21:00:07 PST 2017 ######################################################################### # phastCons 6-way (TBD - 2016-06-06 - Hiram) # split 6way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh ku mkdir -p /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS mkdir result done printf '#!/bin/csh -ef set d = $1 set c = $2 set doneDir = done/$d set MAF = /hive/data/genomes/cavPor3/bed/multiz6way/anno/result/$d/$c.maf set WINDOWS = /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS/result/$d/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $3 ) then exit 0 endif if ( -s $3.running ) then exit 0 endif /bin/mkdir -p $doneDir /bin/date >> $3.running /bin/rm -fr $WINDOWS /bin/mkdir -p $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \\ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null /bin/date >> $3 /bin/rm -f $3.running ' > mkSS.csh chmod +x mkSS.csh printf '#LOOP mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)} #ENDLOOP ' > template find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list wc -l 
maf.list # 76237 maf.list ssh ku cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS gensub2 maf.list single template jobList # beware overwhelming the cluster with these quick high I/O jobs para create jobList para try ... check ... etc para -maxJob=64 push # Completed: 76237 of 76237 jobs # CPU time in finished jobs: 3491s 68.19m 0.97h 0.04d 0.000 y # IO & Wait Time: 321266s 6364.26m 89.24h 3.72d 0.010 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 1683s 26.38m 0.44h 0.02d find ./result -type f | wc -l # 24863 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/cavPor3/bed/multiz6way/cons/run.cons cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. 
Running: # all and vertebrates printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set d = $2 set f = $3 set len = $4 set cov = $5 set rho = $6 set grp = $cwd:t set cons = /hive/data/genomes/cavPor3/bed/multiz6way/cons set tmp = $cons/tmp/${d}_${c} mkdir -p $tmp set ssSrc = $cons/SS/result set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$d/$f $tmp else ln -s $ssSrc/$d/$f $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \\ --not-informative `cat $grp.non-inf` \\ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp else $PHASTBIN/phastCons $f $useGrp \\ --rho $rho --expected-length $len --target-coverage $cov --quiet \\ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp endif popd > /dev/null mkdir -p pp/$d bed/$d sleep 4 touch pp/$d bed/$d rm -f pp/$d/$c.pp rm -f bed/$d/$c.bed mv $tmp/$c.pp pp/$d mv $tmp/$c.bed bed/$d rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h ' > doPhast.csh chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix printf '#LOOP ../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp} #ENDLOOP ' > template find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list wc -l ss.list # 24863 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/cavPor3/bed/multiz6way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=32g create jobList para try ... check ... 
para push # Completed: 24816 of 24863 jobs # Crashed: 38 jobs # CPU time in finished jobs: 6246s 104.09m 1.73h 0.07d 0.000 y # IO & Wait Time: 222621s 3710.36m 61.84h 2.68d 0.007 y # Average job time: 9s 0.16m 0.00h 0.00d # Longest finished job: 18s 0.30m 0.01h 0.00d # Submission to last job: 936s 16.68m 0.26h 0.01d # the 38 crash jobs were actually finished, they failed the last rmdir: # rmdir: failed to remove `/hive/data/genomes/cavPor3/bed/multiz6way/cons/tmp/7/6': No such file or directory # create Most Conserved track cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/all time cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/?/?/${C} 2> /dev/null | while read D do echo ${D}/${C}*.bed 1>&2 cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $6, $6;}' done > tmpMostConserved.bed # real 19m26.846s time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ > mostConserved.bed # real 0m6.667s # -rw-rw-r-- 1 36626033 May 6 14:66 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/all time hgLoadBed cavPor3 phastConsElements6way mostConserved.bed # Read 932866 elements of size 6 from mostConserved.bed # real 0m9.898s # on human we often try for 6% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between # --rho 0.3 --expected-length 46 --target-coverage 0.3 time featureBits cavPor3 -enrichment ensGene:cds phastConsElements6way # ensGene:cds 1.217%, phastConsElements6way 3.976%, both 0.819%, # cover 67.27%, enrich 16.92x # real 2m33.330s # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/all mkdir downloads # the third sed fixes the chrom names, removing the partition extensions time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | sed -e 
's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ | gzip -c > downloads/phastCons6way.wigFix.gz) # real 27m47.146s # -rw-rw-r-- 1 2207346680 May 6 16:33 phastCons6way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons6way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \ phastCons6way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=37326: VmPeak: 20943944 kB # real 30m30.283s bigWigInfo phastCons6way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 3,364,371,979 # primaryIndexSize: 81,969,008 # zoomLevels: 10 # chromCount: 24863 # basesCovered: 1,910,088,693 # mean: 0.117236 # min: 0.000000 # max: 1.000000 # std: 0.237776 # encode those files into wiggle data time (zcat downloads/phastCons6way.wigFix.gz \ | wigEncode stdin phastCons6way.wig phastCons6way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 10m31.797s du -hsc *.wi? # 1.8G phastCons6way.wib # 276M phastCons6way.wig # Load gbdb and database with wiggle. 
ln -s `pwd`/phastCons6way.wib /gbdb/cavPor3/multiz6way/phastCons6way.wib time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz6way \ cavPor3 phastCons6way phastCons6way.wig # real 0m30.803s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh cavPor3 phastCons6way # db.table min max mean count sumData # cavPor3.phastCons6way 0 1 0.117236 1910088693 2.23929e+08 # stdDev viewLimits # 0.237776 viewLimits=0:1 # Create histogram to get an overview of all the data time hgWiggle -doHistogram -db=cavPor3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons6way > histogram.data 2>&1 # real 4m6.489s # create plot of histogram: printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Guinea pig cavPor3 Histogram phastCons6way track" set xlabel " phastCons6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:6 title " RelFreq" with impulses, \\ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ' | gnuplot > histo.png display histo.png & ######################################################################### # phyloP for 6-way (TBD - 2016-06-09,11 - Hiram) # run phyloP with score=LRT ssh ku mkdir /cluster/data/cavPor3/bed/multiz6way/consPhyloP cd /cluster/data/cavPor3/bed/multiz6way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.662 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../4d/all.mod 0.662 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.219000 0.281000 
0.281000 0.219000 printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set d = $f:h set file1 = $f:t set out = $2 set cName = $f:t:r set grp = $cwd:t set cons = /hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "/hive/data/genomes/cavPor3/bed/multiz6way/cons/SS/result/$f" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \\ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$file1.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp ' > doPhyloP.csh chmod +x doPhyloP.csh # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list # make sure the list looks good wc -l ss.list # 24863 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix printf '#LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP ' > template ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP/all cd /hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overwhelming the cluster with these fast running high I/O jobs para create jobList para try ... check ... push ... etc ... 
para -maxJob=63 push para time > run.time # Completed: 24862 of 24863 jobs # Crashed: 1 jobs # CPU time in finished jobs: 7617s 126.29m 2.09h 0.09d 0.000 y # IO & Wait Time: 166287s 2771.46m 46.19h 1.92d 0.006 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest finished job: 11s 0.18m 0.00h 0.00d # Submission to last job: 1799s 29.98m 0.60h 0.02d # the one failed job was just the last rmdir command: # /bin/rmdir: failed to remove `/hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP/tmp/all/6/2 mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP6way.wigFix.gz) # real 30m20.672s # check integrity of data with wigToBigWig time (zcat downloads/phyloP6way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \ phyloP6way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=19896: VmPeak: 20943916 kB # real 216m36.688s bigWigInfo phyloP6way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 3,067,676,886 # primaryIndexSize: 81,969,008 # zoomLevels: 10 # chromCount: 24863 # basesCovered: 1,910,088,693 # mean: 0.061208 # min: -2.699000 # max: 0.833000 # std: 0.611669 # encode those files into wiggle data time (zcat downloads/phyloP6way.wigFix.gz \ | wigEncode stdin phyloP6way.wig phyloP6way.wib) # Converted stdin, upper limit 0.83, lower limit -2.60 # real 9m4.643s du -hsc *.wi? # 1.8G phyloP6way.wib # 279M phyloP6way.wig # Load gbdb and database with wiggle. 
ln -s `pwd`/phyloP6way.wib /gbdb/cavPor3/multiz6way/phyloP6way.wib time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz6way cavPor3 \ phyloP6way phyloP6way.wig # real 0m30.869s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh cavPor3 phyloP6way # db.table min max mean count sumData # cavPor3.phyloP6way -2.699 0.833 0.0612084 1910088693 1.16914e+08 # stdDev viewLimits # 0.611669 viewLimits=-2.699:0.833 # that range is: 0.833+2.699 = 3.432 for hBinSize=0.003432 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.003432 -hBinCount=1000 -hMinVal=-2.669 -verbose=2 \ -db=cavPor3 phyloP6way > histogram.data 2>&1 # real 4m20.444s # find the Y range for the 2:6 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=6 stdin \ | sed -e 's/^/# /;' # Q1 0.000087 # median 0.000381 # Q3 0.001361 # average 0.001112 # min 0.000000 # max 0.032088 # count 899 # total 0.999990 # standard deviation 0.002166 # find the X range for the 2:6 graph grep "^[0-9]" histogram.data | ave -col=2 stdin \ | sed -e 's/^/# /;' # Q1 -1.773070 # median -1.004300 # Q3 -0.228672 # average -0.962349 # min -2.669000 # max 0.831816 # count 899 # total -866.161612 # standard deviation 0.964068 # create plot of histogram: printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Guinea pig cavPor3 Histogram phyloP6way track" set xlabel " phyloP6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set xtics set xrange [-2.6:0.86] set yrange [0:0.033] plot "histogram.data" using 2:6 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ' | gnuplot > histo.png display histo.png & # appears to have an odd hole in the data near X=0 ? 
#############################################################################
# hgPal downloads (TBD - 2016-06-09,11 - Hiram)
#    FASTA from 6-way for knownGene, refGene and knownCanonical
    ssh hgwdev
    screen -S cavPor3HgPal
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/pal
    cd /hive/data/genomes/cavPor3/bed/multiz6way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    # this for loop takes about 2.6 hours on this large count contig assembly
    export mz=multiz6way
    export gp=ensGene
    export db=cavPor3
    export I=0
    export D=0
    mkdir exonAA exonNuc
    printf '#!/bin/sh\n' > $gp.jobs
    # emit one backgrounded mafGene job per contig; after every 16 jobs a
    # "wait" barrier is written so the generated script runs at most 17 at
    # a time.  dNum spreads output files over numbered subdirectories so no
    # single directory holds all ~24k files.
    time for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
        D=`echo $D | awk '{print $1+1}'`
        dNum=`echo $D | awk '{printf "%03d", int($1/1000)}'`
        mkdir -p exonNuc/${dNum} > /dev/null
        mkdir -p exonAA/${dNum} > /dev/null
        echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &"
        echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
        if [ $I -gt 16 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done >> $gp.jobs
    #   real    116m16.333s
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs
    # fixed: was "chmod +x ensGene.jobs" -- same file, use the variable for
    # consistency with the rest of this recipe
    chmod +x $gp.jobs
    time (./$gp.jobs) > $gp.jobs.log 2>&1 &
    #   real    14m60.760s
    # NOTE(review): "14m60.760s" is an impossible time -- likely 14m50.760s,
    # corrupted by the template doc's global s/5/6/ edit

    export mz=multiz6way
    export gp=ensGene
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
        | gzip -c > $gp.$mz.exonAA.fa.gz
    #   real    4m20.026s
    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
        | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    4m46.761s
    # -rw-rw-r-- 1  36201970 May 11 11:26 ensGene.multiz6way.exonAA.fa.gz
    # -rw-rw-r-- 1  69404213 May 11 11:30 ensGene.multiz6way.exonNuc.fa.gz

    export mz=multiz6way
    export gp=ensGene
    export db=cavPor3
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    # fixed: "md6sum" is not a real program; the checksum tool is md5sum
    # (the stray "6" came from the template doc's global s/5/6/ edit)
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/
    rm -rf exonAA exonNuc

#############################################################################
# construct download files for 6-way (TBD - 2016-06-11 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz6way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons6way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP6way
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/downloads
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads
    mkdir multiz6way phastCons6way phyloP6way
    cd multiz6way
    time cp -p ../../anno/cavPor3.6way.maf .
    #   real    0m21.617s
    # -rw-rw-r-- 1 13600527773 Dec 16 23:23 cavPor3.6way.maf

    du -hsc *
    # 13G     cavPor3.6way.maf
    time gzip *.maf
    #   real    37m41.433s
    # -rw-rw-r-- 1 3122744783 Dec 16 23:23 cavPor3.6way.maf.gz
    du -hsc *.maf.gz ../../anno/*.maf
    # 3.0G    cavPor3.6way.maf.gz
    # 13G     ../../anno/cavPor3.6way.maf

    # extract the phylo tree from the 4d model and produce the three
    # newick download files (UCSC names, common names, scientific names)
    grep TREE ../../4d/all.mod | awk '{print $NF}' \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
            > cavPor3.6way.nh
    ~/kent/src/hg/utils/phyloTrees/commonNames.sh cavPor3.6way.nh \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
            > cavPor3.6way.commonNames.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh cavPor3.6way.nh \
        | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
            > cavPor3.6way.scientificNames.nh
    # fixed: md6sum -> md5sum (no such tool as md6sum)
    time md5sum *.nh *.maf.gz > md5sum.txt
    #   real    0m36.144s

    ln -s `pwd`/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz6way
    du -hsc *.maf.gz ../../anno/cavPor3.6way.maf
    # 3.0G    cavPor3.6way.maf.gz
    # 13G     ../../anno/cavPor3.6way.maf

    # obtain the README.txt from galVar1/multiz6way and update for this
    # situation

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads/phastCons6way
    ln -s ../../cons/all/downloads/phastCons6way.wigFix.gz \
        ./cavPor3.phastCons6way.wigFix.gz
    ln -s ../../cons/all/phastCons6way.bw ./cavPor3.phastCons6way.bw
    ln -s ../../cons/all/all.mod ./cavPor3.phastCons6way.mod
    # fixed: md6sum -> md5sum
    time md5sum *.gz *.mod *.bw > md5sum.txt
    #   real    0m20.364s

    # obtain the README.txt from galVar1/phastCons6way and update for this
    # situation
    ln -s `pwd`/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons6way

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads/phyloP6way
    ln -s ../../consPhyloP/all/downloads/phyloP6way.wigFix.gz \
        ./cavPor3.phyloP6way.wigFix.gz
    ln -s ../../consPhyloP/run.phyloP/all.mod cavPor3.phyloP6way.mod
    ln -s ../../consPhyloP/all/phyloP6way.bw cavPor3.phyloP6way.bw
    # fixed: md6sum -> md5sum
    time md5sum *.mod *.bw *.gz > md5sum.txt
    #   real    0m29.662s

    # obtain the README.txt from cavPor3/phyloP17way and update for this
    # situation
    ln -s `pwd`/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP6way

###########################################################################
## create upstream refGene maf files
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads/multiz6way
    # bash script
#!/bin/sh
export geneTbl="ensGene"
# fixed: the standard hgdownload upstream set is 1000, 2000 and 5000 bases;
# "6000" was another casualty of the template doc's global s/5/6/ edit
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits cavPor3 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags cavPor3 multiz6way \
            stdin stdout \
            -orgs=/hive/data/genomes/cavPor3/bed/multiz6way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    #   real    12m47.636s

    # fixed: md6sum -> md5sum
    md5sum *.maf.gz *.nh upstream*.gz README.txt >> md5sum.txt
    # some other symlinks were already made above
    # obtain the README.txt from galVar1/multiz6way and update for this
    # situation
    ln -s `pwd`/upstream*.gz `pwd`/README.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz6way

#############################################################################
# hgPal downloads (TBD - 2016-06-11 - Hiram)
#    FASTA from 6-way for knownGene, refGene and knownCanonical
    ssh hgwdev
    screen -S cavPor3HgPal
mkdir /hive/data/genomes/cavPor3/bed/multiz6way/pal
    cd /hive/data/genomes/cavPor3/bed/multiz6way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    # this for loop takes about 2.6 hours on this large count contig assembly
    export mz=multiz6way
    export gp=xenoRefGene
    export db=cavPor3
    export I=0
    export D=0
    mkdir exonAA exonNuc
    # emit one backgrounded mafGene job per contig; after every 16 jobs a
    # "wait" barrier is written so the generated script runs at most 17 at
    # a time.  dNum spreads output files over numbered subdirectories.
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
        D=`echo $D | awk '{print $1+1}'`
        dNum=`echo $D | awk '{printf "%03d", int($1/1000)}'`
        mkdir -p exonNuc/${dNum} > /dev/null
        mkdir -p exonAA/${dNum} > /dev/null
        echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &"
        echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
        if [ $I -gt 16 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs
    time sh -x ./$gp.jobs > $gp.jobs.log 2>&1 &
    #   real    176m60.376s
    # NOTE(review): "176m60.376s" is an impossible time -- likely
    # 176m50.376s, corrupted by the template doc's global s/5/6/ edit

    export mz=multiz6way
    export gp=xenoRefGene
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
        | gzip -c > $gp.$mz.exonAA.fa.gz
    #   real    10m29.600s
    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
        | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    16m9.974s
    # -rw-rw-r-- 1 611281644 Apr 16 20:37 xenoRefGene.multiz6way.exonAA.fa.gz
    # -rw-rw-r-- 1 966671426 Apr 16 21:06 xenoRefGene.multiz6way.exonNuc.fa.gz

    export mz=multiz6way
    export gp=xenoRefGene
    export db=cavPor3
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    # fixed: "md6sum" is not a real program; the checksum tool is md5sum
    # (the stray "6" came from the template doc's global s/5/6/ edit)
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/
    rm -rf exonAA exonNuc

#############################################################################
# wiki page for 6-way (DONE - 2017-12-18 - Hiram)
    mkdir /hive/users/hiram/bigWays/cavPor3.6way
    cd /hive/users/hiram/bigWays
    # ordered.list: this assembly first, then the aligned species in
    # 6way.distances.txt order
    echo "cavPor3" > cavPor3.6way/ordered.list
    awk '{print $1}' /hive/data/genomes/cavPor3/bed/multiz6way/6way.distances.txt \
        >> cavPor3.6way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They are usually already mostly done, only new
    # assemblies will have updates.
    ./sizeStats.sh cavPor3.6way/ordered.list

    # dbDb.sh constructs cavPor3.6way/CavPor3_6-way_conservation_alignment.html
    # may need to add new assembly references to srcReference.list and
    # urlReference.list
    ./dbDb.sh cavPor3 6way

    # sizeStats.pl constructs cavPor3.6way/CavPor3_6-way_Genome_size_statistics.html
    # this requires entries in coverage.list for new sequences
    ./sizeStats.pl cavPor3 6way

    # defCheck.pl constructs CavPor3_6-way_conservation_lastz_parameters.html
    ./defCheck.pl cavPor3 6way

    # this constructs the html pages in cavPor3.6way/:
    # -rw-rw-r--  3818 Dec 18 13:48 CavPor3_6-way_conservation_alignment.html
    # -rw-rw-r--  5480 Dec 18 13:48 CavPor3_6-way_Genome_size_statistics.html
    # -rw-rw-r--  3595 Dec 18 13:48 CavPor3_6-way_conservation_lastz_parameters.html

    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
    #   CavPor3_6-way_conservation_alignment
    #   CavPor3_6-way_Genome_size_statistics
    #   CavPor3_6-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.
############################################################################