############################################################################# ## 26-Way Multiz (DONE - 2015-08-13 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ce11/bed/multiz26way cd /hive/data/genomes/ce11/bed/multiz26way # constructed a tree from NCBI taxonomy and then edit to make it # a binary tree: (((((((((((((ce11:0.1, (caePb3:0.1, (caeRem4:0.1, cb4:0.1):0.1):0.1):0.1, caeJap4:0.1):0.1, caeSp111:0.1):0.1, caeAng2:0.1):0.1, caeSp51:0.1):0.1, hetBac1:0.1):0.1, (strRat2:0.1, panRed1:0.1):0.1):0.1, ((ancCey1:0.1, necAme1:0.1):0.1, haeCon2:0.1):0.1):0.1, ascSuu1:0.1):0.1, (priExs1:0.1, priPac3:0.1):0.1):0.1, ((melHap1:0.1, melInc2:0.1):0.1, burXyl1:0.1):0.1):0.1, (((dirImm1:0.1, loaLoa1:0.1):0.1, oncVol1:0.1):0.1, bruMal2:0.1):0.1):0.1, (triSpi1:0.1, triSui1:0.1):0.1); # what that looks like: ~/kent/src/hg/utils/phyloTrees/asciiTree.pl ce11.26way.nh | sed -e 's/^/# /;' # (((((((((((((ce11:0.1, # (caePb3:0.1, # (caeRem4:0.1, # cb4:0.1):0.1):0.1):0.1, # caeJap4:0.1):0.1, # caeSp111:0.1):0.1, # caeAng2:0.1):0.1, # caeSp51:0.1):0.1, # hetBac1:0.1):0.1, # (strRat2:0.1, # panRed1:0.1):0.1):0.1, # ((ancCey1:0.1, # necAme1:0.1):0.1, # haeCon2:0.1):0.1):0.1, # ascSuu1:0.1):0.1, # (priExs1:0.1, # priPac3:0.1):0.1):0.1, # ((melHap1:0.1, # melInc2:0.1):0.1, # burXyl1:0.1):0.1):0.1, # (((dirImm1:0.1, # loaLoa1:0.1):0.1, # oncVol1:0.1):0.1, # bruMal2:0.1):0.1):0.1, # (triSpi1:0.1, # triSui1:0.1):0.1); # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ce11.26way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ | sed -e 's/-nosed/_nosed/; s/-eating/_eating/;' > db.to.name.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.name.txt`" ce11.26way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.26way.commonNames.nh cat ce11.26way.commonNames.nh | sed -e 's/^/# /;' # (((((((((((((C__elegans:0.1, # (C__brenneri:0.1, # (C__remanei:0.1, # C__briggsae:0.1):0.1):0.1):0.1, # C__japonica:0.1):0.1, # C__tropicalis:0.1):0.1, # C__angaria:0.1):0.1, # C__sp__5_ju800:0.1):0.1, # H__bacteriophora:0.1):0.1, # (S__ratti_ed321:0.1, # P__redivivus:0.1):0.1):0.1, # ((Hookworm:0.1, # Hookworm:0.1):0.1, # H__contortus:0.1):0.1):0.1, # Pig_roundworm:0.1):0.1, # (P__exspectatus:0.1, # P__pacificus:0.1):0.1):0.1, # ((M__hapla:0.1, # M__incognita:0.1):0.1, # Pine_wood_nematode:0.1):0.1):0.1, # (((Dog_heartworm_nematode:0.1, # Eye_worm:0.1):0.1, # O__volvulus:0.1):0.1, # B__malayi:0.1):0.1):0.1, # (T__spiralis:0.1, # Whipworm:0.1):0.1); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/ce11_26way.png grep TREE 4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin > t.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.26way.scientificNames.nh ~/kent/src/hg/utils/phyloTrees/asciiTree.pl ce11.26way.nh > t.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.26way.scientificNames.nh rm -f t.nh cat ce11.26way.scientificNames.nh | sed -e 's/^/# /;' # (((((((((((((Caenorhabditis_elegans:0.1, # (Caenorhabditis_brenneri:0.1, # (Caenorhabditis_remanei:0.1, # Caenorhabditis_briggsae:0.1):0.1):0.1):0.1, # Caenorhabditis_japonica:0.1):0.1, # Caenorhabditis_tropicalis:0.1):0.1, # Caenorhabditis_angaria:0.1):0.1, # Caenorhabditis_sp5_ju800:0.1):0.1, # Heterorhabditis_bacteriophora:0.1):0.1, # (Strongyloides_ratti:0.1, # Panagrellus_redivivus:0.1):0.1):0.1, # ((Ancylostoma_ceylanicum:0.1, # Necator_americanus:0.1):0.1, # Haemonchus_contortus:0.1):0.1):0.1, # Ascaris_suum:0.1):0.1, # (Pristionchus_exspectatus:0.1, # Pristionchus_pacificus:0.1):0.1):0.1, # ((Meloidogyne_hapla:0.1, # Meloidogyne_incognita:0.1):0.1, # Bursaphelenchus_xylophilus:0.1):0.1):0.1, # (((Dirofilaria_immitis:0.1, # Loa_loa:0.1):0.1, # Onchocerca_volvulus:0.1):0.1, # Brugia_malayi:0.1):0.1):0.1, # (Trichinella_spiralis:0.1, # Trichuris_suis:0.1):0.1); /cluster/bin/phast/all_dists ce11.26way.nh | grep ce11 \ | sed -e "s/ce11.//" | sort -k2n > 26way.distances.txt # Use this output to create the table below cat 26way.distances.txt | sed -e 's/^/# /;' # caeJap4 0.300000 # caePb3 0.300000 # caeRem4 0.400000 # caeSp111 0.400000 # cb4 0.400000 # caeAng2 0.500000 # caeSp51 0.600000 # hetBac1 0.700000 # panRed1 0.900000 # strRat2 0.900000 # ascSuu1 1.000000 # haeCon2 1.000000 # ancCey1 1.100000 # necAme1 1.100000 # priExs1 1.200000 # priPac3 1.200000 # burXyl1 1.300000 # bruMal2 1.400000 # melHap1 1.400000 # melInc2 1.400000 # oncVol1 1.500000 # triSpi1 1.500000 # triSui1 1.500000 # dirImm1 1.600000 # loaLoa1 1.600000 cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "<26way.distances.txt") or die "can not read 26way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/ce11/bed/lastz.$D/fb.ce11." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.ce11/fb.${D}.chainCe11Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %02d %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainLink # N distance on ce11 on other other species # 01 0.3000 (% 27.709) (% 19.533) - C. japonica caeJap4 # 02 0.3000 (% 40.638) (% 32.296) - C. brenneri caePb3 # 03 0.4000 (% 41.715) (% 33.416) - C. remanei caeRem4 # 04 0.4000 (% 37.870) (% 47.362) - C. tropicalis caeSp111 # 05 0.4000 (% 39.384) (% 36.204) - C. briggsae cb4 # 06 0.5000 (% 17.682) (% 20.987) - C. angaria caeAng2 # 07 0.6000 (% 40.332) (% 31.721) - C. sp. 5 ju800 caeSp51 # 08 0.7000 (% 10.486) (% 13.556) - H. bacteriophora/m31e hetBac1 # 09 0.9000 (% 06.585) (% 10.359) - Microworm panRed1 # 10 0.9000 (% 05.669) (% 11.848) - Threadworm strRat2 # 11 1.0000 (% 05.722) (% 02.224) - Pig roundworm ascSuu1 # 12 1.0000 (% 08.936) (% 03.405) - Barber pole worm haeCon2 # 13 1.1000 (% 09.736) (% 03.237) - A. ceylanicum ancCey1 # 14 1.1000 (% 08.902) (% 04.115) - N. americanus necAme1 # 15 1.2000 (% 06.222) (% 04.321) - P. exspectatus priExs1 # 16 1.2000 (% 06.308) (% 04.534) - P. pacificus priPac3 # 17 1.3000 (% 06.291) (% 08.293) - Pine wood nematode burXyl1 # 18 1.4000 (% 05.065) (% 05.521) - Filarial worm bruMal2 # 19 1.4000 (% 03.916) (% 06.840) - M. hapla melHap1 # 20 1.4000 (% 03.516) (% 05.539) - M. incognita melInc2 # 21 1.5000 (% 05.193) (% 05.092) - O. volvulus oncVol1 # 22 1.5000 (% 02.848) (% 04.678) - Trichinella triSpi1 # 23 1.5000 (% 02.917) (% 03.714) - Whipworm triSui1 # 24 1.6000 (% 05.005) (% 05.474) - Dog heartworm dirImm1 # 25 1.6000 (% 05.056) (% 05.274) - Eye worm loaLoa1 # None of this concern for distances matters in building the first step, the # maf files. # Survey N50 stats to determine quality of assembly: echo -e "# db\tcontigCount\tN50\ttotalSize\tperCentAtN50" export T=/tmp/txy.$$ cat species.list.txt | while read D do rm -f ${T} n50.pl /hive/data/genomes/${D}/chrom.sizes > ${T} 2>&1 cc=`grep "contig count:" ${T} | awk '{print $4}' | sed -e 's/,//g;'` ts=`grep "contig count:" ${T} | awk '{print $7}' | sed -e 's/,//g;'` n50Size=`tail -1 ${T} | awk '{print $NF}'` perCent=`echo $n50Size $ts | awk '{printf "%.6f", 100*$1/$2}'` echo -e "# ${D}\t${cc}\t${n50Size}\t${ts}\t% ${perCent}" done | sort -k4,4nr # db contigCount N50 totalSize perCentAtN50 # oncVol1 710 25485961 97402144 % 26.165708 # caeSp111 665 20921866 79321433 % 26.376057 # ce11 7 17493829 100286401 % 17.443870 # cb4 13 17485439 108434085 % 16.125408 # strRat2 135 11693564 43150242 % 27.099649 # triSpi1 6864 6373445 63542128 % 10.030267 # priPac3 18084 1244534 172510819 % 0.721424 # burXyl1 5528 949830 74576239 % 1.273636 # ancCey1 1736 668412 313092710 % 0.213487 # triSui1 4293 503034 74248995 % 0.677496 # caeRem4 3670 435512 145442736 % 0.299439 # ascSuu1 12989 413062 269573965 % 0.153228 # caePb3 3305 381961 190369721 % 0.200642 # hetBac1 1241 312328 76992477 % 0.405660 # panRed1 941 262414 65110236 % 0.403030 # necAme1 11865 211861 244088665 % 0.086797 # bruMal2 9781 200860 95156665 % 0.211083 # loaLoa1 5765 174388 91379422 % 0.190839 # priExs1 4369 142245 177536987 % 0.080121 # caeJap4 18817 94149 166256191 % 0.056629 # haeCon2 23823 83299 369720058 % 0.022530 # caeAng2 34621 79858 105997628 % 0.075339 # dirImm1 16062 71281 88323343 % 0.080705 # melInc2 2996 62516 86079534 % 0.072626 # melHap1 3452 37608 53017507 % 0.070935 # caeSp51 15261 25228 131797386 % 0.019142 # Conclusion: Syntenic Net: oncVol1 caeSp111 cb4 strRat2 triSpi1 # ordinary net: the rest # XXX took a look at the syntenic net alignments, way too much sequence # was missing, therefore always use the 'ordinary' net # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ce11.26way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # should be 26: wc -w species.list # 26 species.list # ce11 caePb3 caeRem4 cb4 caeJap4 caeSp111 caeAng2 caeSp51 hetBac1 # strRat2 panRed1 ancCey1 necAme1 haeCon2 ascSuu1 priExs1 priPac3 melHap1 # melInc2 burXyl1 dirImm1 loaLoa1 oncVol1 bruMal2 triSpi1 triSui1 # bash shell syntax here ... cd /hive/data/genomes/ce11/bed/multiz26way export H=/hive/data/genomes/ce11/bed mkdir mafLinks # using 'ordinary' net for all: for G in `sed -e 's/ce11 //' species.list` do mkdir -p mafLinks/$G echo ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # verify the symLinks are good: ls -ogrtL mafLinks/*/* | sed -e 's#.*/chr#chr#;' | sort | uniq -c # 25 chrI.maf.gz # 25 chrII.maf.gz # 25 chrIII.maf.gz # 25 chrIV.maf.gz # 24 chrM.maf.gz # 25 chrV.maf.gz # 25 chrX.maf.gz # it appears one of them did not have any matching sequence to chrM ls -lR | grep chrM | awk '{print $NF}' \ | sed -e 's#.*lastz.##; s#/mafNet.*##;' | sort \ | xargs echo | fold -w72 -s # ancCey1 ascSuu1 bruMal2 burXyl1 caeAng2 caeJap4 caePb3 caeRem4 caeSp51 # cb4 dirImm1 haeCon2 hetBac1 loaLoa1 melHap1 melInc2 necAme1 oncVol1 # panRed1 priExs1 priPac3 strRat2 triSpi1 triSui1 # compared to all of them: sort species.list.txt | xargs echo | fold -w 72 -s # ancCey1 ascSuu1 bruMal2 burXyl1 caeAng2 caeJap4 caePb3 caeRem4 caeSp111 # caeSp51 cb4 ce11 dirImm1 haeCon2 hetBac1 loaLoa1 melHap1 melInc2 # necAme1 oncVol1 panRed1 priExs1 priPac3 strRat2 triSpi1 triSui1 # looks like it is caeSp111 that has no chrM alignment # found same pattern on ce10, curious. # even though there are 26 of these, they can still be done as # full chromosomes without the splitting procedure mkdir /hive/data/genomes/ce11/bed/multiz26way/fullChromRun cd /hive/data/genomes/ce11/bed/multiz26way/fullChromRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn find ../../mafLinks -type l | grep ".maf.gz$" | xargs -L 1 basename \ | sed -e 's/.gz//;' | sort -u > maf.list wc -l maf.list # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = ce11 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/ce11/bed/multiz26way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/ce11/bed/multiz26way/fullChromRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs ssh ku cd /hive/data/genomes/ce11/bed/multiz26way/fulChromRun/run gensub2 maf.list single template jobList para -ram=32g create jobList # Completed: 7 of 7 jobs # CPU time in finished jobs: 11928s 198.80m 3.31h 0.14d 0.000 y # IO & Wait Time: 29s 0.48m 0.01h 0.00d 0.000 y # Average job time: 1708s 28.47m 0.47h 0.02d # Longest finished job: 2176s 36.27m 0.60h 0.03d # Submission to last job: 2189s 36.48m 0.61h 0.03d cd /hive/data/genomes/ce11/bed/multiz26way du -hsc maf # 1.3G maf # Load into database ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz26way/fullChromRun/maf mkdir /gbdb/ce11/multiz26way ln -s `pwd`/*.maf /gbdb/ce11/multiz26way cd /dev/shm time hgLoadMaf ce11 multiz26way # Loaded 1469313 mafs in 7 files from /gbdb/ce11/multiz26way # real 0m28.115s time cat /gbdb/ce11/multiz26way/*.maf \ | hgLoadMafSummary ce11 -minSize=10000 -verbose=2 \ -mergeGap=500 -maxSize=50000 multiz26waySummary stdin # Created 397940 summary blocks from 12243969 components and 1469313 mafs # from stdin # real 0m38.197s # -rw-rw-r-- 1 70164926 Aug 13 14:34 multiz26way.tab # -rw-rw-r-- 1 17851130 Aug 13 14:35 multiz26waySummary.tab wc -l multiz26way* # 1469313 multiz26way.tab # 397940 multiz26waySummary.tab rm multiz26way*.tab ############################################################################## # GAP ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2015-08-13 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. If there is a single maf file, need to split # into individual per-chrom maf files mkdir -p /hive/data/genomes/ce11/bed/multiz26way/anno cd /hive/data/genomes/ce11/bed/multiz26way/anno # this would split a single file into per-chrom files # mkdir -p /hive/data/genomes/ce11/bed/multiz26way/anno/mafSplit # cd /hive/data/genomes/ce11/bed/multiz26way/anno/mafSplit # time mafSplit -outDirDepth=1 -byTarget -useFullSequenceName \ # /dev/null . ../../multiz26way.maf # already per-chrom files here: find ../fullChromRun/maf -type f | wc -l # 7 # check for N.bed files everywhere: cd /hive/data/genomes/ce11/bed/multiz26way/anno for DB in `cat ../species.list` do if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/ce11/bed/multiz26way/anno # they end up empty for ce11 and melHap1, no gaps: hgsql -e 'select count(*) from gap;' ce11 # +----------+ # | count(*) | # +----------+ # | 0 | # +----------+ hgsql -e 'select count(*) from gap;' melHap1 # +----------+ # | count(*) | # +----------+ # | 0 | # +----------+ cd /hive/data/genomes/ce11/bed/multiz26way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S ce11 # use a screen to control this longish job ssh ku cd /hive/data/genomes/ce11/bed/multiz26way/anno mkdir result cat << '_EOF_' > template #LOOP mafAddIRows -nBeds=nBeds ../fullChromRun/maf/$(path1) /hive/data/genomes/ce11/ce11.2bit {check out exists+ result/$(path1)} #ENDLOOP '_EOF_' # << happy emacs ls ../maf > maf.list gensub2 maf.list single template jobList # limit jobs on a node with the -maxJob=64 requirement because they go fast # XXX - use more memory next time to allow all big jobs to finish para -ram=32g create jobList para try ... check ... push ... # don't run too many at once, these go very fast para -maxJob=64 push # Completed: 7 of 7 jobs # CPU time in finished jobs: 89s 1.48m 0.02h 0.00d 0.000 y # IO & Wait Time: 18s 0.30m 0.01h 0.00d 0.000 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest finished job: 21s 0.35m 0.01h 0.00d # Submission to last job: 25s 0.42m 0.01h 0.00d # verify all result files have some content, look for 0 size files: find ./result -type f -size 0 # should see none # or in this manner: find ./result -type f | xargs ls -og | sort -k3nr | tail # -rw-rw-r-- 1 785220 Aug 13 14:41 ./result/chrM.maf du -hsc result # 2.4G result # construct symlinks to get the individual maf files into gbdb: cd /hive/data/genomes/ce11/bed/multiz26way/anno/result rm /gbdb/ce11/multiz26way/*.maf # remove previous results ln -s `pwd`/*.maf /gbdb/ce11/multiz26way # Load into database cd /dev/shm time hgLoadMaf -pathPrefix=/gbdb/ce11/multiz26way \ ce11 multiz26way # Loaded 1512814 mafs in 7 files from /gbdb/ce11/multiz26way # real 0m42.327s time cat /gbdb/ce11/multiz26way/*.maf \ | hgLoadMafSummary ce11 -minSize=10000 -verbose=2 \ -mergeGap=500 -maxSize=50000 multiz26waySummary stdin # Indexing and tabulating stdin # Created 397940 summary blocks from 12243969 components and # 1512814 mafs from stdin # real 0m51.963s # -rw-rw-r-- 1 72568758 Aug 13 14:44 multiz26way.tab # -rw-rw-r-- 1 18647010 Aug 13 14:45 multiz26waySummary.tab wc -l multiz26way* # 1512814 multiz26way.tab # 397940 multiz26waySummary.tab rm multiz26way*.tab ###################################################################### # MULTIZ7WAY MAF FRAMES (DONE - 2015-09-08 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ce11/bed/multiz26way/frames cd /hive/data/genomes/ce11/bed/multiz26way/frames # survey all the genomes to find out what kinds of gene tracks they have cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "mgcGenes" || $table == "knownGene" || \ $table == "ws245Genes" || $table == "ncbiGenes" || \ $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql ce11 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql ce11 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh time ./showGenes.csh # ce11: ensGene: 57834, refGene: 53470, ws245Genes: 57528, Mrnas: 52714 # caePb3: ws245Genes: 33210, xenoRefGene: 189584, # caeRem4: ws245Genes: 32899, xenoRefGene: 155137, # cb4: ws245Genes: 23050, xenoRefGene: 130721, # caeJap4: ws245Genes: 38144, xenoRefGene: 158453, # caeSp111: ws245Genes: 27721, # caeAng2: ws245Genes: 33934, # caeSp51: ws245Genes: 46280, # hetBac1: ws245Genes: 20928, # strRat2: ncbiGenes: 12459, # panRed1: ws245Genes: 26372, # ancCey1: ws245Genes: 65583, # necAme1: ws245Genes: 19643, # haeCon2: ws245Genes: 26359, # ascSuu1: ws245Genes: 3082, xenoRefGene: 134962, # priExs1: ws245Genes: 24642, xenoRefGene: 0, # priPac3: ws245Genes: 24243, xenoRefGene: 89943, # melHap1: ws245Genes: 14420, xenoRefGene: 88300, # melInc2: ws245Genes: 20365, # burXyl1: ws245Genes: 18074, # dirImm1: ws245Genes: 12857, # loaLoa1: ws245Genes: 15577, # oncVol1: ws245Genes: 12385, # bruMal2: ws245Genes: 18709, # triSpi1: ws245Genes: 16380, # triSui1: ws245Genes: 14435, # from that summary, use these gene sets: # ws245Genes - ce11 caePb3 caeRem4 cb4 caeJap4 caeSp111 caeAng2 caeSp51 # hetBac1 panRed1 ancCey1 necAme1 haeCon2 # priExs1 priPac3 melHap1 melInc2 burXyl1 dirImm1 loaLoa1 # oncVol1 bruMal2 triSpi1 triSui1 # ncbiGenes - strRat2 # ascSuu1 - assembly confusion, can not determine genes mkdir genes # 1. ws245Genes for DB in ce11 caePb3 caeRem4 cb4 caeJap4 caeSp111 caeAng2 caeSp51 hetBac1 panRed1 ancCey1 necAme1 haeCon2 priExs1 priPac3 melHap1 melInc2 burXyl1 dirImm1 loaLoa1 oncVol1 bruMal2 triSpi1 triSui1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ws245Genes" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo -n "$DB: " genePredCheck -db=${DB} genes/${DB}.gp.gz done ce11: checked: 20412 failed: 0 caePb3: checked: 30626 failed: 0 caeRem4: checked: 31433 failed: 0 cb4: checked: 21737 failed: 0 caeJap4: checked: 29913 failed: 0 caeSp111: checked: 22306 failed: 0 caeAng2: checked: 27919 failed: 0 caeSp51: checked: 34679 failed: 0 hetBac1: checked: 20928 failed: 0 panRed1: checked: 24071 failed: 0 ancCey1: checked: 35775 failed: 0 necAme1: checked: 18984 failed: 0 haeCon2: checked: 21837 failed: 0 priExs1: checked: 23767 failed: 0 priPac3: checked: 22872 failed: 0 melHap1: checked: 14103 failed: 0 melInc2: checked: 19189 failed: 0 burXyl1: checked: 18074 failed: 0 dirImm1: checked: 12857 failed: 0 loaLoa1: checked: 14903 failed: 0 oncVol1: checked: 12133 failed: 0 bruMal2: checked: 13471 failed: 0 triSpi1: checked: 16370 failed: 0 triSui1: checked: 13600 failed: 0 # 1. ncbiGenes: strRat2 for DB in strRat2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiGenes" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo -n "$DB: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # strRat2: checked: 12439 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/ancCey1.gp.gz: 35775 # genes/bruMal2.gp.gz: 13471 # genes/burXyl1.gp.gz: 18074 # genes/caeAng2.gp.gz: 27919 # genes/caeJap4.gp.gz: 29913 # genes/caePb3.gp.gz: 30626 # genes/caeRem4.gp.gz: 31433 # genes/caeSp111.gp.gz: 22306 # genes/caeSp51.gp.gz: 34679 # genes/cb4.gp.gz: 21737 # genes/ce11.gp.gz: 20412 # genes/dirImm1.gp.gz: 12857 # genes/haeCon2.gp.gz: 21837 # genes/hetBac1.gp.gz: 20928 # genes/loaLoa1.gp.gz: 14903 # genes/melHap1.gp.gz: 14103 # genes/melInc2.gp.gz: 19189 # genes/necAme1.gp.gz: 18984 # genes/oncVol1.gp.gz: 12133 # genes/panRed1.gp.gz: 24071 # genes/priExs1.gp.gz: 23767 # genes/priPac3.gp.gz: 22872 # genes/strRat2.gp.gz: 12439 # genes/triSpi1.gp.gz: 16370 # genes/triSui1.gp.gz: 13600 time (cat ../anno/result/*.maf \ | genePredToMafFrames ce11 stdin stdout \ `egrep -v "ascSuu1" ../species.list.txt | xargs echo | sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g;"` \ | gzip > multiz26wayFrames.bed.gz) # real 1m40.091s # verify there are frames on everything, should be 15 species: zcat multiz26wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' # 102228 ancCey1 # 62865 bruMal2 # 93717 burXyl1 # 129846 caeAng2 # 162169 caeJap4 # 183626 caePb3 # 219355 caeRem4 # 190425 caeSp111 # 214566 caeSp51 # 204669 cb4 # 123221 ce11 # 62882 dirImm1 # 89757 haeCon2 # 75007 hetBac1 # 62443 loaLoa1 # 53145 melHap1 # 46477 melInc2 # 83965 necAme1 # 65069 oncVol1 # 75620 panRed1 # 59341 priExs1 # 61095 priPac3 # 75605 strRat2 # 43472 triSpi1 # 41941 triSui1 # load the resulting file ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz26way/frames time hgLoadMafFrames ce11 multiz26wayFrames multiz26wayFrames.bed.gz # real 0m29.914s time featureBits -countGaps ce11 multiz26wayFrames # 29102814 bases of 100286401 (29.020%) in intersection # real 0m16.783s # enable the trackDb entries: # frames multiz26wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 26-way (DONE - 2015-08-18 - Hiram) mkdir /hive/data/genomes/ce11/bed/multiz26way/4d cd /hive/data/genomes/ce11/bed/multiz26way/4d # the annotated mafs are in ../anno/result/*.maf # using ensGene for ce11 hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ce11 > ce11.ensGene.gp genePredSingleCover ce11.ensGene.gp stdout | sort > ce11.ensGeneNR.gp wc -l * # 57834 ce11.ensGene.gp # 20412 ce11.ensGeneNR.gp ssh ku mkdir /hive/data/genomes/ce11/bed/multiz26way/4d/run cd /hive/data/genomes/ce11/bed/multiz26way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/ce11/bed/multiz26way" set c = $1:r set infile = $r/anno/result/$2 set outDir = $r/4d/mfa/$3:h set outfile = $r/4d/mfa/$3 /bin/mkdir -p $outDir cd /scratch/tmp /bin/awk -v C=$c '$2 == C {print}' $r/4d/ce11.ensGeneNR.gp | sed -e "s/\t$c\t/\tce11.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` echo $NL if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile else echo "" > $outfile endif /bin/rm -f $c.gp $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh cat << '_EOF_' > template #LOOP 4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs ls ../../anno/result > maf.list gensub2 maf.list single template jobList # -ram=32g to make sure they don't fail out of memory and don't run too fast para -ram=32g create jobList para try ... check # the try runs all 7 jobs para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 95s 1.59m 0.03h 0.00d 0.000 y # IO & Wait Time: 21s 0.35m 0.01h 0.00d 0.000 y # Average job time: 17s 0.28m 0.00h 0.00d # Longest finished job: 21s 0.35m 0.01h 0.00d # Submission to last job: 25s 0.42m 0.01h 0.00d # Not all results have contents, that is OK # combine mfa files ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz26way/4d # none of these results are empty: ls -ogrt mfa # -rw-rw-r-- 1 1417 Aug 18 15:14 chrM.mfa # -rw-rw-r-- 1 5577097 Aug 18 15:14 chrII.mfa # -rw-rw-r-- 1 4732903 Aug 18 15:14 chrIV.mfa # -rw-rw-r-- 1 4185213 Aug 18 15:14 chrIII.mfa # -rw-rw-r-- 1 4402105 Aug 18 15:14 chrX.mfa # -rw-rw-r-- 1 6934921 Aug 18 15:14 chrV.mfa # -rw-rw-r-- 1 4628253 Aug 18 15:14 chrI.mfa #want comma-less species.list time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # real 0m4.558s # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 26 grep "^>" 4d.all.mfa | sed -e 's/^/# /;' # >ce11 # >caePb3 # >caeRem4 # >cb4 # >caeJap4 # >caeSp111 # >caeAng2 # >caeSp51 # >hetBac1 # >strRat2 # >panRed1 # >ancCey1 # >necAme1 # >haeCon2 # >ascSuu1 # >priExs1 # >priPac3 # >melHap1 # >melInc2 # >burXyl1 # >dirImm1 # >loaLoa1 # >oncVol1 # >bruMal2 # >triSpi1 # >triSui1 sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../ce11.26way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh # tree_commas.nh looks like: # (((((((((((((ce11,(caePb3,(caeRem4,cb4))),caeJap4),caeSp111),caeAng2), # caeSp51),hetBac1),(strRat2,panRed1)),((ancCey1,necAme1),haeCon2)), # ascSuu1),(priExs1,priPac3)),((melHap1,melInc2),burXyl1)), # (((dirImm1,loaLoa1),oncVol1),bruMal2)),(triSpi1,triSui1)) # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree_commas.nh 4d.all.mfa # real 102m23.619s mv phyloFit.mod all.mod grep TREE all.mod # TREE: (((((((((((((ce11:0.641433,(caePb3:0.487648, # (caeRem4:0.443246,cb4:0.445294):0.0464154):0.0165647):0.00599635, # caeJap4:0.828929):0.00213273,caeSp111:0.526296):0.00774532, # caeAng2:0.983829):0.0266384,caeSp51:0.415189):0.385011, # hetBac1:1.35734):0.0946305,(strRat2:1.37255, # panRed1:1.10699):0.139767):0.0425575, # ((ancCey1:0.387375,necAme1:0.494722):0.308076, # haeCon2:0.755625):0.31865):0.0426445,ascSuu1:1.32614):0.0433534, # (priExs1:0.112444,priPac3:0.0999032):0.752898):0.0633886, # ((melHap1:0.237743,melInc2:0.205426):1.14699, # burXyl1:0.685906):0.156962):0.334957,(((dirImm1:0.213158, # loaLoa1:0.198781):0.00222742,oncVol1:0.16899):0.0727414, # bruMal2:0.174965):1.0837):0.137915,(triSpi1:0.820134, # triSui1:0.850407):0.137915); # resulting distance table: /cluster/bin/phast/all_dists ce11.26way.nh | grep ce11 \ | sed -e "s/ce11.//" | sort -k2n | sed -e 's/^/# /;' # caeSp51 1.099135 # caePb3 1.145646 # caeRem4 1.147659 # cb4 1.149707 # caeSp111 1.175858 # caeJap4 1.476358 # caeAng2 1.641136 # priPac3 2.144944 # priExs1 2.157485 # burXyl1 2.198399 # ancCey1 2.220246 # haeCon2 2.280420 # necAme1 2.327593 # panRed1 2.410344 # hetBac1 2.426297 # ascSuu1 2.574929 # strRat2 2.675904 # triSpi1 2.786452 # triSui1 2.816725 # melInc2 2.864909 # melHap1 2.897226 # bruMal2 2.949153 # oncVol1 3.015920 # loaLoa1 3.047938 # dirImm1 3.062315 ######################################################################### # phastCons 26-way (DONE - 2015-09-08 - Hiram) # split 26way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh ku mkdir -p /hive/data/genomes/ce11/bed/multiz26way/cons/SS cd /hive/data/genomes/ce11/bed/multiz26way/cons/SS mkdir result done cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/ce11/bed/multiz26way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/ce11/bed/multiz26way/cons/SS/result/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif /bin/date >> $2.running /bin/rm -fr $WINDOWS /bin/mkdir -p $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null /bin/date >> $2 /bin/rm -f $2.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(root1) {check out line+ done/$(root1)} #ENDLOOP '_EOF_' # << happy emacs ls ../../anno/result > maf.list gensub2 maf.list single template jobList # beware overloaded the cluster with these fast running high I/O jobs # however, in this case, there are only 7 para -ram=32g create jobList para try # this runs all 7 para time > run.time # Completed: 7 of 7 jobs # CPU time in finished jobs: 161s 2.68m 0.04h 0.00d 0.000 y # IO & Wait Time: 15s 0.25m 0.00h 0.00d 0.000 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest finished job: 34s 0.57m 0.01h 0.00d # Submission to last job: 45s 0.75m 0.01h 0.00d find ./result -type f | wc -l # 14 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/ce11/bed/multiz26way/cons/run.cons cd /hive/data/genomes/ce11/bed/multiz26way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. Running: # all and vertebrates cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set d = $2 set f = $3 set len = $4 set cov = $5 set rho = $6 set grp = $cwd:t set cons = /hive/data/genomes/ce11/bed/multiz26way/cons set tmp = $cons/tmp/${d}_${c} mkdir -p $tmp set ssSrc = $cons/SS/result set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$d/$f $tmp else ln -s $ssSrc/$d/$f $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp else $PHASTBIN/phastCons $f $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp endif popd > /dev/null mkdir -p pp/$d bed/$d sleep 4 touch pp/$d bed/$d rm -f pp/$d/$c.pp rm -f bed/$d/$c.bed mv $tmp/$c.pp pp/$d mv $tmp/$c.bed bed/$d rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list wc -l ss.list # 14 ss.list # Create parasol batch and run it # run for all species mkdir /hive/data/genomes/ce11/bed/multiz26way/cons/all cd /hive/data/genomes/ce11/bed/multiz26way/cons/all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList # beware overwhelming the cluster with these fast running high I/O jobs # however, in this case, there are only 14 jobs para -ram=32g create jobList para try ... check ... # Completed: 14 of 14 jobs # CPU time in finished jobs: 412s 6.87m 0.11h 0.00d 0.000 y # IO & Wait Time: 81s 1.35m 0.02h 0.00d 0.000 y # Average job time: 35s 0.59m 0.01h 0.00d # Longest finished job: 51s 0.85m 0.01h 0.00d # Submission to last job: 108s 1.80m 0.03h 0.00d # create Most Conserved track cd /hive/data/genomes/ce11/bed/multiz26way/cons/all time cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do echo ${D}/${C}*.bed 1>&2 cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m40.828s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # -rw-rw-r-- 1 13454157 Sep 8 11:44 tmpMostConserved.bed # -rw-rw-r-- 1 13671554 Sep 8 11:45 mostConserved.bed wc -l *.bed # 406479 mostConserved.bed # 406479 tmpMostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz26way/cons/all time hgLoadBed ce11 phastConsElements26way mostConserved.bed # Read 406479 elements of size 5 from mostConserved.bed # real 0m3.735s # on human we often try for 5% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits ce11 -enrichment ensGene:cds phastConsElements26way # ensGene:cds 25.321%, phastConsElements26way 41.217%, both 19.293%, # cover 76.20%, enrich 1.85x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/ce11/bed/multiz26way/cons/all mkdir downloads # the third sed fixes the chrom names, removing the partition extensions time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ | gzip -c > downloads/phastCons26way.wigFix.gz) # real 0m43.836s # check integrity of data with wigToBigWig time (zcat downloads/phastCons26way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/ce11/chrom.sizes \ phastCons26way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=22748: VmPeak: 676500 kB # real 0m49.827s bigWigInfo phastCons26way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 118,518,400 # primaryIndexSize: 2,746,816 # zoomLevels: 10 # chromCount: 7 # basesCovered: 58,852,043 # mean: 0.686605 # min: 0.000000 # max: 1.000000 # std: 0.385078 # encode those files into wiggle data time (zcat downloads/phastCons26way.wigFix.gz \ | wigEncode stdin phastCons26way.wig phastCons26way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 0m17.307s du -hsc *.wi? # 57M phastCons26way.wib # 7.8M phastCons26way.wig # -rw-rw-r-- 1 58852043 Sep 8 11:49 phastCons26way.wib # -rw-rw-r-- 1 8107673 Sep 8 11:49 phastCons26way.wig # Load gbdb and database with wiggle. ln -s `pwd`/phastCons26way.wib /gbdb/ce11/multiz26way/phastCons26way.wib time hgLoadWiggle -pathPrefix=/gbdb/ce11/multiz26way \ ce11 phastCons26way phastCons26way.wig # real 0m1.059s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh ce11 phastCons26way # db.table min max mean count sumData # ce11.phastCons26way 0 1 0.686605 58852043 4.04081e+07 # stdDev viewLimits # 0.385078 viewLimits=0:1 # Create histogram to get an overview of all the data time hgWiggle -doHistogram -db=ce11 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons26way > histogram.data 2>&1 # real 0m3.288s # create plot of histogram: cat << '_EOF_' | gnuplot > ce11.26way.phastCons.histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " C. elegans - ce11 Histogram phastCons26way track" set xlabel " phastCons26way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display ce11.26way.phastCons.histo.png & ######################################################################### # phyloP for 26-way (DONE - 2015-09-08 - Hiram) # run phyloP with score=LRT ssh ku mkdir -p /cluster/data/ce11/bed/multiz26way/consPhyloP/run.phyloP cd /cluster/data/ce11/bed/multiz26way/consPhyloP/run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.567 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.567 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.216500 0.283500 0.283500 0.216500 cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set d = $f:h set file1 = $f:t set out = $2 set cName = $f:t:r set grp = $cwd:t set cons = /hive/data/genomes/ce11/bed/multiz26way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "/hive/data/genomes/ce11/bed/multiz26way/cons/SS/result/$f" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$file1.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list # make sure the list looks good wc -l ss.list # 14 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/ce11/bed/multiz26way/consPhyloP/all cd /hive/data/genomes/ce11/bed/multiz26way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overloading the cluster with these quick and high I/O jobs # -ram=32g will ensure they can all complete and will not run too many # at the same time, there are only 14 here anyway para -ram=32g create jobList para try ... check ... push ... etc ... para time > run.time # Completed: 14 of 14 jobs # CPU time in finished jobs: 10816s 180.26m 3.00h 0.13d 0.000 y # IO & Wait Time: 88s 1.47m 0.02h 0.00d 0.000 y # Average job time: 779s 12.98m 0.22h 0.01d # Longest finished job: 1283s 21.38m 0.36h 0.01d # Submission to last job: 1294s 21.57m 0.36h 0.01d # back on dev cd /hive/data/genomes/ce11/bed/multiz26way/consPhyloP/all mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP26way.wigFix.gz) # real 0m54.344s # -rw-rw-r-- 1 102990761 Sep 8 12:40 phyloP26way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phyloP26way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/ce11/chrom.sizes \ phyloP26way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=39486: VmPeak: 676500 kB # real 0m54.488s bigWigInfo phyloP26way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 136,567,148 # primaryIndexSize: 2,746,816 # zoomLevels: 10 # chromCount: 7 # basesCovered: 58,852,043 # mean: 0.970667 # min: -3.992000 # max: 9.159000 # std: 1.577332 # encode those files into wiggle data time (zcat downloads/phyloP26way.wigFix.gz \ | wigEncode stdin phyloP26way.wig phyloP26way.wib) # Converted stdin, upper limit 9.16, lower limit -3.99 # real 0m19.281s du -hsc *.wi? # 57M phyloP26way.wib # 8.0M phyloP26way.wig # -rw-rw-r-- 1 58852043 Sep 8 12:46 phyloP26way.wib # -rw-rw-r-- 1 8322927 Sep 8 12:46 phyloP26way.wig # Load gbdb and database with wiggle. ln -s `pwd`/phyloP26way.wib /gbdb/ce11/multiz26way/phyloP26way.wib time hgLoadWiggle -pathPrefix=/gbdb/ce11/multiz26way ce11 \ phyloP26way phyloP26way.wig # real 0m0.842s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh ce11 phyloP26way # db.table min max mean count sumData # ce11.phyloP26way -3.992 9.159 0.970667 58852043 5.71257e+07 # stdDev viewLimits # 1.57733 viewLimits=-3.992:8.85733 # that range is: 9.159+3.992 = 13.151 for hBinSize=0.013151 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.013151 -hBinCount=1000 -hMinVal=-3.992 -verbose=2 \ -db=ce11 phyloP26way > histogram.data 2>&1 # real 0m3.666s # find the Y range for the 2:5 graph grep "^[0-9]" histogram.data | ave -col=5 stdin | sed -e 's/^/# /;' # Q1 0.000065 # median 0.000179 # Q3 0.000924 # average 0.001026 # min 0.000000 # max 0.030633 # count 975 # total 0.999992 # standard deviation 0.002321 # find the X range for the 2:5 graph grep "^[0-9]" histogram.data | ave -col=2 stdin | sed -e 's/^/# /;' # Q1 -0.533287 # median 2.662410 # Q3 5.884400 # average 2.660747 # min -3.992000 # max 9.159000 # count 975 # total 2594.228452 # standard deviation 3.707122 # create plot of histogram: cat << '_EOF_' | gnuplot > ce11.26way.phyloP.histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " C. elegans ce11 Histogram phyloP26way track" set xlabel " phyloP26way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set xrange [-2:4] set yrange [0:0.015] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display ce11.26way.phyloP.histo.png ############################################################################# # construct download files for 26-way (DONE - 2015-09-09 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz26way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phastCons26way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phyloP26way mkdir /hive/data/genomes/ce11/bed/multiz26way/downloads cd /hive/data/genomes/ce11/bed/multiz26way/downloads mkdir multiz26way phastCons26way phyloP26way cd multiz26way # using the separate per-chrom maf files as more # convenient than the single one time rsync -a -P ../../anno/result/ . # sent 2499852297 bytes received 148 bytes 161280802.90 bytes/sec # total size is 2499546722 speedup is 1.00 # real 0m15.117s du -hsc *.maf # 2.4G maf time gzip *.maf # real 2m32.825s du -hsc maf.gz # 333M maf grep TREE ../../4d/all.mod | sed -e 's/TREE: //' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.26way.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh ce11.26way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ | sed -e 's/__/_/;' > ce11.26way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh ce11.26way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.26way.scientificNames.nh time md5sum *.nh > md5sum.txt # real 1m55.320s ln -s `pwd`/*.nh `pwd`/*.txt `pwd`/*.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz26way # obtain the README.txt from ce10/multiz9way and update for this # situation ##################################################################### cd /hive/data/genomes/ce11/bed/multiz26way/downloads/phastCons26way ln -s ../../cons/all/downloads/phastCons26way.wigFix.gz \ ./ce11.phastCons26way.wigFix.gz ln -s ../../cons/all/phastCons26way.bw ./ce11.phastCons26way.bw ln -s ../../cons/all/all.mod ./ce11.phastCons26way.mod time md5sum *.gz *.mod *.bw > md5sum.txt # real 0m33.880s # obtain the README.txt from ce10/phastCons9way and update for this # situation ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phastCons26way ##################################################################### cd /hive/data/genomes/ce11/bed/multiz26way/downloads/phyloP26way ln -s ../../consPhyloP/all/downloads/phyloP26way.wigFix.gz \ ./ce11.phyloP26way.wigFix.gz ln -s ../../consPhyloP/run.phyloP/all.mod ce11.phyloP26way.mod ln -s ../../consPhyloP/all/phyloP26way.bw ce11.phyloP26way.bw time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m40.489s # obtain the README.txt from ce10/phyloP9way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phyloP26way ########################################################################### ## create upstream refGene maf files cd /hive/data/genomes/ce11/bed/multiz26way/downloads/multiz26way # going to construct upstream for ws245Genes, refGene and ensGene # bash script #!/bin/sh export geneTbl="ws245Genes" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ce11 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags ce11 multiz26way \ stdin stdout \ -orgs=/hive/data/genomes/ce11/bed/multiz26way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 13m46.813s # and run up the refGene upstreams #!/bin/sh export geneTbl="refGene" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ce11 ${geneTbl}:upstreamAll:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags ce11 multiz26way \ stdin stdout \ -orgs=/hive/data/genomes/ce11/bed/multiz26way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 63m58.714s # and run up the ensGene upstreams #!/bin/sh export geneTbl="ensGene" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ce11 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags ce11 multiz26way \ stdin stdout \ -orgs=/hive/data/genomes/ce11/bed/multiz26way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 14m4.262s # obtain the README.txt from ce9/multiz10way and update for this # situation # information for table of species in the README files, need to # edit it in after adding it to the end of this file: cat ../../species.list | tr '[ ]' '[\n]' | while read D do netType=`ls ../../mafLinks/${D}/ce11.${D}.*.maf.gz | sed -e "s#.*ce11.${D}.##; s#.maf.gz##;" | sed -e 's/synNet/syntenic/; s/rbest/reciprocal best/;'` info=`hgsql -N -e "select organism,\" - \",scientificName,description from dbDb where name=\"$D\";" hgcentraltest` echo "${info} ${netType}" done | tr '[\t]' '[ ]' >> README.txt C. elegans Caenorhabditis elegans Aug. 2014 (WBcel245/ce11) reference A. ceylanicum Ancylostoma ceylanicum Mar. 2014 (WS243/Acey_2013.11.30.genDNA/ancCey1) Barber pole worm Haemonchus contortus Jul. 2013 (WormBase WS239/haeCon2) C. angaria Caenorhabditis angaria Mar. 2012 (WS245/caeAng2) C. brenneri Caenorhabditis brenneri Nov. 2010 (C. brenneri 6.0.1b/caePb3) C. briggsae Caenorhabditis briggsae Apr. 2011 (WS225/cb4) C. japonica Caenorhabditis japonica Aug. 2010 (WUSTL 7.0.1/caeJap4) C. remanei Caenorhabditis remanei Jul. 2007 (WS220/caeRem4) C. sp. 5 ju800 Caenorhabditis sp5 ju800 Jan. 2012 (WS230/Caenorhabditis_sp_5-JU800-1.0/caeSp51) C. tropicalis Caenorhabditis tropicalis Nov. 2010 (WUSTL 3.0.1/caeSp111) Dog heartworm Dirofilaria immitis Sep. 2013 (WS240/D. immitis v2.2/dirImm1) Eye worm Loa loa Jul. 2012 (WS235/L_loa_Cameroon_isolate/loaLoa1) Filarial worm Brugia malayi May. 2014 (WS244/B_malayi-3.1/bruMal2) H. bacteriophora/m31e Heterorhabditis bacteriophora Aug. 2011 (WS229/H. bacteriophora 7.0/hetBac1) M. hapla Meloidogyne hapla Sep. 2008 (M. hapla VW9 WS210/melHap1) M. incognita Meloidogyne incognita Feb. 2008 (M. incognita WS245/PRJEA28837/melInc2) Microworm Panagrellus redivivus Feb. 2013 (Pred3/panRed1) N. americanus Necator americanus Dec. 2013 (WS242/N_americanus_v1/necAme1) O. volvulus Onchocerca volvulus Nov. 2013 (WS241/O_volvulus_Cameroon_v3/oncVol1) P. exspectatus Pristionchus exspectatus Mar. 2014 (WS243/P_exspectatus_v1/priExs1) P. pacificus Pristionchus pacificus Aug. 2014 (P_pacificus-v2/priPac3) Pig roundworm Ascaris suum Sep. 2012 (WS229/AscSuum_1.0/ascSuu1) Pine wood nematode Bursaphelenchus xylophilus Nov. 2011 (WS229/B. xylophilus Ka4C1/burXyl1) Threadworm Strongyloides ratti Sep. 2014 (S. ratti ED321/strRat2) Trichinella Trichinella spiralis Jan. 2011 (WS225/Trichinella_spiralis-3.7.1/triSpi1) Whipworm Trichuris suis Jul. 2014 (WS243/T. suis DCEP-RM93M male/triSui1) # some other symlinks were already made above ln -s `pwd`/upstream*.gz README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz26way ############################################################################# # hgPal downloads (DONE - 2015-09-09 - Hiram) # FASTA from 26-way for ws245Genes, refGene and ensGene ssh hgwdev screen -S ce11HgPal mkdir /hive/data/genomes/ce11/bed/multiz26way/pal cd /hive/data/genomes/ce11/bed/multiz26way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list export mz=multiz26way export gp=ws245Genes export db=ce11 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x ./$gp.jobs > $gp.jobs.log 2>&1 # real 23m34.032s time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m18.841s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 2m8.738s export mz=multiz26way export gp=ws245Genes export db=ce11 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ### need other gene track alignments also # running up refGene cd /hive/data/genomes/ce11/bed/multiz26way/pal export mz=multiz26way export gp=refGene export db=ce11 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 # real 22m52.785s export mz=multiz26way export gp=refGene export db=ce11 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m17.738s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 1m55.287s du -hsc exonAA exonNuc $gp.*fa.gz # 88M exonAA # 172M exonNuc # 88M refGene.multiz26way.exonAA.fa.gz # 172M refGene.multiz26way.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment export mz=multiz26way export gp=refGene export db=ce11 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ # running up ensGene cd /hive/data/genomes/ce11/bed/multiz26way/pal export mz=multiz26way export gp=ensGene export db=ce11 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 XXX - running - Wed Sep 9 10:57:22 PDT 2015 # real 24m14.022s export mz=multiz26way export gp=ensGene export db=ce11 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m19.157s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 2m4.686s du -hsc exonAA exonNuc $gp.*fa.gz # 88M exonAA # 187M exonNuc # 88M ensGene.multiz26way.exonAA.fa.gz # 187M ensGene.multiz26way.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment export mz=multiz26way export gp=ensGene export db=ce11 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ ############################################################################# # wiki page for 26-way (DONE - 2015-07-28 - Hiram) mkdir /hive/users/hiram/bigWays/ce11.26way cd /hive/users/hiram/bigWays echo "ce11" > ce11.26way/ordered.list awk '{print $1}' /hive/data/genomes/ce11/bed/multiz26way/26way.distances.txt \ >> ce11.26way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They may already be done. ./sizeStats.sh ce11.26way/ordered.list # dbDb.sh constructs ce11.26way/Ce11_26-way_conservation_alignment.html ./dbDb.sh ce11 26way # sizeStats.pl constructs ce11.26way/Ce11_26-way_Genome_size_statistics.html ./sizeStats.pl ce11 26way # defCheck.pl constructs Ce11_26-way_conservation_lastz_parameters.html ./defCheck.pl ce11 26way # this constructs the html pages in ce11.26way/: # -rw-rw-r-- 1 4153 Jun 5 11:03 Ce11_26-way_conservation_alignment.html # -rw-rw-r-- 1 5833 Jun 5 11:04 Ce11_26-way_Genome_size_statistics.html # -rw-rw-r-- 1 3854 Jun 5 11:04 Ce11_26-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # Ce11_26-way_conservation_alignment # Ce11_26-way_Genome_size_statistics # Ce11_26-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. #############################################################################