#############################################################################
## 369-Way Multiz (TBD - 2018-12-10 - Hiram)
    ssh hgwdev
    # the phylo tree was done by kmer counting
    ssh ku
    mkdir /hive/data/genomes/staAur2/bed/kmers
    cd /hive/data/genomes/staAur2/bed/kmers
    # working on this set of sequences:
    ls -d ../sequences/GC* | sed -e 's#.*sequences/##;' > accession.list

    # with this template
#LOOP
kmerOne $(file1) {check out exists+ $(file1)/$(file1).31mers.profile.txt.gz}
#ENDLOOP

    # this script does the kmer preparation, specifically set to 31 here:
#!/bin/bash

set -beEu -o pipefail

export SORT="$HOME/bin/x86_64/gnusort -S32G --parallel=4 -T /dev/shm"

db=$1
# select the primary genomic fasta, skipping the *_from_* duplicates
fa=`ls ../sequences/$db/*_genomic.fna.gz | grep -v "_from_"`

if [ ! -s "${db}/${db}.31mers.profile.txt.gz" ]; then
  mkdir -p "${db}"
  ./kmerPrint.pl 31 ${fa} | gzip -c > ${db}/${db}.31mers.txt.gz
  # count each distinct 31mer, output: kmer<TAB>count
  zcat ${db}/${db}.31mers.txt.gz | cut -f1 | ${SORT} | uniq -c \
    | awk '{printf "%s\t%d\n", $2, $1}' | ${SORT} | gzip -c \
       > ${db}/${db}.31mers.profile.txt.gz
  rm -f ${db}/${db}.31mers.txt.gz
fi

    # first batch:
    gensub2 accession.list single template kmer31.jobList
    # NOTE(review): original transcript read "para -ram-32g"; the parasol
    # option syntax is -ram=32g as used for the second batch below
    para -ram=32g create kmer31.jobList
    para try ... check ... push ... etc...
    para time
# Completed: 369 of 369 jobs
# CPU time in finished jobs:      42269s     704.48m    11.74h    0.49d  0.001 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                  65s       1.09m     0.02h    0.00d
# Longest finished job:             153s       2.55m     0.04h    0.00d
# Submission to last job:           853s      14.22m     0.24h    0.01d

    # in this same directory, on ku, clear this previous batch:
    cd /hive/data/genomes/staAur2/bed/kmers
    para clearSickNodes; para flushResults; para resetCounts; para freeBatch

    # the next job list is made by this perl script.  It creates a comparison
    # between each kmer result.  The number of jobs is going to be:
    # N * ( N / 2 ) since it is the sum of: (N-1)+(N-2)+(N-3)...
+ 1 #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: ./allByAllPairs.pl N\n"; printf STDERR "A_albimanus 68.19 A_aquasalis 68.846969\n"; printf STDERR "A_albimanus 70.85 A_arabiensis 54.805332\n"; printf STDERR "A_aquasalis 71.54 A_arabiensis 54.812861\n"; exit 255; } my $N = shift; my @workList; my $listCount = 0; open (FH, "ls -drt */*${N}mers.profile.txt.gz | sed -e 's/.gz//;' | sort |") or die "can not read ls -drt */*.profile.txt"; while (my $line = ) { chomp $line; $line =~ s#/.*##; $workList[$listCount++] = $line; } close (FH); foreach my $db (@workList) { printf STDERR "%s\n", $db; } my @remainingList = @workList; for (my $i = 0; $i < $listCount; ++$i) { my $s1 = $workList[$i]; for (my $j = $i+1; $j < $listCount; ++$j) { my $s2 = $workList[$j]; printf "runOne %s %s %d {check out exists+ compare.%d/%s/%s.txt}\n", $s1, $s2, $N, $N, $s1, $s2; } } printf STDERR "# listCount: %d, job count = %d * (%d / 2) = %d\n", $listCount, $listCount, $listCount, $listCount*($listCount/2); ############################################################################### ./mkJobList.pl > jobList.kmer31.compare para -ram=32g create jobList.kmer31.compare para try ... check ... push ... etc... 
time # Completed: 67896 of 67896 jobs # CPU time in finished jobs: 2277675s 37961.26m 632.69h 26.36d 0.072 y # IO & Wait Time: 260509s 4341.81m 72.36h 3.02d 0.008 y # Average job time: 37s 0.62m 0.01h 0.00d # Longest finished job: 70s 1.17m 0.02h 0.00d # Submission to last job: 2649s 44.15m 0.74h 0.03d # collect all the results in one file: find ./compare.31 -type f | xargs cat > allByAll.kmers31.compared.txt # this matrixUp.pl script constructs an N by N 'similarity' matrix ~/kent/src/hg/makeDb/doc/staAur2/matrixUp.pl > data31mer.csv # AND it writes the file: matrixKey.txt which is a naming index # of the rows and columns in the matrix in data31mer.csv, rename it: mv matrixKey.txt matrixKey.31mer.txt # prepare for running the phylip command 'neighbor' # construct a 'distance' matrix input for phylip from the 'similarity' # matrix. This perl script reads the data31mer.csv and matrixKey # to construct the infile for 'neighbor'. It needs to mangle the # sequence names down to a length of 10 characters since that is # all that 'neighbor' can handle. 
It writes to STDERR the translation # two column file to restore the names from the result #!/usr/bin/env perl use strict; use warnings; my %orderKey; # key is number, value is name my $keyCount = 0; my %shortName; # key is long name, value is short name my %shortKey; # key is short name, value is number my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: ./mkTestData.pl N\n"; printf STDERR "where N is Nmer, will be reading:\n"; printf STDERR " ../matrixKey.Nmer.txt and ../dataNmer.csv\n"; exit 255; } my $N = shift; # -rw-rw-r-- 1 7270 Dec 23 13:36 matrixKey.31mer.txt # -rw-rw-r-- 1 1049999 Dec 23 13:36 data31mer.csv # 24 A_stephensi # 25 Aedes_aegypti # 26 Aedes_albopictus open (FH, "<../matrixKey.${N}mer.txt") or die "can not read ../matrixKey.${N}mer.txt"; while (my $line = ) { chomp $line; my ($key, $name) = split('\s+', $line); $orderKey{$key} = $name; if ($name =~ m/_/) { my @nameParts = split('_', $name); my $shortName = $nameParts[1]; $shortName =~ s/0//; $shortName =~ s/v//g; for (my $i = 1; $i < 4; ++$i) { my $tryName = sprintf("%s%d", $shortName, $i); if (! defined($shortKey{$tryName})) { $shortName = $tryName; last; } } printf STDERR "short name is longer than 10 chars ? $shortName\n" if (length($shortName) > 10); die "name conflict: $name" if (defined($shortKey{$name})); $shortName{$name} = $shortName; $shortKey{$keyCount} = $shortName; } else { printf STDERR "short name is longer than 10 chars ? 
$name\n" if (length($name) > 10); die "name conflict: $name" if (defined($shortKey{$name})); $shortName{$name} = $name; $shortKey{$keyCount} = $name; } ++$keyCount; } close (FH); for (my $i = 0; $i < $keyCount; ++$i) { printf STDERR "%s\t%s\n", $shortName{$orderKey{$i}}, $orderKey{$i}; } my @rows; my $rowCount = 0; open (FH, "<../data${N}mer.csv") or die "can not read ../data${N}mer.csv"; while (my $line = ) { chomp $line; my @a = split(',', $line); $rows[$rowCount++] = \@a } close (FH); for (my $i = 0; $i < $keyCount; ++$i) { for (my $j = 0; $j < $keyCount; ++$j) { next if ($i == $j); my $ave = ($rows[$i]->[$j] + $rows[$j]->[$i]) / 2; $rows[$i]->[$j] = $ave; $rows[$j]->[$i] = $ave; } } printf "%5d\n", $keyCount; for (my $i = 0; $i < $keyCount; ++$i) { printf "%-10.10s", $shortKey{$i}; for (my $j = 0; $j < $keyCount; ++$j) { my $value = $rows[$i]->[$j]; printf "%8.4f", (100.0 - $value) / 100.0; } printf "\n"; } ########################################################################### ./mkTestData.pl > infile 2> name.31mer.translate.txt # running 'neighbor' select the UPGMA option, results # end up in outfile and outtree. There doesn't seem to be # any command line options to this command, it starts with # an interactive menu where option selections are made. 
./neighbor # rename the results: mv outfile outfile.31mer mv outtree outtree.31mer # restore the full sequence names: cat outtree.31mer \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=name.31mer.translate.txt -noInternal -lineOutput /dev/stdin \ > upgma.31mer.nh # then from GCF names to the sequence names sed -e 's/\./v/;' ../../nameCorrespond/gcToShortName.tab > gc.shortName.tab cat upgma.31mer.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=gc.shortName.tab -noInternal -lineOutput /dev/stdin \ > upgma.31mer.shortName.nh # take that result to TreeGraph2 to reorganize to get the staAur2 # reference at the top: Sa_USA300_FPR3757 # the result is the staAur2.369way.nh used to run the multiz mkdir /hive/data/genomes/staAur2/bed/multiz369way cd /hive/data/genomes/staAur2/bed/multiz369way # the resulting staAur2.369way.nh tree from the above kmer procedure starts: head staAur2.369way.nh | sed -e 's/^/# /;' # ((((((((((((((((((((((((((staAur2:0.00475, # (Sa_USA300_SUR12:0.00381, # (Sa_USA300_TCH1516:0.00335, # ((Sa_JE2:0.00055, # Sa_C2406:0.00055):0.00086, # (Sa_USA300_NRS384:0.00129, # (Sa_UA_S391_USA300:0.00081, # ((Sa_CIT:0.000001, # Sa_OXLIM:0.000001):0.00068, # (Sa_33b:0.00029, # and ends, note the Bacillus_subtilis and E_coli here for outgroups # Sa_NRS271:0.16326):0.00391, # Sa_MOZ66:0.16717):0.01913):0.02373):0.0805, # Sa_K17:0.29052):0.18744):0.02154, # Bacillus_subtilis_168:0.4995):0.00045, # (((((E_coli_K_12_MG1655:0.22665, # E_coli_O157_H7_Sakai_RIMD_0509952:0.22665):0.0382, # E_coli_UMN026:0.26485):0.02508, # E_coli_O83_H1_NRG_857C:0.28993):0.0484, # E_coli_IAI39:0.33834):0.14118, # E_coli_O104_H4_2011C_3493:0.47952):0.02043):0.1:0.1; # convert it to other naming schemes: cut -f1,4 \ ../../nameCorrespond/taxId.acc.asmId.shortName.sciName.noDot.tab \ | awk '{printf "%s\t%s\n", $2,$1}' > 
shortName.to.taxId.tab
    sed -e 's/\./v/;' ../../nameCorrespond/gcToShortName.tab \
      | awk '{printf "%s\t%s\n", $2, $1}' > shortName.to.GCFname.tab
    awk '{printf "%s\t%s\n", $2, $1}' ../../nameCorrespond/gcToShortName.tab \
      > shortName.to.GCFaccession.tab

    cat staAur2.369way.nh \
      | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
      | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
        -nameTranslate=shortName.to.taxId.tab -noInternal -lineOutput /dev/stdin \
          > staAur2.369way.taxId.nh

    cat staAur2.369way.nh \
      | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
      | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
        -nameTranslate=shortName.to.GCFaccession.tab -noInternal \
          -lineOutput /dev/stdin \
            > staAur2.369way.accession.nh

 #########################################################################
    # extract species list from that .nh file
    # this sed only works on this type of ascii printout that has one
    # name per line.
    sed -e 's/:.*//; s/(//g; s/ //g;' staAur2.369way.nh \
      | xargs echo > species.list.txt

    # Use this specification in the phyloGif tool:
    #   http://genome.ucsc.edu/cgi-bin/phyloGif
    # to obtain a png image for src/hg/htdocs/images/phylo/staAur2_369way.png

    # all distances for sizeStats.pl below:
    /cluster/bin/phast/all_dists staAur2.369way.nh | grep staAur2 \
      | sed -e "s/staAur2.//" | sort -k2n > 369way.distances.txt

    # this is a specialized sizeStats.pl from the usual kind
    cat > sizeStats.pl << '_EOF_'
#!/usr/bin/env perl

use strict;
use warnings;

# Summarize chainLink / chainSyn / chainRBest featureBits measurements for
# every assembly in 369way.distances.txt, one table row per assembly.
my %dbToName;  # key is name from 369way.distances.txt, value is sciName
# NOTE(review): the open() argument here was lost in transcription; the
# loop expects a two-column  db<TAB>sciName  file -- confirm the actual
# filename against the original doc before re-running
open (FH, "<db.sciName.tab") or die "can not read db.sciName.tab";
while (my $line = <FH>) {
  chomp $line;
  my ($db, $name) = split('\s+',$line);
  $name =~ s/__/. /;   # restore the 'G. species' abbreviation dot
  $dbToName{$db} = $name;
}
close (FH);

open (FH, "<369way.distances.txt") or die "can not read 369way.distances.txt";
my $count = 0;
while (my $line = <FH>) {
  chomp $line;
  my ($D, $dist) = split('\s+', $line);
  my $Db = ucfirst($D);
  my $chain = "chain" . ucfirst($D);
  # GCF/GCA accessions (names starting with G) live under awsMultiz/lastzRun
  my $B="/hive/data/genomes/staAur2/bed/lastz.$D/fb.staAur2." . $chain . "Link.txt";
  if ( $D =~ m/^G/) {
     $B=`ls /hive/data/genomes/staAur2/bed/awsMultiz/lastzRun/lastz_$D/fb.staAur2.chain.${D}*`;
     chomp $B;
  }
  my $chainLinkMeasure =
    `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
  chomp $chainLinkMeasure;
  $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
  $chainLinkMeasure =~ s/\%//;
  my $chainSynMeasure = "";
  if ( $D =~ m/^G/) {
     $B=`ls /hive/data/genomes/staAur2/bed/awsMultiz/lastzRun/lastz_$D/fb.staAur2.chainSyn.${D}*`;
     chomp $B;
  } else {
     $B="/hive/data/genomes/staAur2/bed/lastz.${D}/fb.staAur2.chainSyn${Db}Link.txt";
  }
  $chainSynMeasure =
    `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
  chomp $chainSynMeasure;
  $chainSynMeasure = 0.0 if (length($chainSynMeasure) < 1);
  $chainSynMeasure =~ s/\%//;
  my $chainRBestMeasure = "";
  if ( $D =~ m/^G/) {
     $B=`ls /hive/data/genomes/staAur2/bed/awsMultiz/lastzRun/lastz_$D/fb.staAur2.chainRBest.${D}*`;
     chomp $B;
  } else {
     $B="/hive/data/genomes/staAur2/bed/lastz.${D}/fb.staAur2.chainRBest.${Db}.txt";
  }
  $chainRBestMeasure =
    `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
  chomp $chainRBestMeasure;
  $chainRBestMeasure = 0.0 if (length($chainRBestMeasure) < 1);
  $chainRBestMeasure =~ s/\%//;
  # NOTE(review): original read fb.${D}.chainGalGal6Link.txt -- a leftover
  # from the chicken (galGal6) doc this script was copied from; corrected
  # to the staAur2 swap measurement
  my $swapFile="/hive/data/genomes/${D}/bed/lastz.staAur2/fb.${D}.chainStaAur2Link.txt";
  my $swapMeasure = "0";
  if ( -s $swapFile ) {
    $swapMeasure =
      `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $swapMeasure;
    $swapMeasure = 0.0 if (length($swapMeasure) < 1);
    $swapMeasure =~ s/\%//;
  }
  my $orgName=
  `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
  chomp $orgName;
  if (length($orgName) < 1) {
    if (defined($dbToName{$D})) {
      $orgName = $dbToName{$D};
    } else {
      $orgName="N/A";
    }
  }
  ++$count;
  # percent of the chainLink coverage NOT confirmed by recip-best;
  # guard against division by zero when no measurement file was found
  my $percentAlike = ($chainLinkMeasure > 0)
    ? 100.0 * ($chainLinkMeasure - $chainRBestMeasure) / $chainLinkMeasure
    : 0.0;
  printf "# %03d %.4f (%% %06.3f) (%% %06.3f) (%% %06.3f) %5.2f - %s %s\n",
    $count, $dist, $chainLinkMeasure, $chainSynMeasure, $chainRBestMeasure,
      $percentAlike, $orgName, $D;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./sizeStats.pl

    ./sizeStats.pl

# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure.
# the unlabeled column between rBestLink and sciName is the % ratio
# of recipBest to chainLink:
#    100.0 * (chainLink - rBestLink) / chainLink
#  N     dist  chainLink   synLink  rBestLink        sciName - accession/db
#
# 001 0.4253 (% 43.814) (% 12.216) (% 32.785) 25.17 - Caenorhabditis_sp._38_MB-2015 C_sp38_MB_2015
# 002 0.4357 (% 59.069) (% 37.574) (% 48.812) 17.36 - C. brenneri caePb3
# 003 0.4425 (% 40.332) (% 25.464) (% 36.514)  9.47 - C. sp. 5 ju800 caeSp51
# 004 0.4434 (% 39.417) (% 06.460) (% 27.675) 29.79 - C. angaria caeAng2
# 005 0.4466 (% 57.873) (% 39.138) (% 47.416) 18.07 - Caenorhabditis_latens C_latens
# 006 0.4480 (% 61.020) (% 40.374) (% 50.049) 17.98 - C. remanei caeRem4
# 007 0.4569 (% 15.004) (% 00.079) (% 12.189) 18.76 - Ancylostoma_duodenale
# 008 0.4576 (% 16.527) (% 00.580) (% 13.743) 16.85 - Ancylostoma_caninum
# 009 0.4606 (% 08.936) (% 00.180) (% 07.705) 13.78 - Barber pole worm haeCon2
# 010 0.4607 (% 14.484) (% 00.219) (% 11.872) 18.03 - Teladorsagia_circumcincta
# 011 0.4615 (% 55.203) (% 30.009) (% 43.133) 21.86 - C.
tropicalis caeSp111 # 012 0.4615 (% 41.026) (% 18.314) (% 31.414) 23.43 - Caenorhabditis_sp._31_LS-2015 C_sp31_LS_2015 # 013 0.4630 (% 03.444) (% 00.018) (% 02.414) 29.91 - Opisthorchis_viverrini # 014 0.4634 (% 14.994) (% 00.164) (% 12.019) 19.84 - Oesophagostomum_dentatum # 015 0.4638 (% 03.450) (% 00.007) (% 02.410) 30.14 - Clonorchis_sinensis # 016 0.4645 (% 11.502) (% 00.228) (% 09.476) 17.61 - Toxocara_canis # 017 0.4647 (% 14.406) (% 00.306) (% 11.734) 18.55 - Haemonchus_contortus # 018 0.4651 (% 05.722) (% 00.132) (% 04.869) 14.91 - Pig roundworm ascSuu1 # 019 0.4660 (% 00.832) (% 00.000) (% 00.495) 40.50 - Dicrocoelium_dendriticum # 020 0.4672 (% 13.175) (% 00.264) (% 10.946) 16.92 - Ascaris_suum # 021 0.4681 (% 05.601) (% 00.019) (% 04.196) 25.08 - Hymenolepis_microstoma # 022 0.4686 (% 53.670) (% 38.079) (% 44.232) 17.59 - Caenorhabditis_sp._34_TK-2017 C_sp34_TK_2017 # 023 0.4689 (% 11.720) (% 00.230) (% 09.718) 17.08 - Parascaris_univalens # 024 0.4715 (% 59.618) (% 37.200) (% 48.546) 18.57 - Caenorhabditis_sp._26_LS-2015 C_sp26_LS_2015 # 025 0.4715 (% 15.792) (% 00.540) (% 13.351) 15.46 - Nippostrongylus_brasiliensis # 026 0.4721 (% 15.964) (% 00.578) (% 13.279) 16.82 - A. ceylanicum ancCey1 # 027 0.4726 (% 24.970) (% 01.150) (% 20.772) 16.81 - Diploscapter_coronatus # 028 0.4730 (% 14.696) (% 00.369) (% 12.240) 16.71 - Heligmosomoides_polygyrus_bakeri # 029 0.4740 (% 03.502) (% 00.000) (% 02.468) 29.53 - Fasciola_gigantica # 030 0.4740 (% 62.531) (% 41.703) (% 51.026) 18.40 - Caenorhabditis_nigoni C_nigoni # 031 0.4742 (% 23.687) (% 00.814) (% 19.743) 16.65 - Diploscapter_pachys # 032 0.4747 (% 13.074) (% 00.292) (% 10.610) 18.85 - Pristionchus_arcanus # 033 0.4747 (% 60.930) (% 40.599) (% 49.316) 19.06 - Caenorhabditis_briggsae C_briggsae # 034 0.4747 (% 39.384) (% 31.995) (% 35.681) 9.40 - C. 
briggsae cb4 # 035 0.4752 (% 52.955) (% 36.731) (% 44.297) 16.35 - Caenorhabditis_sp._40_LS-2015 C_sp40_LS_2015 # 036 0.4758 (% 15.754) (% 00.618) (% 12.952) 17.79 - Necator_americanus # 037 0.4758 (% 08.902) (% 00.270) (% 07.682) 13.70 - N. americanus necAme1 # 038 0.4762 (% 02.894) (% 00.000) (% 02.011) 30.51 - Spirometra_erinaceieuropaei # 039 0.4786 (% 04.266) (% 00.019) (% 03.130) 26.63 - Fasciola_hepatica # 040 0.4788 (% 17.480) (% 00.631) (% 14.549) 16.77 - Dictyocaulus_viviparus # 041 0.4830 (% 08.805) (% 00.022) (% 06.772) 23.09 - Romanomermis_culicivorax # 042 0.4859 (% 06.222) (% 00.068) (% 05.303) 14.77 - P. exspectatus priExs1 # 043 0.4859 (% 12.342) (% 00.199) (% 10.056) 18.52 - Pristionchus_exspectatus # 044 0.4863 (% 15.278) (% 00.137) (% 11.986) 21.55 - Oscheius_sp._TEL-2014 Oscheius_TEL_2014 # 045 0.4889 (% 16.319) (% 00.084) (% 12.626) 22.63 - Setaria_digitata # 046 0.4902 (% 11.406) (% 00.118) (% 09.141) 19.86 - Plectus_sambesii # 047 0.4959 (% 12.624) (% 00.190) (% 10.610) 15.95 - Angiostrongylus_cantonensis # 048 0.4969 (% 14.600) (% 00.179) (% 11.254) 22.92 - Setaria_equina # 049 0.4980 (% 16.698) (% 00.276) (% 12.631) 24.36 - Loa_loa # 050 0.4980 (% 05.056) (% 00.107) (% 04.254) 15.86 - Eye worm loaLoa1 # 051 0.4984 (% 14.413) (% 00.233) (% 11.306) 21.56 - Ditylenchus_destructor # 052 0.4988 (% 12.602) (% 00.312) (% 10.270) 18.50 - Pristionchus_entomophagus # 053 0.5006 (% 45.308) (% 20.405) (% 36.074) 20.38 - C. japonica caeJap4 # 054 0.5007 (% 15.520) (% 00.269) (% 11.600) 25.26 - Onchocerca_flexuosa # 055 0.5010 (% 06.308) (% 00.187) (% 05.354) 15.12 - P. 
pacificus priPac3 # 056 0.5013 (% 03.234) (% 00.019) (% 02.304) 28.76 - Echinococcus_granulosus # 057 0.5028 (% 19.777) (% 00.117) (% 14.787) 25.23 - Onchocerca_ochengi # 058 0.5038 (% 03.457) (% 00.018) (% 02.482) 28.20 - Taenia_saginata # 059 0.5038 (% 03.268) (% 00.012) (% 02.310) 29.31 - Taenia_solium # 060 0.5040 (% 03.383) (% 00.019) (% 02.395) 29.20 - Echinococcus_canadensis # 061 0.5042 (% 03.396) (% 00.018) (% 02.434) 28.33 - Taenia_asiatica # 062 0.5042 (% 12.880) (% 00.310) (% 10.474) 18.68 - Pristionchus_maxplancki # 063 0.5042 (% 23.081) (% 01.821) (% 18.534) 19.70 - Caenorhabditis_sp._21_LS-2015 C_sp21_LS_2015 # 064 0.5067 (% 05.193) (% 00.151) (% 04.375) 15.75 - O. volvulus oncVol1 # 065 0.5071 (% 25.170) (% 00.609) (% 18.668) 25.83 - Onchocerca_volvulus # 066 0.5074 (% 12.987) (% 00.445) (% 10.449) 19.54 - Pristionchus_pacificus # 067 0.5088 (% 03.403) (% 00.019) (% 02.440) 28.30 - Echinococcus_multilocularis # 068 0.5098 (% 44.315) (% 17.195) (% 35.657) 19.54 - Caenorhabditis_sp._39_LS-2015 C_sp39_LS_2015 # 069 0.5136 (% 20.673) (% 00.156) (% 15.260) 26.18 - Wuchereria_bancrofti # 070 0.5143 (% 10.486) (% 00.600) (% 09.313) 11.19 - H. 
bacteriophora/m31e hetBac1 # 071 0.5156 (% 19.944) (% 00.441) (% 14.559) 27.00 - Brugia_pahangi # 072 0.5165 (% 12.825) (% 00.184) (% 09.853) 23.17 - Steinernema_monticolum # 073 0.5170 (% 13.379) (% 00.020) (% 10.156) 24.09 - Macrostomum_lignano # 074 0.5183 (% 07.825) (% 00.000) (% 06.305) 19.42 - Schistosoma_japonicum # 075 0.5185 (% 15.865) (% 00.344) (% 12.506) 21.17 - Parapristionchus_giblindavisi # 076 0.5190 (% 20.305) (% 00.186) (% 15.022) 26.02 - Dirofilaria_immitis # 077 0.5190 (% 05.005) (% 00.129) (% 04.188) 16.32 - Dog heartworm dirImm1 # 078 0.5205 (% 09.723) (% 00.007) (% 07.927) 18.47 - Schistosoma_haematobium # 079 0.5217 (% 18.703) (% 00.294) (% 13.109) 29.91 - Rhabditophanes_sp._KR3021 Rhabditophanes_KR3021 # 080 0.5257 (% 11.415) (% 00.065) (% 07.962) 30.25 - Trichinella_papuae # 081 0.5262 (% 10.699) (% 00.051) (% 07.442) 30.44 - Trichinella_nativa # 082 0.5263 (% 10.754) (% 00.051) (% 07.465) 30.58 - Trichinella_nelsoni # 083 0.5272 (% 37.989) (% 00.013) (% 30.954) 18.52 - Girardia_tigrina # 084 0.5277 (% 16.345) (% 00.160) (% 11.694) 28.46 - Rotylenchulus_reniformis # 085 0.5287 (% 10.902) (% 00.051) (% 07.665) 29.69 - Trichinella_sp._T8 Trichinella_T8 # 086 0.5292 (% 05.065) (% 00.102) (% 04.259) 15.91 - Filarial worm bruMal2 # 087 0.5295 (% 10.328) (% 00.018) (% 08.457) 18.12 - Schistosoma_mansoni # 088 0.5300 (% 10.885) (% 00.063) (% 07.626) 29.94 - Trichinella_sp._T9 Trichinella_T9 # 089 0.5304 (% 06.686) (% 00.020) (% 04.946) 26.02 - Gyrodactylus_salaris # 090 0.5314 (% 11.462) (% 00.066) (% 07.980) 30.38 - Trichinella_pseudospiralis # 091 0.5324 (% 10.836) (% 00.052) (% 07.561) 30.22 - Trichinella_patagoniensis # 092 0.5331 (% 02.006) (% 00.000) (% 01.536) 23.43 - C. 
intestinalis ci3 # 093 0.5335 (% 11.008) (% 00.066) (% 07.671) 30.31 - Trichinella_sp._T6 Trichinella_T6 # 094 0.5345 (% 02.848) (% 00.007) (% 02.239) 21.38 - Trichinella triSpi1 # 095 0.5346 (% 10.697) (% 00.052) (% 07.591) 29.04 - Trichinella_spiralis # 096 0.5350 (% 11.705) (% 00.054) (% 08.203) 29.92 - Trichinella_zimbabwensis # 097 0.5355 (% 12.640) (% 00.334) (% 09.947) 21.31 - Steinernema_carpocapsae # 098 0.5361 (% 10.944) (% 00.063) (% 07.639) 30.20 - Trichinella_britovi # 099 0.5393 (% 16.763) (% 00.185) (% 12.386) 26.11 - Brugia_malayi # 100 0.5453 (% 06.137) (% 00.064) (% 04.468) 27.20 - Trichuris_trichiura # 101 0.5466 (% 06.291) (% 00.145) (% 05.125) 18.53 - Pine wood nematode burXyl1 # 102 0.5466 (% 13.470) (% 00.169) (% 09.909) 26.44 - Bursaphelenchus_xylophilus # 103 0.5474 (% 16.438) (% 01.183) (% 13.268) 19.28 - Oscheius_tipulae # 104 0.5477 (% 03.712) (% 00.018) (% 02.756) 25.75 - Taenia_multiceps # 105 0.5478 (% 10.278) (% 00.197) (% 08.144) 20.76 - Steinernema_glaseri # 106 0.5480 (% 06.585) (% 00.136) (% 05.461) 17.07 - Microworm panRed1 # 107 0.5494 (% 39.093) (% 00.095) (% 31.402) 19.67 - Schmidtea_mediterranea # 108 0.5507 (% 10.269) (% 00.081) (% 07.827) 23.78 - Globodera_pallida # 109 0.5516 (% 11.885) (% 00.259) (% 09.311) 21.66 - Steinernema_feltiae # 110 0.5526 (% 17.785) (% 00.000) (% 14.029) 21.12 - Dugesia_japonica # 111 0.5532 (% 11.549) (% 00.085) (% 08.872) 23.18 - Subanguina_moxae # 112 0.5547 (% 11.318) (% 00.138) (% 08.695) 23.18 - Globodera_ellingtonae # 113 0.5551 (% 11.987) (% 00.319) (% 09.412) 21.48 - Steinernema_scapterisci # 114 0.5561 (% 10.968) (% 00.132) (% 08.386) 23.54 - Globodera_rostochiensis # 115 0.5565 (% 12.683) (% 00.000) (% 08.999) 29.05 - Meloidogyne_floridensis # 116 0.5593 (% 38.533) (% 24.831) (% 31.789) 17.50 - Caenorhabditis_sp._32_LS-2015 C_sp32_LS_2015 # 117 0.5613 (% 02.917) (% 00.036) (% 02.285) 21.67 - Whipworm triSui1 # 118 0.5615 (% 26.293) (% 00.097) (% 19.830) 24.58 - Meloidogyne_javanica # 
119 0.5633 (% 03.516) (% 00.090) (% 02.816) 19.91 - M. incognita melInc2 # 120 0.5634 (% 26.760) (% 00.186) (% 19.868) 25.75 - Meloidogyne_incognita # 121 0.5653 (% 31.839) (% 00.889) (% 19.485) 38.80 - Strongyloides_venezuelensis # 122 0.5653 (% 31.051) (% 00.685) (% 19.457) 37.34 - Strongyloides_papillosus # 123 0.5662 (% 28.310) (% 00.661) (% 17.250) 39.07 - Parastrongyloides_trichosuri # 124 0.5689 (% 06.427) (% 00.000) (% 04.011) 37.59 - Oscheius_sp._MCB Oscheius_MCB # 125 0.5690 (% 05.651) (% 00.066) (% 04.144) 26.67 - Trichuris_muris # 126 0.5694 (% 11.930) (% 00.052) (% 08.389) 29.68 - Trichinella_murrelli # 127 0.5714 (% 03.916) (% 00.074) (% 03.186) 18.64 - M. hapla melHap1 # 128 0.5744 (% 31.303) (% 00.285) (% 23.952) 23.48 - Meloidogyne_arenaria # 129 0.5789 (% 07.254) (% 00.013) (% 05.201) 28.30 - Heterodera_glycines # 130 0.5932 (% 17.004) (% 00.081) (% 10.827) 36.33 - Meloidogyne_graminicola # 131 0.5953 (% 35.984) (% 01.356) (% 20.257) 43.71 - Strongyloides_stercoralis # 132 0.5981 (% 36.164) (% 01.850) (% 20.838) 42.38 - Threadworm strRat2 # 133 0.6420 (% 23.468) (% 00.240) (% 18.204) 22.43 - Acrobeloides_nanus # 134 0.7045 (% 00.158) (% 00.000) (% 00.090) 43.04 - Elaeophora_elaphi # None of this concern for distances matters in building the first step, the # maf files. The distances will be better calibrated later. 
# create species list and stripped down tree for autoMZ sed -e 's/,//g; s/:[0-9]\+.[0-9]\+//g; s/;//g;' staAur2.369way.nh \ | xargs echo > tree.nh cat tree.nh | fold -s -w 78 | sed -e 's/^/# /;' # ((((((((((((((((((((((((((staAur2 (Sa_USA300_SUR12 (Sa_USA300_TCH1516 # ((Sa_JE2 Sa_C2406) (Sa_USA300_NRS384 (Sa_UA_S391_USA300 ((Sa_CIT Sa_OXLIM) # (Sa_33b (Sa_26b_MRSA (Sa_27b_MRSA (Sa_29b_MRSA (Sa_31b_MRSA # Sa_25b_MRSA)))))))))))) Sa_CAR) Sa_AR_0216) Sa_USA300_ISMMS1) # ((((((Sa_USA300_SUR9 Sa_USA300_SUR10) Sa_USA300_SUR4) Sa_USA300_SUR8) # Sa_USA300_SUR11) (((((Sa_USA300_SUR13 Sa_USA300_SUR21) Sa_USA300_SUR3) # Sa_USA300_SUR6) (((((Sa_USA300_SUR14 Sa_USA300_SUR19) Sa_USA300_SUR16) # Sa_USA300_SUR17) Sa_USA300_SUR18) Sa_USA300_SUR7)) ((Sa_USA300_SUR20 # Sa_USA300_SUR22) Sa_USA300_SUR23))) (Sa_USA300_SUR15 Sa_USA300_SUR24))) # (Sa_USA300_2014C01 (Sa_USA300_SUR1_a (Sa_USA300_SUR1_b Sa_USA300_SUR2)))) # Sa_UTSW_MRSA_55) (((Sa_TCH_959 Sa_CA15) Sa_HUV05) (Sa_M121 Sa_CA12))) # Sa_V2200) ((((((Sa_Newman Sa_Newman_NYU_Newman) Sa_Newman_D2C) # Sa_Sa_Newman_UoM) ((Sa_COL ((((Sa_NCTC8325_2006 Sa_HG001_RN1) # Sa_NCTC8325_2018) ((((Sa_VC40 Sa_NRS146) (Sa_FDAARGOS_6 Sa_NRS107)) # Sa_FDAARGOS_15) Sa_DSM_20231)) ((Sa_FDAARGOS_412 ((Sa_54 Sa_MSSA54) Sa_164)) # Sa_EDCC5458))) (Sa_28 Sa_82))) Sa_JH4899) (((Sa_M1 Sa_Clinical) Sa_NCTC13395) # Sa_NCTC13394))) (Sa_199 Sa_64)) (Sa_2395_USA500 Sa_SVH7513)) ((Sa_165 Sa_546) # Sa_545)) (((((((Sa_TW20 Sa_CMRSA_6) Sa_V521) Sa_M92) Sa_Z172) Sa_CMRSA_3) # (((Sa_JKD6008 (Sa_T0131 (Sa_WCH_SK2 Sa_NCTC11940))) Sa_NCTC9944) ((Sa_Gv51 # ((Sa_Be62 Sa_Gv88) Sa_HC1335)) Sa_HC1340))) Sa_NMR08)) (((((((((Sa_MW2_2004 # Sa_USA400_0051) Sa_JMUB3031) (((Sa_MSSA476 Sa_FORC_026) (Sa_No10 # Sa_CFSAN007851)) Sa_FORC_045)) (Sa_FORC59 Sa_JMUB1273)) (((((Sa_ST20130940 # Sa_ST20130941) (Sa_45 Sa_135)) (Sa_ST20130938 Sa_ST20130939)) # Sa_S_aureus_171) Sa_121560027_1)) (Sa_11819_97 (Sa_GR2 Sa_NCTC13435))) # ((((((((Sa_CN1 Sa_FORC_040) (Sa_TMUS2126 Sa_TMUS2134)) 
((Sa_FORC_012 # Sa_E16SA093) Sa_F17SA003)) Sa_FORC_061) Sa_08_02300) Sa_NCTC13137) (((Sa_RKI4 # Sa_CFSAN018750) Sa_NCTC6136) Sa_BA01611)) (Sa_ST20130942 Sa_ST20130943))) # Sa_AUS0325) (((((Sa_FDA209P Sa_ATCC_6538) Sa_NCTC10344) Sa_NCTC9752) # Sa_NCTC3761) Sa_MOK042))) (((((((((((((Sa_N315 (Sa_ECT_R_2 Sa_CC5)) # (((Sa_04_02981 Sa_AR_0220) Sa_CFBR_105) Sa_UCI_28_ST5)) (((Sa_FCFHV36 # Sa_ZJ5499) (Sa_AR461 Sa_AR_0468)) Sa_MI)) Sa_UCI62) ((Sa_JH9 Sa_JH1) # Sa_AR_474)) (Sa_SA112 Sa_CR14_035)) (((((((Sa_ST228_16035 (Sa_ST228_10388 # Sa_ST228_10497)) Sa_ST228_18341) Sa_ST228_15532) (Sa_ST228_18583 # Sa_ST228_18412)) Sa_ST228_16125) (((((Sa_NRS149 Sa_CFSAN007894) (Sa_SA564 # Sa_CFSAN007850)) (Sa_RIVM6519 (Sa_AR462 Sa_CHU15_056))) Sa_FDAARGOS_159) # Sa_NZAK3)) (Sa_FDAARGOS_48 Sa_FDAARGOS_1))) ((((Sa_Mu50 Sa_AR_0219) Sa_Mu3) # (Sa_FORC_027 Sa_FORC_062)) (Sa_NCCP14558 Sa_NCCP14562))) Sa_HPV107) # Sa_ISU935_ST5) Sa_HOU1444_VR) ((Sa_ED98 Sa_ch21) Sa_ch22)) # Sa_ST772_MRSA_V_DAR4145)) (((Sa_HO_5096_0412 Sa_H_EMRSA_15) Sa_EDCC5464) # (Sa_IT1_S Sa_IT4_R))) (((((Sa_RF122 Sa_NCTC7485) ((((Sa_O46 Sa_O326) # Sa_CFSAN064037) ((Sa_ED133 Sa_O17) Sa_LGA251)) (((Sa_NRS153 (Sa_422 Sa_128)) # (Sa_27 Sa_78)) Sa_XQ))) Sa_NCTC5663) (((((((Sa_M013 Sa_SA957) Sa_HZW450) # Sa_SA268) (Sa_SA40 Sa_SA40TW)) Sa_CFSAN007883) Sa_MS4) (Sa_6850 (Sa_GN3 # Sa_GN1)))) Sa_JKD6159)) (((((((((Sa_ST398_S0385 (Sa_08BA02176 ((Sa_RIVM1295 # ((Sa_RIVM1607 Sa_08S00974) Sa_10_5235)) (Sa_LA_MRSA_ST398_E154 # Sa_ISU926_ST398)))) (Sa_16_LA_309 Sa_12_LA_293)) (Sa_8_LA_272 (Sa_2_LA_86 # Sa_19_LA_388))) ((((Sa_71193 Sa_293G) Sa_GD5) (Sa_GD705 Sa_GD1539)) # Sa_GD1677)) Sa_PTDrAP2) Sa_17_LA_343) Sa_RIVM3897) ((((((((Sa_MRSA252 # Sa_NCTC13277) (((Sa_FORC_001 Sa_0201753_2) (Sa_CFSAN007847 (((Sa_468 Sa_143) # Sa_466) (Sa_277 Sa_85)))) Sa_CFSAN007896)) ((((Sa_ATCC_25923 # Sa_Seattle_1945_G477) Sa_Seattle_1945_G478) Sa_80wphwpl_v1) Sa_NRS484)) # (Sa_55_2053 Sa_13420)) Sa_FDAARGOS_504) (Sa_SJTUF_J27 Sa_CFSAN018749)) # Sa_AR_0470) 
Sa_ILRI_Eymole1_1)) (((Sa_CA_347 ((((Sa_MCRF184 (Sa_1549_WT # (Sa_1549_REV Sa_1549_SCV))) Sa_CFSAN007835) Sa_AR464) Sa_AR466)) Sa_NCTC8726) # Sa_JS395))) (Sa_55_99_44 Sa_BB155)) (((Sa_Bmb9393 ((Sa_FDAARGOS_10 # Sa_NCTC13140) Sa_OC8)) (((Sa_NRS1 Sa_AR_0228) ((Sa_61 Sa_191) Sa_628)) # Sa_FORC_039)) ((Sa_Gv69 Sa_MRSA107) Sa_MOK063))) ((Sa_FDAARGOS_40 (Sa_187 # Sa_629)) (Sa_K18 Sa_K5))) Sa_5_3949) (((((Sa_TCH60 (Sa_MN8 Sa_NRS143)) # (((Sa_CFSAN064038 (((((Sa_3_LA_115 Sa_21_LA_436) (Sa_13_LA_301 Sa_14_5418)) # Sa_7_4623) Sa_20_LA_415) Sa_9_LA_281)) (Sa_1_1439 ((Sa_6_LA_232 Sa_4_LA_208) # Sa_15_LA_305))) Sa_22_LA_562)) Sa_AR_0471) ((Sa_Tager_104 Sa_AR_475) # ((((((((Sa_502A_RN6607 Sa_AR_0222) Sa_080880048_3) Sa_NRS70) (((Sa_AR465 # Sa_AR_0467) Sa_AR_0215) Sa_AR_0469)) ((((((Sa_FDAARGOS_5 (Sa_NRS137 # Sa_NRS133)) (((((((Sa_USA300_2014C02 Sa_1969N) Sa_3020C01) (Sa_1971C01 # (Sa_1625CO1 Sa_2148C01))) Sa_FDAARGOS_140) Sa_AR_0225) Sa_AR_0226) (Sa_5118N # Sa_AR_0223))) Sa_NRS120) ((Sa_FDAARGOS_2 (Sa_FDAARGOS_43 Sa_Mw2_2018)) # (Sa_08_02119 (Sa_24117_WT (Sa_24117_REV Sa_24117_SCV))))) ((Sa_2148N # (Sa_AR_0472 Sa_AR_0473)) (Sa_101110051_1 Sa_101110041_3))) Sa_SR434)) # (Sa_NX_T55 Sa_QD_CD9)) Sa_NRS271) Sa_MOZ66))) Sa_K17)) Bacillus_subtilis_168) # (((((E_coli_K_12_MG1655 E_coli_O157_H7_Sakai_RIMD_0509952) E_coli_UMN026) # E_coli_O83_H1_NRG_857C) E_coli_IAI39) E_coli_O104_H4_2011C_3493)) sed -e 's/:.*//; s/(//g; s/ *//;' staAur2.369way.nh \ | xargs echo > species.list cat species.list | fold -s -w 78 | sed -e 's/^/# /;' # staAur2 Sa_USA300_SUR12 Sa_USA300_TCH1516 Sa_JE2 Sa_C2406 Sa_USA300_NRS384 # Sa_UA_S391_USA300 Sa_CIT Sa_OXLIM Sa_33b Sa_26b_MRSA Sa_27b_MRSA Sa_29b_MRSA # Sa_31b_MRSA Sa_25b_MRSA Sa_CAR Sa_AR_0216 Sa_USA300_ISMMS1 Sa_USA300_SUR9 # Sa_USA300_SUR10 Sa_USA300_SUR4 Sa_USA300_SUR8 Sa_USA300_SUR11 Sa_USA300_SUR13 # Sa_USA300_SUR21 Sa_USA300_SUR3 Sa_USA300_SUR6 Sa_USA300_SUR14 Sa_USA300_SUR19 # Sa_USA300_SUR16 Sa_USA300_SUR17 Sa_USA300_SUR18 Sa_USA300_SUR7 # 
Sa_USA300_SUR20 Sa_USA300_SUR22 Sa_USA300_SUR23 Sa_USA300_SUR15 # Sa_USA300_SUR24 Sa_USA300_2014C01 Sa_USA300_SUR1_a Sa_USA300_SUR1_b # Sa_USA300_SUR2 Sa_UTSW_MRSA_55 Sa_TCH_959 Sa_CA15 Sa_HUV05 Sa_M121 Sa_CA12 # Sa_V2200 Sa_Newman Sa_Newman_NYU_Newman Sa_Newman_D2C Sa_Sa_Newman_UoM Sa_COL # Sa_NCTC8325_2006 Sa_HG001_RN1 Sa_NCTC8325_2018 Sa_VC40 Sa_NRS146 # Sa_FDAARGOS_6 Sa_NRS107 Sa_FDAARGOS_15 Sa_DSM_20231 Sa_FDAARGOS_412 Sa_54 # Sa_MSSA54 Sa_164 Sa_EDCC5458 Sa_28 Sa_82 Sa_JH4899 Sa_M1 Sa_Clinical # Sa_NCTC13395 Sa_NCTC13394 Sa_199 Sa_64 Sa_2395_USA500 Sa_SVH7513 Sa_165 # Sa_546 Sa_545 Sa_TW20 Sa_CMRSA_6 Sa_V521 Sa_M92 Sa_Z172 Sa_CMRSA_3 Sa_JKD6008 # Sa_T0131 Sa_WCH_SK2 Sa_NCTC11940 Sa_NCTC9944 Sa_Gv51 Sa_Be62 Sa_Gv88 # Sa_HC1335 Sa_HC1340 Sa_NMR08 Sa_MW2_2004 Sa_USA400_0051 Sa_JMUB3031 # Sa_MSSA476 Sa_FORC_026 Sa_No10 Sa_CFSAN007851 Sa_FORC_045 Sa_FORC59 # Sa_JMUB1273 Sa_ST20130940 Sa_ST20130941 Sa_45 Sa_135 Sa_ST20130938 # Sa_ST20130939 Sa_S_aureus_171 Sa_121560027_1 Sa_11819_97 Sa_GR2 Sa_NCTC13435 # Sa_CN1 Sa_FORC_040 Sa_TMUS2126 Sa_TMUS2134 Sa_FORC_012 Sa_E16SA093 # Sa_F17SA003 Sa_FORC_061 Sa_08_02300 Sa_NCTC13137 Sa_RKI4 Sa_CFSAN018750 # Sa_NCTC6136 Sa_BA01611 Sa_ST20130942 Sa_ST20130943 Sa_AUS0325 Sa_FDA209P # Sa_ATCC_6538 Sa_NCTC10344 Sa_NCTC9752 Sa_NCTC3761 Sa_MOK042 Sa_N315 # Sa_ECT_R_2 Sa_CC5 Sa_04_02981 Sa_AR_0220 Sa_CFBR_105 Sa_UCI_28_ST5 Sa_FCFHV36 # Sa_ZJ5499 Sa_AR461 Sa_AR_0468 Sa_MI Sa_UCI62 Sa_JH9 Sa_JH1 Sa_AR_474 Sa_SA112 # Sa_CR14_035 Sa_ST228_16035 Sa_ST228_10388 Sa_ST228_10497 Sa_ST228_18341 # Sa_ST228_15532 Sa_ST228_18583 Sa_ST228_18412 Sa_ST228_16125 Sa_NRS149 # Sa_CFSAN007894 Sa_SA564 Sa_CFSAN007850 Sa_RIVM6519 Sa_AR462 Sa_CHU15_056 # Sa_FDAARGOS_159 Sa_NZAK3 Sa_FDAARGOS_48 Sa_FDAARGOS_1 Sa_Mu50 Sa_AR_0219 # Sa_Mu3 Sa_FORC_027 Sa_FORC_062 Sa_NCCP14558 Sa_NCCP14562 Sa_HPV107 # Sa_ISU935_ST5 Sa_HOU1444_VR Sa_ED98 Sa_ch21 Sa_ch22 Sa_ST772_MRSA_V_DAR4145 # Sa_HO_5096_0412 Sa_H_EMRSA_15 Sa_EDCC5464 Sa_IT1_S Sa_IT4_R Sa_RF122 # Sa_NCTC7485 
Sa_O46 Sa_O326 Sa_CFSAN064037 Sa_ED133 Sa_O17 Sa_LGA251 Sa_NRS153 # Sa_422 Sa_128 Sa_27 Sa_78 Sa_XQ Sa_NCTC5663 Sa_M013 Sa_SA957 Sa_HZW450 # Sa_SA268 Sa_SA40 Sa_SA40TW Sa_CFSAN007883 Sa_MS4 Sa_6850 Sa_GN3 Sa_GN1 # Sa_JKD6159 Sa_ST398_S0385 Sa_08BA02176 Sa_RIVM1295 Sa_RIVM1607 Sa_08S00974 # Sa_10_5235 Sa_LA_MRSA_ST398_E154 Sa_ISU926_ST398 Sa_16_LA_309 Sa_12_LA_293 # Sa_8_LA_272 Sa_2_LA_86 Sa_19_LA_388 Sa_71193 Sa_293G Sa_GD5 Sa_GD705 # Sa_GD1539 Sa_GD1677 Sa_PTDrAP2 Sa_17_LA_343 Sa_RIVM3897 Sa_MRSA252 # Sa_NCTC13277 Sa_FORC_001 Sa_0201753_2 Sa_CFSAN007847 Sa_468 Sa_143 Sa_466 # Sa_277 Sa_85 Sa_CFSAN007896 Sa_ATCC_25923 Sa_Seattle_1945_G477 # Sa_Seattle_1945_G478 Sa_80wphwpl_v1 Sa_NRS484 Sa_55_2053 Sa_13420 # Sa_FDAARGOS_504 Sa_SJTUF_J27 Sa_CFSAN018749 Sa_AR_0470 Sa_ILRI_Eymole1_1 # Sa_CA_347 Sa_MCRF184 Sa_1549_WT Sa_1549_REV Sa_1549_SCV Sa_CFSAN007835 # Sa_AR464 Sa_AR466 Sa_NCTC8726 Sa_JS395 Sa_55_99_44 Sa_BB155 Sa_Bmb9393 # Sa_FDAARGOS_10 Sa_NCTC13140 Sa_OC8 Sa_NRS1 Sa_AR_0228 Sa_61 Sa_191 Sa_628 # Sa_FORC_039 Sa_Gv69 Sa_MRSA107 Sa_MOK063 Sa_FDAARGOS_40 Sa_187 Sa_629 Sa_K18 # Sa_K5 Sa_5_3949 Sa_TCH60 Sa_MN8 Sa_NRS143 Sa_CFSAN064038 Sa_3_LA_115 # Sa_21_LA_436 Sa_13_LA_301 Sa_14_5418 Sa_7_4623 Sa_20_LA_415 Sa_9_LA_281 # Sa_1_1439 Sa_6_LA_232 Sa_4_LA_208 Sa_15_LA_305 Sa_22_LA_562 Sa_AR_0471 # Sa_Tager_104 Sa_AR_475 Sa_502A_RN6607 Sa_AR_0222 Sa_080880048_3 Sa_NRS70 # Sa_AR465 Sa_AR_0467 Sa_AR_0215 Sa_AR_0469 Sa_FDAARGOS_5 Sa_NRS137 Sa_NRS133 # Sa_USA300_2014C02 Sa_1969N Sa_3020C01 Sa_1971C01 Sa_1625CO1 Sa_2148C01 # Sa_FDAARGOS_140 Sa_AR_0225 Sa_AR_0226 Sa_5118N Sa_AR_0223 Sa_NRS120 # Sa_FDAARGOS_2 Sa_FDAARGOS_43 Sa_Mw2_2018 Sa_08_02119 Sa_24117_WT Sa_24117_REV # Sa_24117_SCV Sa_2148N Sa_AR_0472 Sa_AR_0473 Sa_101110051_1 Sa_101110041_3 # Sa_SR434 Sa_NX_T55 Sa_QD_CD9 Sa_NRS271 Sa_MOZ66 Sa_K17 Bacillus_subtilis_168 # E_coli_K_12_MG1655 E_coli_O157_H7_Sakai_RIMD_0509952 E_coli_UMN026 # E_coli_O83_H1_NRG_857C E_coli_IAI39 E_coli_O104_H4_2011C_3493 # do not need to take any 
kind of N50 survey, going to use 'mafNet' # alignment for everything # the usual rules for selection: # 1. when n50Size over 500,000 and percentN under 5 -> syntenic net # 2. otherwise, use chainNet # are woefully inadequate for this purpose. All the alignments # are similar. The 'mafNet' alignment has the highest numbers awk -F$'\t' '$4 > 500000' allSummary.tab | awk '$5 < 5' \ | cut -f7 | awk -F'.' '{print $1}' > syntenicList.txt awk -F$'\t' '$4 <= 500000 || $5 >= 5' allSummary.tab \ | cut -f7 | awk -F'.' '{print $1}' > chainNetList.txt awk '{printf "%s\t%s\t%s\n", $NF,$5,$7}' ../sizeStats.txt \ | sed -e 's/)//g;' | sort -k3,3nr | sort \ | join -t$'\t' - <(sort syntenicList.txt) | sed -e 's/^/# /;' # bash shell syntax here ... cd /hive/data/genomes/staAur2/bed/multiz369way mkdir mafLinks export H="/hive/data/genomes/staAur2/bed" export TOP="/hive/data/genomes/staAur2/bed/multiz369way" # ls ../lastzRuns/lastzGCF_003194185v1/mafNet # NC_007790v1.maf.gz NC_007791v1.maf.gz NC_007792v1.maf.gz NC_007793v1.maf.gz sed -e 's/staAur2 //;' species.list | sed -e 's/ /\n/g' | while read specName do cd "${TOP}" GC=`grep -w "${specName}" shortName.to.GCFname.tab | cut -f2` # Sa_0201753_2 GCF_003194185v1 mafNet="../lastzRuns/lastz${GC}/mafNet" echo "$specName" 1>&2 mkdir -p "mafLinks/${specName}" cd mafLinks/${specName} for M in ../../$mafNet/*.maf.gz do B=`basename $M` zcat "${M}" | sed -e "s/$GC/$specName/;" | gzip -c > ${B} done done # verify the symLinks are good: find ./mafLinks -type f | sed -e 's#.*/##;' | sort | uniq -c # 260 NC_007790v1.maf.gz # 265 NC_007791v1.maf.gz # 365 NC_007792v1.maf.gz # 368 NC_007793v1.maf.gz # not all of them match in the plastids # the fullChromRun is going to take forever, split them up at # approximately 100,000 boundaries in the largest gap between # genes # run a kluster job to split them all ssh ku cd /hive/data/genomes/staAur2/bed/multiz369way/mafSplit printf ' #!/bin/csh -ef set G = $1 set M = $2 mkdir -p $G pushd $G > /dev/null if ( -s 
staAur2_${M}.00.maf ) then /bin/rm -f staAur2_${M}.*.maf endif /cluster/bin/x86_64/mafSplit ../mafSplit.bed staAur2_ ../../mafLinks/${G}/${M}.maf.gz /bin/gzip staAur2_${M}.*.maf popd > /dev/null ' > runOne # << happy emacs chmod +x runOne printf '#LOOP runOne $(dir1) $(file1) {check out exists+ $(dir1)/staAur2_$(file1).00.maf.gz} #ENDLOOP ' > template find ../mafLinks -type l | awk -F'/' '{printf "%s/%s\n", $3,$4}' \ | sed -e 's/.maf.gz//;' > maf.list gensub2 maf.list single template jobList para -ram=16g create jobList para try ... check ... push ... etc... # Completed: 1258 of 1258 jobs # CPU time in finished jobs: 729s 12.15m 0.20h 0.01d 0.000 y # IO & Wait Time: 5160s 86.00m 1.43h 0.06d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 8s 0.13m 0.00h 0.00d # Submission to last job: 487s 8.12m 0.14h 0.01d # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | grep "maf.gz" | wc -l # 11577 find . -type f | grep ".maf.gz$" | xargs -L 1 basename | sort -u \ > run.maf.list wc -l run.maf.list # 33 run.maf.list # number of chroms with data: awk -F'.' 
'{print $1}' run.maf.list | sed -e 's/staAur2_//;' \ | sort | uniq -c | sort -n | wc -l # 4 mkdir /hive/data/genomes/staAur2/bed/multiz369way/splitRun cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = staAur2 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/staAur2/bed/multiz369way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /dev/shm/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh printf '#LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/staAur2/bed/multiz369way/splitRun/maf/$(root1)} #ENDLOOP ' > template ln -s ../../mafSplit/run.maf.list maf.list ssh ku cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun/run gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc... 
# Completed: 33 of 33 jobs # CPU time in finished jobs: 665856s 11097.59m 184.96h 7.71d 0.021 y # IO & Wait Time: 214s 3.57m 0.06h 0.00d 0.000 y # Average job time: 20184s 336.40m 5.61h 0.23d # Longest finished job: 29328s 488.80m 8.15h 0.34d # Submission to last job: 29328s 488.80m 8.15h 0.34d # put the split maf results back together into a single per-chrom maf file # eliminate duplicate comments ssh hgwdev cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun mkdir ../maf # no need to save the comments since they are lost with mafAddIRows cat << '_EOF_' >> runOne #!/bin/csh -fe set C = $1 if ( -s ../maf/${C}.maf.gz ) then rm -f ../maf/${C}.maf.gz endif if ( -s maf/staAur2_${C}.00.maf ) then head -q -n 1 maf/staAur2_${C}.00.maf | sort -u > ../maf/${C}.maf grep -h -v "^#" `ls maf/staAur2_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf tail -q -n 1 maf/staAur2_${C}.00.maf | sort -u >> ../maf/${C}.maf else touch ../maf/${C}.maf endif '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' >> template #LOOP runOne $(root1) {check out exists ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 ../../../chrom.sizes > chr.list ssh ku cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun gensub2 chr.list single template jobList para -ram=16g create jobList # only 4 jobs, they run with the 'para try' para try ... check ... push ... etc ... 
para -maxJob=32 push # Completed: 4 of 4 jobs # CPU time in finished jobs: 11s 0.18m 0.00h 0.00d 0.000 y # IO & Wait Time: 9s 0.16m 0.00h 0.00d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 14s 0.23m 0.00h 0.00d # Submission to last job: 21s 0.35m 0.01h 0.00d cd /hive/data/genomes/staAur2/bed/multiz369way/maf # the four results: # -rw-rw-r-- 1 1039254 Dec 29 10:20 NC_007791v1.maf # -rw-rw-r-- 1 8427476 Dec 29 10:20 NC_007792v1.maf # -rw-rw-r-- 1 2682382 Dec 29 10:20 NC_007790v1.maf # -rw-rw-r-- 1 1716754011 Dec 29 10:20 NC_007793v1.maf # Load into database mkdir -p /gbdb/staAur2/multiz369way/maf cd /hive/data/genomes/staAur2/bed/multiz369way/maf ln -s `pwd`/*.maf /gbdb/staAur2/multiz369way/maf/ # this generates an immense multiz369way.tab file in the directory # where it is running. Best to run this over in scratch. # This is going to take all day. cd /dev/shm time hgLoadMaf staAur2 multiz369way # Loaded 25064 mafs in 4 files from /gbdb/staAur2/multiz369way # real 0m8.868s time (cat /gbdb/staAur2/multiz369way/maf/*.maf \ | hgLoadMafSummary staAur2 -minSize=10000 -verbose=2 \ -mergeGap=500 -maxSize=50000 multiz369waySummary stdin) # Created 110274 summary blocks from 7621971 components and 25064 mafs from stdin # real 0m22.061s # -rw-rw-r-- 1 5919198 Dec 29 10:26 multiz369waySummary.tab # -rw-rw-r-- 1 1306535 Dec 29 10:26 multiz369way.tab wc -l multiz369way* # 25064 multiz369way.tab # 110274 multiz369waySummary.tab rm multiz369way*.tab ####################################################################### # this fullChromRun is on-going 2018-12-29 to see if it will finish # at some point # only 4 of these, run up as full chromosomes without the splitting # procedure mkdir /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun cd /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project 
penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn find ../../mafLinks -type f | grep ".maf.gz$" | xargs -L 1 basename \ | sed -e 's/.gz//;' | sort -u > maf.list wc -l maf.list # 4 maf.list cat maf.list | sed -e 's/^/# /;' # NC_007790v1.maf # NC_007791v1.maf # NC_007792v1.maf # NC_007793v1.maf # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = staAur2 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/staAur2/bed/multiz369way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh printf '#LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun/maf/$(root1).maf} #ENDLOOP ' > template ssh ku cd /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun/run gensub2 maf.list single template jobList para -ram=64g create jobList # there are only 4 jobs, 'para try' runs them all XXX - running - Sun Dec 23 22:38:44 PST 2018 # the tiny plasmids run rather rapidly, both the small ones under 4 minutes # the large chromosome might run for some time . . . 
# Completed: 7 of 7 jobs # CPU time in finished jobs: 295337s 4922.28m 82.04h 3.42d 0.009 y # IO & Wait Time: 226s 3.77m 0.06h 0.00d 0.000 y # Average job time: 42223s 703.72m 11.73h 0.49d # Longest finished job: 58035s 967.25m 16.12h 0.67d # Submission to last job: 58046s 967.43m 16.12h 0.67d cd /hive/data/genomes/staAur2/bed/multiz369way du -hsc fullChromRun/maf # 20G fullChromRun/maf # Load into database ssh hgwdev cd /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun/maf mkdir /gbdb/staAur2/multiz369way ln -s `pwd`/*.maf /gbdb/staAur2/multiz369way cd /dev/shm time hgLoadMaf staAur2 multiz369way # Loaded 6396612 mafs in 7 files from /gbdb/staAur2/multiz369way # real 2m35.511s time cat /gbdb/staAur2/multiz369way/*.maf \ | hgLoadMafSummary staAur2 -minSize=10000 -verbose=2 \ -mergeGap=500 -maxSize=50000 multiz369waySummary stdin # Created 1897571 summary blocks from 214521234 components and 6396612 mafs from stdin # real 5m27.726s # -rw-rw-r-- 1 312973610 Dec 10 09:06 multiz369way.tab # -rw-rw-r-- 1 103769870 Dec 10 09:15 multiz369waySummary.tab wc -l multiz369way* # 6396612 multiz369way.tab # 1897571 multiz369waySummary.tab rm multiz369way*.tab ############################################################################## # GAP ANNOTATE MULTIZ369WAY MAF AND LOAD TABLES (TBD - 2018-12-10 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. # this new one needs to be created cd /hive/data/genomes/staAur2 twoBitInfo -nBed staAur2.unmasked.2bit staAur2.N.bed # there are no gaps: # -rw-rw-r-- 1 0 Dec 29 11:28 staAur2.N.bed mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/anno cd /hive/data/genomes/staAur2/bed/multiz369way/anno ln -s ../../../nameCorrespond/gcToShortName.tab . 
rm -fr nBedsDir sizesDir nBeds sizes mkdir nBedsDir sizesDir # the UCSC local databases can be done normally cat ../species.list | tr ' ' '\n' | grep "^[a-z]" | while read D do printf "%s\n" "${D}" ls -og /hive/data/genomes/${D}/${D}.N.bed /hive/data/genomes/${D}/chrom.sizes ln -s /hive/data/genomes/${D}/${D}.N.bed nBedsDir/${D}.bed echo "${D}.bed" >> nBeds ln -s /hive/data/genomes/${D}/chrom.sizes sizesDir/${D}.len echo "${D}.len" >> sizes done ls ../../../twoBits/GCF_003193745v1 GCF_003193745v1.2bit GCF_003193745v1.idKeys.txt GCF_003193745v1.chrom.sizes GCF_003193745v1.keySignature.txt cat ../species.list | tr ' ' '\n' | grep -v "^[a-z]" | while read S do gcName=`grep -w "${S}" gcToShortName.tab | cut -f1 | sed -e 's/\./v/;'` printf "%s\t%s\n" "${S}" "${gcName}" twoBitInfo -nBed ../../../twoBits/$gcName/$gcName.2bit nBedsDir/${S}.bed echo ${S}.bed >> nBeds ln -s /hive/data/genomes/staAur2/twoBits/$gcName/$gcName.chrom.sizes sizesDir/${S}.len echo ${S}.len >> sizes done # verify count: wc -l nBeds sizes # 369 nBeds # 369 sizes # make sure they all are successful symLinks: ls -ogL nBedsDir/*.bed | wc -l # 369 screen -S staAur2 # use a screen to control this longish job ssh ku mkdir /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet cd /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet mkdir result ln -s ../nBeds . ln -s ../sizes . ln -s ../sizesDir/*.len . ln -s ../nBedsDir/*.bed . printf '#LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/staAur2/staAur2.2bit {check out line+ result/$(file1)} #ENDLOOP ' > template ls ../../maf/*.maf > maf.list gensub2 maf.list single template jobList # these jobs require a lot of memory para -ram=48g create jobList # only 4 jobs run with the 'para try' para try ... check ... 
para push # Completed: 4 of 4 jobs # CPU time in finished jobs: 111s 1.85m 0.03h 0.00d 0.000 y # IO & Wait Time: 11s 0.19m 0.00h 0.00d 0.000 y # Average job time: 31s 0.51m 0.01h 0.00d # Longest finished job: 113s 1.88m 0.03h 0.00d # Submission to last job: 120s 2.00m 0.03h 0.00d du -hsc result ../../maf # 2.1G result # 1.7G ../../maf # Load into database rm -f /gbdb/staAur2/multiz369way/NC*.maf cd /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet/result ln -s `pwd`/*.maf /gbdb/staAur2/multiz369way/ # this generates an immense multiz369way.tab file in the directory # where it is running. Best to run this over in scratch. cd /dev/shm time hgLoadMaf staAur2 multiz369way # Loading multiz369way into database # Loaded 25064 mafs in 4 files from /gbdb/staAur2/multiz369way # real 0m12.367s time (cat /gbdb/staAur2/multiz369way/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=10000 \ -mergeGap=500 -maxSize=50000 staAur2 multiz369waySummary stdin) # Created 110274 summary blocks from 7621971 components and 25064 mafs from stdin # real 0m26.114s wc -l multiz369*.tab # 25064 multiz369way.tab # 110274 multiz369waySummary.tab # -rw-rw-r-- 1 1310640 Dec 29 11:44 multiz369way.tab # -rw-rw-r-- 1 6139746 Dec 29 11:45 multiz369waySummary.tab rm multiz369way*.tab ############################################################################## # MULTIZ369WAY MAF FRAMES (TBD - 2018-12-13 - Hiram) ssh hgwdev mkdir /hive/data/genomes/staAur2/bed/multiz369way/frames cd /hive/data/genomes/staAur2/bed/multiz369way/frames # survey all the genomes to find out what kinds of gene tracks they have printf '#!/bin/csh -fe foreach db (`cat ../species.list | tr " " "\\n" | grep -v "^[A-Z]"`) echo -n "# ${db}: " set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "ncbiRefSeq" || $table == "mgcGenes" || \ $table == "ws245Genes" || \ $table == "knownGene" || $table == "xenoRefGene" ) then set count = 
`hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end echo end ' > showGenes.csh chmod +x ./showGenes.csh time ./showGenes.csh # staAur2: ensGene: 61109, ncbiRefSeq: 52984, refGene: 53937, ws245Genes: 57827, xenoRefGene: 32625, # caeAng2: ws245Genes: 33934, # caeJap4: ws245Genes: 38144, xenoRefGene: 158453, # cb4: ws245Genes: 23050, xenoRefGene: 130721, # caeSp51: ws245Genes: 46280, # caeRem4: ws245Genes: 32899, xenoRefGene: 155137, # caePb3: ws245Genes: 33210, xenoRefGene: 189584, # caeSp111: ws245Genes: 27721, # priExs1: ws245Genes: 24642, xenoRefGene: 153824, # priPac3: ws245Genes: 24243, xenoRefGene: 97503, # burXyl1: ws245Genes: 18074, # necAme1: ws245Genes: 19643, # ancCey1: ws245Genes: 65583, xenoRefGene: 149848, # ascSuu1: ws245Genes: 3082, xenoRefGene: 148202, # haeCon2: ws245Genes: 26359, # triSui1: ws245Genes: 14435, # panRed1: ws245Genes: 26372, # bruMal2: ws245Genes: 18709, # dirImm1: ws245Genes: 12857, # oncVol1: ws245Genes: 12385, # loaLoa1: ws245Genes: 15577, # melInc2: ws245Genes: 20365, # melHap1: ws245Genes: 14420, xenoRefGene: 88300, # ci3: ensGene: 17784, refGene: 1306, xenoRefGene: 86960, # hetBac1: ws245Genes: 20928, # triSpi1: ws245Genes: 16380, # from that summary, use these gene sets: # ensGene - staAur2 ci3 # ws245Genes - caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1 # priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2 # dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1 mkdir genes # 1. ensGene: staAur2 ci3 for DB in staAur2 ci3 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz echo -n "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # staAur2: checked: 20195 failed: 0 # ci3: checked: 16035 failed: 0 # 2. 
ws245Genes caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1 # priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2 # dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1 for DB in caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1 priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2 dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ws245Genes" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz echo -n "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # caeAng2: checked: 27919 failed: 0 # caeJap4: checked: 29913 failed: 0 # cb4: checked: 21737 failed: 0 # caeSp51: checked: 34679 failed: 0 # caeRem4: checked: 31433 failed: 0 # caePb3: checked: 30626 failed: 0 # caeSp111: checked: 22306 failed: 0 # priExs1: checked: 23767 failed: 0 # priPac3: checked: 22872 failed: 0 # burXyl1: checked: 18074 failed: 0 # necAme1: checked: 18984 failed: 0 # ancCey1: checked: 35775 failed: 0 # ascSuu1: checked: 3065 failed: 0 # haeCon2: checked: 21837 failed: 0 # triSui1: checked: 13600 failed: 0 # panRed1: checked: 24071 failed: 0 # bruMal2: checked: 13471 failed: 0 # dirImm1: checked: 12857 failed: 0 # oncVol1: checked: 12133 failed: 0 # loaLoa1: checked: 14903 failed: 0 # melInc2: checked: 19189 failed: 0 # melHap1: checked: 14103 failed: 0 # hetBac1: checked: 20928 failed: 0 # triSpi1: checked: 16370 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/ancCey1.gp.gz: 35775 # genes/ascSuu1.gp.gz: 3065 # genes/bruMal2.gp.gz: 13471 # genes/burXyl1.gp.gz: 18074 # genes/caeAng2.gp.gz: 27919 # genes/caeJap4.gp.gz: 29913 # genes/caePb3.gp.gz: 30626 # genes/caeRem4.gp.gz: 31433 # genes/caeSp111.gp.gz: 22306 # genes/caeSp51.gp.gz: 34679 # 
genes/cb4.gp.gz: 21737 # genes/staAur2.gp.gz: 20195 # genes/ci3.gp.gz: 16035 # genes/dirImm1.gp.gz: 12857 # genes/haeCon2.gp.gz: 21837 # genes/hetBac1.gp.gz: 20928 # genes/loaLoa1.gp.gz: 14903 # genes/melHap1.gp.gz: 14103 # genes/melInc2.gp.gz: 19189 # genes/necAme1.gp.gz: 18984 # genes/oncVol1.gp.gz: 12133 # genes/panRed1.gp.gz: 24071 # genes/priExs1.gp.gz: 23767 # genes/priPac3.gp.gz: 22872 # genes/triSpi1.gp.gz: 16370 # genes/triSui1.gp.gz: 13600 # kluster job to annotate each maf file screen -S staAur2 # manage long running procedure with screen ssh ku cd /hive/data/genomes/staAur2/bed/multiz369way/frames printf '#!/bin/csh -fe set C = $1 set G = $2 cat ../anno/mafNet/result/${C}.maf | genePredToMafFrames staAur2 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz ' > runOne chmod +x runOne ls ../anno/mafNet/result > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list printf '#LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP ' > template mkdir parts gensub2 chr.list gene.list template jobList para -ram=64g create jobList para try ... check ... 
push # Completed: 182 of 182 jobs # CPU time in finished jobs: 16011s 266.86m 4.45h 0.19d 0.001 y # IO & Wait Time: 7072s 117.86m 1.96h 0.08d 0.000 y # Average job time: 127s 2.11m 0.04h 0.00d # Longest finished job: 196s 3.27m 0.05h 0.00d # Submission to last job: 401s 6.68m 0.11h 0.00d # collect all results into one file: cd /hive/data/genomes/staAur2/bed/multiz369way/frames time find ./parts -type f | while read F do echo "${F}" 1>&2 zcat ${F} done | sort -k1,1 -k2,2n > multiz369wayFrames.bed # real 0m24.365s # -rw-rw-r-- 1 287750919 Dec 13 11:50 multiz369wayFrames.bed gzip multiz369wayFrames.bed # verify there are frames on everything, should be 26 species: # (count from: ls genes | wc) zcat multiz369wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' > species.check.list wc -l species.check.list # 26 # 278211 ancCey1 # 12740 ascSuu1 # 120258 bruMal2 # 147066 burXyl1 # 314609 caeAng2 # 268169 caeJap4 # 372623 caePb3 # 450001 caeRem4 # 384830 caeSp111 # 271790 caeSp51 # 251521 cb4 # 123527 staAur2 # 49580 ci3 # 119879 dirImm1 # 154460 haeCon2 # 209501 hetBac1 # 119983 loaLoa1 # 100365 melHap1 # 87175 melInc2 # 144268 necAme1 # 124476 oncVol1 # 157050 panRed1 # 86331 priExs1 # 89170 priPac3 # 94395 triSpi1 # 60978 triSui1 # load the resulting file ssh hgwdev cd /hive/data/genomes/staAur2/bed/multiz369way/frames time hgLoadMafFrames staAur2 multiz369wayFrames multiz369wayFrames.bed.gz # real 0m29.214s hgsql -e 'select count(*) from multiz369wayFrames;' staAur2 # +----------+ # | count(*) | # +----------+ # | 4592956 | # +----------+ time featureBits -countGaps staAur2 multiz369wayFrames # 34457786 bases of 100286401 (34.359%) in intersection # real 0m22.416s # enable the trackDb entries: # frames multiz369wayFrames # irows on # zoom to base level in an exon to see codon displays # appears to work OK ######################################################################### # Phylogenetic tree from 369-way (TBD - 2018-12-13 - Hiram) mkdir 
/hive/data/genomes/staAur2/bed/multiz369way/4d cd /hive/data/genomes/staAur2/bed/multiz369way/4d # the annotated maf's are in: ../anno/mafNet/result # using knownGene for staAur2, only transcribed genes and nothing # from the randoms and other misc. hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene where cdsEnd > cdsStart;" staAur2 \ | egrep -E -v "chrM|chrUn|random|_alt" > ensGene.gp wc -l *.gp # 33379 ensGene.gp # verify it is only on the chroms: cut -f2 ensGene.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' # 7509 chrV # 6211 chrIV # 5588 chrII # 4997 chrI # 4646 chrIII # 4428 chrX genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp wc -l ensGeneNR.gp # 20184 ensGeneNR.gp ssh ku mkdir /hive/data/genomes/staAur2/bed/multiz369way/4d/run cd /hive/data/genomes/staAur2/bed/multiz369way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/staAur2/bed/multiz369way" set c = $1 set infile = $r/anno/mafNet/result/$2 set outfile = $3 cd /dev/shm # 'clean' maf, removes all chrom names, leaves only the db name perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\tstaAur2.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ../anno/mafNet/result ls -1S /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet/result/*.maf \ | sed -e "s#.*multiz369way/anno/mafNet/result/##" \ | egrep -E -v "chrM|chrUn|random|_alt" > maf.list printf '#LOOP 
4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP ' > template gensub2 maf.list single template jobList para -ram=64g create jobList para try ... check ... push ... etc... para time # Completed: 6 of 6 jobs # CPU time in finished jobs: 1829s 30.48m 0.51h 0.02d 0.000 y # IO & Wait Time: 20s 0.34m 0.01h 0.00d 0.000 y # Average job time: 308s 5.14m 0.09h 0.00d # Longest finished job: 341s 5.68m 0.09h 0.00d # Submission to last job: 352s 5.87m 0.10h 0.00d # combine mfa files ssh hgwdev cd /hive/data/genomes/staAur2/bed/multiz369way/4d # verify no tiny files: ls -og mfa | sort -k3nr # -rw-rw-r-- 1 4660039 Dec 13 12:08 chrII.mfa # -rw-rw-r-- 1 3973699 Dec 13 12:09 chrV.mfa # -rw-rw-r-- 1 3416554 Dec 13 12:08 chrX.mfa # -rw-rw-r-- 1 2920429 Dec 13 12:08 chrI.mfa # -rw-rw-r-- 1 2589544 Dec 13 12:08 chrIII.mfa # -rw-rw-r-- 1 2183464 Dec 13 12:09 chrIV.mfa #want comma-less species.list time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # real 0m1.255s # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 369 # remove the :0.1234 distances from the nh tree: sed -e 's/:[0-9.]\+//g;' ../staAur2.369way.nh > tree-commas.nh # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree-commas.nh 4d.all.mfa # real 50m55.630s mv phyloFit.mod all.mod # after rearranging the tree to be more display friendly: # (((((((((((((Caenorhabditis_elegans:0.425799, # (Caenorhabditis_sp._38_MB-2015:0.448904, # Caenorhabditis_angaria:0.438865):0.38477):0.0632893, # Caenorhabditis_japonica:0.631705):0.00332863, # Caenorhabditis_sp._34_TK-2017:0.670052):0.00266019, # (((((((Caenorhabditis_briggsae:0.00761568, # Caenorhabditis_briggsae:0.00617763):0.0901744, # Caenorhabditis_nigoni:0.0878926):0.37284, # 
(Caenorhabditis_sp._26_LS-2015:0.266258, # Caenorhabditis_sp5_ju800:0.250871):0.0573557):0.00272983, # Caenorhabditis_sp._40_LS-2015:0.311033):0.160657, # ((Caenorhabditis_latens:0.0739274, # Caenorhabditis_remanei:0.0771516):0.343608, # Caenorhabditis_brenneri:0.439498):0.0439004):0.0374774, # Caenorhabditis_tropicalis:0.479027):0.0929836, # Caenorhabditis_sp._31_LS-2015:0.844993):0.0466798):0.0886724, # Caenorhabditis_sp._21_LS-2015:0.81046):0.159439, # (((Caenorhabditis_sp._32_LS-2015:0.461375, # Caenorhabditis_sp._39_LS-2015:0.42763):0.0699299, # (((Pristionchus_arcanus:0.12529, # ((Pristionchus_exspectatus:0.0115759, # Pristionchus_exspectatus:0.00987795):0.092637, # (Pristionchus_pacificus:0.0369936, # Pristionchus_pacificus:0.0151984):0.0917107):0.0582013):0.146807, # Pristionchus_maxplancki:0.231402):0.108203, # Pristionchus_entomophagus:0.314155):0.615931):0.00264449, # (((Bursaphelenchus_xylophilus:0.0169712, # Bursaphelenchus_xylophilus:0.0121836):0.664516, # Subanguina_moxae:1.19812):0.279427, # ((Oscheius_sp._MCB:0.310932, # Oscheius_sp._TEL-2014:0.314067):0.387814, # (((Angiostrongylus_cantonensis:0.76614, # (Necator_americanus:0.0069094, # Necator_americanus:0.00709169):0.62038):0.22255, # Nippostrongylus_brasiliensis:0.585445):0.128625, # ((((((((Ancylostoma_caninum:0.109552, # Ancylostoma_duodenale:0.128167):0.0645197, # Ancylostoma_ceylanicum:0.138249):0.302453, # ((((Ascaris_suum:0.0264911, # Ascaris_suum:0.0266353):0.0718425, # Parascaris_univalens:0.111167):0.412171, # Toxocara_canis:0.435243):1.03133, # Oesophagostomum_dentatum:0.444431):0.133431):0.231588, # ((Haemonchus_contortus:0.0237971, # Haemonchus_contortus:0.0104187):0.40375, # Teladorsagia_circumcincta:0.313931):0.415593):0.132113, # (((Clonorchis_sinensis:0.102068, # Opisthorchis_viverrini:0.0948009):0.672677, # Dicrocoelium_dendriticum:0.641121):0.366763, # ((Elaeophora_elaphi:0.229754, # Macrostomum_lignano:0.315971):0.260862, # ((Fasciola_gigantica:0.0728112, # 
Fasciola_hepatica:0.0784224):0.849193, # Spirometra_erinaceieuropaei:0.717335):0.238022):0.175856):0.306152):0.0247175, # (Heligmosomoides_polygyrus_bakeri:0.639113, # (Oscheius_tipulae:0.772979, # (((Steinernema_carpocapsae:0.288945, # Steinernema_scapterisci:0.285651):0.224527, # Steinernema_feltiae:0.606142):0.0853374, # Steinernema_glaseri:0.525076):0.152498):0.168142):0.00279645):0.00267557, # ((((Echinococcus_canadensis:0.0305467, # Echinococcus_granulosus:0.0307877):0.00949128, # Echinococcus_multilocularis:0.039056):0.157973, # (((Taenia_asiatica:0.00962609, # Taenia_saginata:0.0164412):0.0108728, # Taenia_multiceps:0.0270195):0.034031, # Taenia_solium:0.0458656):0.187618):0.687903, # (Trichuris_muris:0.289869, # (Trichuris_trichiura:0.184836, # Trichuris_suis:0.125543):0.291242):0.610672):0.454711):0.00261495, # ((Plectus_sambesii:0.668591, # Steinernema_monticolum:0.874638):0.0801328, # Panagrellus_redivivus:0.667253):0.218884):0.00718934):0.198615):0.0754372):0.16779):0.00370935):0.149757, # (Diploscapter_coronatus:0.0396115, # Diploscapter_pachys:0.0410107):0.965863):0.0835229, # (((((((((((((Brugia_malayi:0.0147423, # Brugia_malayi:0.0184123):0.0136278, # Brugia_pahangi:0.0347893):0.0496926, # Wuchereria_bancrofti:0.0607276):0.203732, # ((Dirofilaria_immitis:0.021486, # Dirofilaria_immitis:0.00721013):0.206272, # (Onchocerca_ochengi:0.0198343, # (Onchocerca_volvulus:0.00726612, # Onchocerca_volvulus:0.0153111):0.0105149):0.0986467):0.0222077):0.00274305, # (Loa_loa:0.0109839, # Loa_loa:0.00898028):0.213308):0.00261442, # (Setaria_digitata:0.149493, # Setaria_equina:0.120979):0.317084):0.0425831, # Onchocerca_flexuosa:0.0978734):0.742036, # ((((Dugesia_japonica:0.514157, # Girardia_tigrina:1.22545):0.240555, # Schmidtea_mediterranea:0.619031):0.356539, # (((((Meloidogyne_arenaria:0.0340878, # Meloidogyne_javanica:0.0426825):0.0272648, # Meloidogyne_incognita:0.0308494):0.0157159, # (Meloidogyne_floridensis:0.0840962, # 
Meloidogyne_incognita:0.0530444):0.00989274):0.18009, # (Meloidogyne_graminicola:0.50525, # Meloidogyne_hapla:0.171223):0.0509786):0.654631, # (Parastrongyloides_trichosuri:0.389745, # ((Strongyloides_papillosus:0.0783108, # Strongyloides_venezuelensis:0.0916494):0.198375, # (Strongyloides_stercoralis:0.177646, # Strongyloides_ratti:0.150624):0.0896129):0.165222):0.220136):0.131527):0.137744, # ((Schistosoma_haematobium:0.0960504, # Schistosoma_mansoni:0.10575):0.224188, # Schistosoma_japonicum:0.29067):0.484258):0.152071):0.239138, # Dictyocaulus_viviparus:0.814104):0.248629, # Hymenolepis_microstoma:0.705812):0.234011, # (Rhabditophanes_sp._KR3021:0.753609, # (Ciona_intestinalis:0.864953, # Heterorhabditis_bacteriophora:0.889764):0.341081):0.233997):0.163448, # Gyrodactylus_salaris:0.912435):0.232379, # Parapristionchus_giblindavisi:0.738653):0.156608):0.28746, # Ditylenchus_destructor:0.885526):0.238847, # (((((Trichinella_sp._T6:0.0168947, # Trichinella_patagoniensis:0.0187164):0.00398067, # ((Trichinella_sp._T8:0.0102628, # (Trichinella_sp._T9:0.0118704, # Trichinella_britovi:0.00645468):0.00273378):0.00273053, # Trichinella_murrelli:0.0111672):0.00482982):0.00272028, # Trichinella_nativa:0.0154206):0.0136128, # (Trichinella_nelsoni:0.0210349, # (Trichinella_spiralis:0.00697283, # Trichinella_spiralis:0.0168217):0.0169346):0.00352233):0.0510198, # ((Trichinella_papuae:0.0217595, # Trichinella_zimbabwensis:0.0207942):0.0233717, # Trichinella_pseudospiralis:0.0610335):0.0574885):0.947802):0.183355, # Romanomermis_culicivorax:0.680971):0.164498, # ((((Globodera_ellingtonae:0.035005, # Globodera_rostochiensis:0.0362764):0.0267624, # Globodera_pallida:0.0659637):0.259339, # Heterodera_glycines:0.354234):0.225215, # Rotylenchulus_reniformis:0.468661):0.395303):0.491, # Acrobeloides_nanus:0.491):0.1:0.1; grep TREE all.mod | sed -e 's/TREE: //;' \ | /cluster/bin/phast/all_dists /dev/stdin | grep staAur2 \ | sed -e "s/staAur2.//;" | sort > staAur2.dists 
/cluster/bin/phast/all_dists ../staAur2.369way.nh | grep staAur2 \ | sed -e "s/staAur2.//;" | sort > original.dists # printing out the 'original', the 'new' the 'difference' and # percent difference/delta join original.dists staAur2.dists | awk '{ printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' | sort -k4n ######################################################################### # phastCons 369-way (TBD - 2018-12-14 - Hiram) # split 369way mafs into 1M chunks and generate sufficient statistics # files for phastCons ssh ku mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/cons/ss mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/cons/msa.split cd /hive/data/genomes/staAur2/bed/multiz369way/cons/msa.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet/result/$c.maf set WINDOWS = /hive/data/genomes/staAur2/bed/multiz369way/cons/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh printf '#LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP ' > template # do the easy ones first to see some immediate results ls -1S -r ../../anno/mafNet/result | sed -e "s/.maf//;" > maf.list # all can finish OK at a 64Gb memory limit gensub2 maf.list single template jobList para -ram=64g create jobList para try ... check ... 
etc para push # Completed: 7 of 7 jobs # CPU time in finished jobs: 2431s 40.52m 0.68h 0.03d 0.000 y # IO & Wait Time: 278s 4.63m 0.08h 0.00d 0.000 y # Average job time: 387s 6.45m 0.11h 0.00d # Longest finished job: 540s 9.00m 0.15h 0.01d # Submission to last job: 548s 9.13m 0.15h 0.01d # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/cons/run.cons cd /hive/data/genomes/staAur2/bed/multiz369way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. Running: # all and vertebrates cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/staAur2/bed/multiz369way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons/ss set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$c/$f.ss $tmp else ln -s $ssSrc/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == 
chrom name, file1 == ss file name without .ss suffix printf '#LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP ' > template ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list wc -l ss.list # 104 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/staAur2/bed/multiz369way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList # beware overwhelming the cluster with these fast running high I/O jobs para -ram=32g create jobList para try ... check ... para -maxJob=16 push # Completed: 104 of 104 jobs # CPU time in finished jobs: 6198s 103.30m 1.72h 0.07d 0.000 y # IO & Wait Time: 718s 11.97m 0.20h 0.01d 0.000 y # Average job time: 67s 1.11m 0.02h 0.00d # Longest finished job: 84s 1.40m 0.02h 0.00d # Submission to last job: 216s 3.60m 0.06h 0.00d # create Most Conserved track cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all time cut -f1 ../../../../chrom.sizes | while read C do echo $C 1>&2 ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m39.720s # -rw-rw-r-- 1 74615007 Dec 14 14:15 tmpMostConserved.bed time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ > mostConserved.bed # real 0m10.119s # -rw-rw-r-- 1 76283801 Dec 14 14:16 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all time hgLoadBed staAur2 phastConsElements369way mostConserved.bed # Read 2275482 elements of size 5 from mostConserved.bed # real 0m10.328s # most interesting high measurement for this coverage # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits staAur2 -enrichment ncbiRefSeq:cds phastConsElements369way # ncbiRefSeq:cds 25.009%, phastConsElements369way 43.266%, both 16.084%, 
cover 64.31%, enrich 1.49x # real 0m12.804s # Try for 5% overall cov, and 70% CDS cov time featureBits staAur2 -enrichment refGene:cds phastConsElements369way # refGene:cds 25.013%, phastConsElements369way 43.266%, both 16.087%, cover 64.31%, enrich 1.49x # real 0m13.519s time featureBits staAur2 -enrichment ensGene:cds phastConsElements369way # ensGene:cds 25.144%, phastConsElements369way 43.266%, both 16.129%, cover 64.15%, enrich 1.48x # real 0m10.267s # Create merged posterior probability file and wiggle track data files cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all mkdir downloads time for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" 1>&2 find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons369way.wigFix.gz done # real 0m51.373s # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons369way.wig phastCons369way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 0m26.755s du -hsc *.wi? 
# 91M phastCons369way.wib # 9.1M phastCons369way.wig # -rw-rw-r-- 1 95051167 Dec 14 14:24 phastCons369way.wib # -rw-rw-r-- 1 9487394 Dec 14 14:24 phastCons369way.wig # encode into a bigWig file: # (warning wigToBigWig process may be too large for memory limits # in bash, to avoid the 32 Gb memory limit, set 180 Gb here: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin \ ../../../../chrom.sizes phastCons369way.bw) > bigWig.log 2>&1 egrep "VmPeak|real" bigWig.log # pid=74267: VmPeak: 1108636 kB # real 0m51.112s # -rw-rw-r-- 1 244041045 Dec 14 14:27 phastCons369way.bw bigWigInfo phastCons369way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 182,564,643 primaryIndexSize: 3,214,556 zoomLevels: 10 chromCount: 7 basesCovered: 95,051,167 mean: 0.467683 min: 0.000000 max: 1.000000 std: 0.439150 # if you wanted to use the bigWig file, loading bigWig table: # but we don't use the bigWig file ln -s `pwd`/phastCons369way.bw /gbdb/staAur2/multiz369way hgsql staAur2 -e 'drop table if exists phastCons369way; \ create table phastCons369way (fileName varchar(255) not null); \ insert into phastCons369way values ("/gbdb/staAur2/multiz369way/phastCons369way.bw");' # Load gbdb and database with wiggle. 
ssh hgwdev cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all ln -s `pwd`/phastCons369way.wib /gbdb/staAur2/multiz369way/phastCons369way.wib time hgLoadWiggle -pathPrefix=/gbdb/staAur2/multiz369way staAur2 \ phastCons369way phastCons369way.wig # real 0m0.722s time wigTableStats.sh staAur2 phastCons369way # db.table min max mean count sumData # staAur2.phastCons369way 0 1 0.467683 95051167 4.44538e+07 # stdDev viewLimits # 0.43915 viewLimits=0:1 # real 0m0.505s # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all time hgWiggle -doHistogram -db=staAur2 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons369way > staAur2.phastCons369way.histogram.data 2>&1 # real 0m4.020s # the pid VmPeak measurements to stderr are getting into the middle of # a line in this data output. Either do *not* use 2>&1 to get all # into the output file, or edit it to fix it after done # create plot of histogram: # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change) printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" set output "staAur2.phastCons369.histo.png" set size 1.0, 1.0 set style line 1 lt 2 lc rgb "#ff88ff" lw 2 set style line 2 lt 2 lc rgb "#66ff66" lw 2 set style line 3 lt 2 lc rgb "#ffff00" lw 2 set style line 4 lt 2 lc rgb "#ffffff" lw 2 set border lc rgb "#ffff00" set key left box ls 3 set key tc variable set grid noxtics set grid y2tics ls 4 set grid ytics ls 4 set title " C. 
elegans/staAur2 Histogram phastCons369way track" \ tc rgb "#ffffff" set xlabel " phastCons369way score" tc rgb "#ffffff" set ylabel " Relative Frequency" tc rgb "#ff88ff" set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" set y2range [0:1] set yrange [0:0.02] plot "staAur2.phastCons369way.histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ "staAur2.phastCons369way.histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 ' | gnuplot # take a look to see if it is sane: display histo.png & ######################################################################### # phyloP for 369-way (TBD - 2018-12-14 - Hiram) # # can use the same split 1M chunk SS files into that were used # for phastCons # run phyloP with score=LRT ssh ku mkdir /cluster/data/staAur2/bed/multiz369way/consPhyloP cd /cluster/data/staAur2/bed/multiz369way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACK ../../4d/all.mod # BACKGROUND: 0.338800 0.205656 0.164894 0.290649 grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.371 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../4d/all.mod 0.371 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.314500 0.185500 0.185500 0.314500 cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set d = $f:h set file1 = $f:t set out = $2 set cName = $f:t:r set grp = $cwd:t set cons = /hive/data/genomes/staAur2/bed/multiz369way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "/hive/data/genomes/staAur2/bed/multiz369way/cons/ss/$f" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > 
$file1.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$file1.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../../cons/ss -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/ss/##" > ss.list # make sure the list looks good wc -l ss.list # 104 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix printf '#LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP ' > template ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/staAur2/bed/multiz369way/consPhyloP/all cd /hive/data/genomes/staAur2/bed/multiz369way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overloading the cluster with these quick and high I/O jobs para -ram=32g create jobList para try ... check ... 
para -maxJob=16 push para time > run.time # Completed: 104 of 104 jobs # CPU time in finished jobs: 420481s 7008.02m 116.80h 4.87d 0.013 y # IO & Wait Time: 1037s 17.28m 0.29h 0.01d 0.000 y # Average job time: 4053s 67.55m 1.13h 0.05d # Longest finished job: 5455s 90.92m 1.52h 0.06d # Submission to last job: 5629s 93.82m 1.56h 0.07d mkdir downloads time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" 1>&2 find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP369way.wigFix.gz done # real 1m24.231s du -hsc downloads # 195M downloads # check integrity of data with wigToBigWig time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/staAur2/chrom.sizes \ phyloP369way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=180522: VmPeak: 1108636 kB # real 0m51.720s # -rw-rw-r-- 1 356259120 Dec 14 17:17 phyloP369way.bw bigWigInfo phyloP369way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 281,654,520 # primaryIndexSize: 3,214,556 # zoomLevels: 10 # chromCount: 7 # basesCovered: 95,051,167 # mean: 1.130380 # min: -20.000000 # max: 20.000000 # std: 2.927330 # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phyloP369way.wig phyloP369way.wib) # Converted stdin, upper limit 20.00, lower limit -20.00 # real 0m25.705s # -rw-rw-r-- 1 95051167 Dec 14 17:19 phyloP369way.wib # -rw-rw-r-- 1 10006648 Dec 14 17:19 phyloP369way.wig du -hsc *.wi? # 91M phyloP369way.wib # 9.6M phyloP369way.wig # Load gbdb and database with wiggle. 
ln -s `pwd`/phyloP369way.wib /gbdb/staAur2/multiz369way/phyloP369way.wib time hgLoadWiggle -pathPrefix=/gbdb/staAur2/multiz369way staAur2 \ phyloP369way phyloP369way.wig # real 0m0.761s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh staAur2 phyloP369way # db.table min max mean count sumData # staAur2.phyloP369way -20 20 1.13038 95051167 1.07444e+08 # stdDev viewLimits # 2.92733 viewLimits=-13.5063:15.767 # that range is: 20+20= 40 for hBinSize=0.04 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.04 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=staAur2 phyloP369way > staAur2.phyloP369way.histo.data 2>&1 # real 0m4.117s # xaxis range: grep -v chrom staAur2.phyloP369way.histo.data | grep "^[0-9]" \ | ave -col=2 stdin | sed -e 's/^/# /;' # Q1 -7.500000 # median 0.320000 # Q3 8.140000 # average 0.309272 # min -20.000000 # max 20.000000 # count 783 # total 242.159998 # standard deviation 9.085841 # find out the range for the 2:5 graph grep -v chrom staAur2.phyloP369way.histo.data | grep "^[0-9]" \ | ave -col=5 stdin | sed -e 's/^/# /;' # Q1 0.000002 # median 0.000108 # Q3 0.000374 # average 0.001277 # min 0.000000 # max 0.038921 # count 783 # total 0.999993 # standard deviation 0.003638 # create plot of histogram: # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change) printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" set output "staAur2.phyloP369.histo.png" set size 1.0, 1.0 set style line 1 lt 2 lc rgb "#ff88ff" lw 2 set style line 2 lt 2 lc rgb "#66ff66" lw 2 set style line 3 lt 2 lc rgb "#ffff00" lw 2 set style line 4 lt 2 lc rgb "#ffffff" lw 2 set border lc rgb "#ffff00" set key left box ls 3 set key tc variable set grid noxtics set grid y2tics ls 4 set grid ytics ls 4 set title " C. 
elegans/staAur2 Histogram phyloP369way track" \ tc rgb "#ffffff" set xlabel " phyloP369way score" tc rgb "#ffffff" set ylabel " Relative Frequency" tc rgb "#ff88ff" set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" set xrange [-1:1.5] set yrange [0:0.04] plot "staAur2.phyloP369way.histo.data" using 2:5 title " RelFreq" with impulses ls 1, \ "staAur2.phyloP369way.histo.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 ' | gnuplot # verify it looks sane display staAur2.phyloP369.histo.png & ############################################################################# # determine taxonomy for everything (TBD - 2018-12-14 - Hiram) # to set up group divisions # use this perl script to extract the taxonomy strings from NCBI entrez mkdir /hive/data/genomes/staAur2/bed/multiz369way/taxonomy cd /hive/data/genomes/staAur2/bed/multiz369way/taxonomy cat > taxon.pl << '_EOF_' #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc < 1) { printf STDERR "usage: taxon.pl <taxId> [more taxIds]\n"; exit 255; } while (my $taxId = shift) { my $taxonomyString = ""; open (TX, "wget -O /dev/stdout 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=$taxId&retmode=xml' 2> /dev/null | grep -i ScientificName | headRest 1 stdin|") or die "can not wget entrez/eutils"; while (my $sciName = <TX>) { chomp $sciName; $sciName =~ s/.*<ScientificName>//; $sciName =~ s#</ScientificName>.*##; $taxonomyString .= sprintf("%s;", $sciName); } close (TX); printf "%d\t%s\n", $taxId, $taxonomyString; } '_EOF_' # << happy emacs chmod +x taxon.pl ./taxon.pl `sed -e 's/:.*//; s/(//g; s/ //g;' ../staAur2.369way.taxId.nh | sort -u | xargs echo` > taxonomy.txt # survey the names in those sed -e 's/.*organisms;//;' taxonomy.txt | tr ';' '\n' | sort | uniq -c | sort -rn | head -20 122 Opisthokonta 122 Metazoa 122 Eumetazoa 122 Eukaryota 122 Bilateria 122 99 Protostomia 99 Nematoda 99 Ecdysozoa 83 Chromadorea 76 Rhabditida 35 Rhabditina 29 Rhabditomorpha 27 Tylenchina 23 Rhabditoidea 23 
Rhabditidae 22 Platyhelminthes 18 Peloderinae 18 Caenorhabditis 16 Enoplea sed -e 's/.*;Bilateria;//;' taxonomy.txt | cut -d';' -f1-4 \ | sort | uniq -c | sort -n 1 Deuterostomia;Chordata;Tunicata;Ascidiacea 1 Platyhelminthes;Cestoda;Eucestoda;Diphyllobothriidea 1 Platyhelminthes;Monogenea;Monopisthocotylea;Gyrodactylidea 1 Platyhelminthes;Rhabditophora;Macrostomorpha;Macrostomida 2 Platyhelminthes;Trematoda;Digenea;Opisthorchiida 3 Platyhelminthes;Rhabditophora;Seriata;Tricladida 3 Platyhelminthes;Trematoda;Digenea;Plagiorchiida 3 Platyhelminthes;Trematoda;Digenea;Strigeidida 8 Platyhelminthes;Cestoda;Eucestoda;Cyclophyllidea 16 Protostomia;Ecdysozoa;Nematoda;Enoplea 83 Protostomia;Ecdysozoa;Nematoda;Chromadorea sed -e 's/.*;Bilateria;//;' taxonomy.txt | grep Protostomia \ | cut -d';' -f1-6 | sort | uniq -c | sort -n 1 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Plectida;Plectina 1 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Strongylida;Metastrongyloidea 1 Protostomia;Ecdysozoa;Nematoda;Enoplea;Dorylaimia;Mermithida 5 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Strongylida;Trichostrongyloidea 14 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Spirurina 15 Protostomia;Ecdysozoa;Nematoda;Enoplea;Dorylaimia;Trichinellida 27 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Tylenchina 35 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Rhabditina # with those numbers in hand, break them up like this # the 'orderByChain.db.txt' list is from the output of sizeStats.pl # as done earlier, just the sequence name/db names from that output # in their featureBits chainLink order for N in Deuterostomia Platyhelminthes Plectida Rhabditina Spirurina Tylenchina Strongylida Dorylaimia do grep ";$N;" taxonomy.txt | cut -f1 > $N.taxId.list join -t$'\t' <(sort $N.taxId.list) <(sort taxId.GC.sequenceName.accAsmId.sciName.name.tab) | cut -f3 | sort -u > $N.db.list done cat orderByChain.db.txt | while read n do grep -w "${n}" *.list done | sed -e 's/.db.list:/\t/;' > 
division.dbName.ordered.tab for N in Deuterostomia Platyhelminthes Plectida Rhabditina Spirurina Tylenchina Strongylida Dorylaimia do grep "$N" division.dbName.ordered.tab | cut -f2 > ${N}.ordered done # those *.ordered listings are used in trackDb to establish # the chainNet composite: cd ~/kent/src/hg/makeDb/trackDb/worm/staAur2 ~/kent/src/hg/utils/phyloTrees/chainNetCompositeTrackDb.pl net nematodes \ staAur2 Rhabditina.ordered Spirurina.ordered Tylenchina.ordered \ Strongylida.ordered Plectida.ordered Dorylaimia.ordered \ Platyhelminthes.ordered Deuterostomia.ordered \ > chainNetNematodes.ra # edit that .ra file to remove the .ordered from each name in this list: subGroup3 clade Clade c00=Rhabditina c01=Spirurina c02=Tylenchina c03=Strongylida c04=Plectida c05=Dorylaimia c06=Platyhelminthes c07=Deuterostomia # can avoid this edit by calling those files *.list instead ############################################################################# # construct download files for 369-way (TBD - 2018-11-27 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phastCons369way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phyloP369way mkdir /hive/data/genomes/staAur2/bed/multiz369way/downloads cd /hive/data/genomes/staAur2/bed/multiz369way/downloads mkdir multiz369way phastCons369way phyloP369way ######################################################################### ## create upstream refGene maf files cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way # bash script #!/bin/bash export geneTbl="ensGene" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits staAur2 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags staAur2 multiz369way \ stdin stdout \ -orgs=/hive/data/genomes/staAur2/bed/multiz369way/species.list \ | gzip -c > 
upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 107m51.664s # -rw-rw-r-- 1 289630077 Dec 14 18:00 upstream1000.ensGene.maf.gz # -rw-rw-r-- 1 545877544 Dec 14 18:30 upstream2000.ensGene.maf.gz # -rw-rw-r-- 1 1334065100 Dec 14 19:16 upstream5000.ensGene.maf.gz ###################################################################### ## compress the maf files cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way time rsync -a -P ../../anno/mafNet/result/ ./maf/ # real 3m21.807s du -hsc maf/ # 49G maf/ cd maf time gzip *.maf & # real 21m44.263s du -hscL maf ../../anno/maf/result/ # 5.4G maf # 63G ../../anno/maf/result/ cd maf md5sum *.maf.gz > md5sum.txt # collect the other sequences here: cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way mkdir sequences grep -v "^[a-z]" nameCrossReference.tsv | while read L do seqName=`printf "%s" "${L}" | awk -F$'\t' '{print $1}'` acc=`printf "%s" "${L}" | awk -F$'\t' '{print $2}'` asmId=`printf "%s" "${L}" | awk -F$'\t' '{print $4}'` src="../../../awsMultiz/sequences/${acc}_${asmId}" twoBit="../../../awsMultiz/twoBit/${acc}_${asmId}.2bit" destName=`printf "%s.%s.%s" "${seqName}" "${acc}" "${asmId}"` printf "sequences/%s\n" "${destName}" mkdir -p "sequences/${destName}" cp -p "${twoBit}" "sequences/${destName}/" cp -p ${src}/*_assembly_report.txt "sequences/${destName}/" done du -hsc sequences # 7.2G sequences/ cd sequences time md5sum */*.2bit */*.txt > md5sum.txt # real 0m30.828s mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/maf cd maf ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/maf/ mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/sequences cd ../sequences ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/sequences/ cd .. 
# obtain the README.txt from the multiz26way and fix it up with appropriate # listings ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/ ########################################################################### cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > staAur2.369way.nh cat staAur2.369way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=../../sequenceName.sciName.tab -noInternal \ -lineOutput /dev/stdin > staAur2.369way.scientificName.nh cat staAur2.369way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=../../sequenceName.taxId.tab -noInternal \ -lineOutput /dev/stdin > staAur2.369way.taxId.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh staAur2.369way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > staAur2.369way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh staAur2.369way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > staAur2.369way.scientificNames.nh time md5sum *.nh *.maf.gz > md5sum.txt # real 0m3.147s ln -s `pwd`/*.maf.gz `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way du -hsc ./maf ../../anno/result # 18G ./maf # 156G ../../anno/result # obtain the README.txt from staAur2/multiz20way and update for this # situation ln -s `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/ ##################################################################### cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/phastCons369way ln -s ../../cons/all/downloads/*.wigFix.gz . 
ln -s ../../cons/all/phastCons369way.bw ./staAur2.phastCons369way.bw ln -s ../../cons/all/all.mod ./staAur2.phastCons369way.mod time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m20.354s # obtain the README.txt from staAur2/phastCons26way and update for this # situation, and edit accordingly. mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phastCons369way ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phastCons369way ##################################################################### cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/phyloP369way ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz . md5sum *.wigFix.gz > md5sum.txt ln -s ../../consPhyloP/run.phyloP/all.mod staAur2.phyloP369way.mod ln -s ../../consPhyloP/all/phyloP369way.bw staAur2.phyloP369way.bw md5sum *.mod *.bw > md5sum.txt # obtain the README.txt from staAur2/phyloP20way and update for this mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phyloP369way ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phyloP369way/ ############################################################################# # hgPal downloads (TBD - 2018-12-17 - Hiram) # FASTA from 369-way for knownGene, refGene and knownCanonical ssh hgwdev screen -S staAur2HgPal mkdir /hive/data/genomes/staAur2/bed/multiz369way/pal cd /hive/data/genomes/staAur2/bed/multiz369way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list # this for loop can take hours on a high contig count assembly # it is just fine on human/staAur2, just a few seconds export mz=multiz369way export gp=ensGene export db=staAur2 export I=0 export D=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` D=`echo $D | awk '{print $1+1}'` dNum=`echo $D | awk '{printf "%03d", int($1/3690)}'` mkdir -p exonNuc/${dNum} > /dev/null mkdir -p exonAA/${dNum} > /dev/null echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > 
exonNuc/${dNum}/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &" if [ $I -gt 16 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time (sh -x ./$gp.jobs) > $gp.jobs.log 2>&1 # real 41m45.902s export mz=multiz369way export gp=ensGene time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m26.225s time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonNuc.fa.gz # real 5m55.586s # -rw-rw-r-- 1 546125992 Dec 17 10:24 ensGene.multiz369way.exonAA.fa.gz # -rw-rw-r-- 1 1049129671 Dec 17 10:35 ensGene.multiz369way.exonNuc.fa.gz export mz=multiz369way export gp=ensGene export db=staAur2 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # wiki page for 369-way (TBD - 2017-11-06 - Hiram) mkdir /hive/users/hiram/bigWays/staAur2.369way cd /hive/users/hiram/bigWays sed -e 's/ /\n/g;' /hive/data/genomes/staAur2/bed/multiz369way/species.list \ > staAur2.369way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. 
./sizeStats.sh staAur2.369way/ordered.list # dbDb.sh constructs staAur2.369way/XenTro9_369-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list ./dbDb.sh staAur2 369way # sizeStats.pl constructs staAur2.369way/XenTro9_369-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl staAur2 369way # defCheck.pl constructs XenTro9_369-way_conservation_lastz_parameters.html ./defCheck.pl staAur2 369way # this constructs the html pages in staAur2.369way/: # -rw-rw-r-- 1 6247 May 2 17:07 XenTro9_369-way_conservation_alignment.html # -rw-rw-r-- 1 84369 May 2 17:09 XenTro9_369-way_Genome_size_statistics.html # -rw-rw-r-- 1 5033 May 2 17:10 XenTro9_369-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # XenTro9_369-way_conservation_alignment # XenTro9_369-way_Genome_size_statistics # XenTro9_369-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. ############################################################################ # pushQ readmine (TBD - 2017-11-07 - Hiram) cd /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2 find -L `pwd`/multiz369way `pwd`/phastCons369way `pwd`/phyloP369way \ /gbdb/staAur2/multiz369way -type f \ > /hive/data/genomes/staAur2/bed/multiz369way/downloads/redmine.20216.fileList wc -l /hive/data/genomes/staAur2/bed/multiz369way/downloads/redmine.20216.fileList # 1450 /hive/data/genomes/staAur2/bed/multiz369way/downloads/redmine.20216.fileList cd /hive/data/genomes/staAur2/bed/multiz369way/downloads hgsql -e 'show tables;' staAur2 | grep 369way \ | sed -e 's/^/staAur2./;' > redmine.20216.table.list ############################################################################