############################################################################# ## 135-Way Multiz (DONE - 2018-12-10 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ce11/bed/multiz135way cd /hive/data/genomes/ce11/bed/multiz135way # after much list mangling, produced this 135way file from # a combination of local alignments on ce11 and the AWS alignments # this tree was generated with the 'phylip' command 'neighbor' # on a distance matrix computed from kmer counting on each sequence # using TreeGraph2 tree editor on the Mac, rearrange to get ce11 # at the top, and attempt to get the same sequences from the 26-way # diagram to have the same relationships. This distance order table # is not important, but it does prove the tree can be parsed correctly: /cluster/bin/phast/all_dists ce11.135way.nh | grep ce11 \ | sed -e "s/ce11.//" | sort -k2n | sed -e 's/^/#\t/;' # C_sp38_MB_2015 0.425310 # caePb3 0.435730 # caeSp51 0.442450 # caeAng2 0.443390 # C_latens 0.446630 # caeRem4 0.447990 # Ancylostoma_duodenale 0.456900 # Ancylostoma_caninum 0.457600 # haeCon2 0.460600 # Teladorsagia_circumcincta 0.460680 # caeSp111 0.461500 # C_sp31_LS_2015 0.461520 # Opisthorchis_viverrini 0.462970 # Oesophagostomum_dentatum 0.463370 # Clonorchis_sinensis 0.463830 # Toxocara_canis 0.464540 # Haemonchus_contortus 0.464700 # ascSuu1 0.465120 # Dicrocoelium_dendriticum 0.466030 # Ascaris_suum 0.467180 # Hymenolepis_microstoma 0.468120 # C_sp34_TK_2017 0.468580 # Parascaris_univalens 0.468940 # C_sp26_LS_2015 0.471470 # Nippostrongylus_brasiliensis 0.471510 # ancCey1 0.472110 # Diploscapter_coronatus 0.472600 # Heligmosomoides_polygyrus_bakeri 0.473010 # Fasciola_gigantica 0.473980 # C_nigoni 0.474010 # Diploscapter_pachys 0.474220 # Pristionchus_arcanus 0.474650 # C_briggsae 0.474730 # cb4 0.474730 # C_sp40_LS_2015 0.475150 # Necator_americanus 0.475780 # necAme1 0.475780 # Spirometra_erinaceieuropaei 0.476160 # Fasciola_hepatica 0.478620 # Dictyocaulus_viviparus 0.478790 # Romanomermis_culicivorax 0.483050 # priExs1 0.485860 # Pristionchus_exspectatus 0.485880 # Oscheius_TEL_2014 0.486320 # Setaria_digitata 0.488910 # Plectus_sambesii 0.490220 # Angiostrongylus_cantonensis 0.495910 # Setaria_equina 0.496910 # Loa_loa 0.498040 # loaLoa1 0.498040 # Ditylenchus_destructor 0.498440 # Pristionchus_entomophagus 0.498840 # caeJap4 0.500630 # Onchocerca_flexuosa 0.500710 # priPac3 0.501010 # Echinococcus_granulosus 0.501290 # Onchocerca_ochengi 0.502800 # Taenia_saginata 0.503810 # Taenia_solium 0.503820 # Echinococcus_canadensis 0.504030 # Taenia_asiatica 0.504170 # Pristionchus_maxplancki 0.504190 # C_sp21_LS_2015 0.504220 # oncVol1 0.506710 # Onchocerca_volvulus 0.507130 # Pristionchus_pacificus 0.507430 # Echinococcus_multilocularis 0.508790 # C_sp39_LS_2015 0.509800 # Wuchereria_bancrofti 0.513640 # hetBac1 0.514300 # Brugia_pahangi 0.515560 # Steinernema_monticolum 0.516460 # Macrostomum_lignano 0.516950 # Schistosoma_japonicum 0.518330 # Parapristionchus_giblindavisi 0.518480 # Dirofilaria_immitis 0.518990 # dirImm1 0.518990 # Schistosoma_haematobium 0.520510 # Rhabditophanes_KR3021 0.521690 # Trichinella_papuae 0.525710 # Trichinella_nativa 0.526160 # Trichinella_nelsoni 0.526290 # Girardia_tigrina 0.527190 # Rotylenchulus_reniformis 0.527680 # Trichinella_T8 0.528740 # bruMal2 0.529200 # Schistosoma_mansoni 0.529490 # Trichinella_T9 0.530040 # Gyrodactylus_salaris 0.530400 # Trichinella_pseudospiralis 0.531400 # Trichinella_patagoniensis 0.532420 # ci3 0.533100 # Trichinella_T6 0.533520 # triSpi1 0.534550 # Trichinella_spiralis 0.534570 # Trichinella_zimbabwensis 0.534950 # Steinernema_carpocapsae 0.535530 # Trichinella_britovi 0.536140 # Brugia_malayi 0.539340 # Trichuris_trichiura 0.545340 # burXyl1 0.546570 # Bursaphelenchus_xylophilus 0.546610 # Oscheius_tipulae 0.547370 # Taenia_multiceps 0.547750 # Steinernema_glaseri 0.547770 # panRed1 0.548030 # Schmidtea_mediterranea 0.549430 # Globodera_pallida 0.550670 # Steinernema_feltiae 0.551640 # Dugesia_japonica 0.552630 # Subanguina_moxae 0.553240 # Globodera_ellingtonae 0.554660 # Steinernema_scapterisci 0.555070 # Globodera_rostochiensis 0.556060 # Meloidogyne_floridensis 0.556500 # C_sp32_LS_2015 0.559280 # triSui1 0.561340 # Meloidogyne_javanica 0.561500 # melInc2 0.563260 # Meloidogyne_incognita 0.563420 # Strongyloides_venezuelensis 0.565300 # Strongyloides_papillosus 0.565340 # Parastrongyloides_trichosuri 0.566200 # Oscheius_MCB 0.568860 # Trichuris_muris 0.569030 # Trichinella_murrelli 0.569380 # melHap1 0.571430 # Meloidogyne_arenaria 0.574360 # Heterodera_glycines 0.578900 # Meloidogyne_graminicola 0.593230 # Strongyloides_stercoralis 0.595290 # strRat2 0.598090 # Acrobeloides_nanus 0.641980 # Elaeophora_elaphi 0.704510 ######################################################################### # extract species list from that .nh file sed -e 's/:.*//; s/(//g; s/ //g;' ce11.135way.nh \ | xargs echo > species.list.txt # this sed only works on this type of ascii printout that has one # name per line. This tree is looking like: # (((((((((((((ce11:0.20748, # (C_sp38_MB_2015:0.20466, # caeAng2:0.22274):0.01317):0.0068, # caeJap4:0.28635):0.01267, # C_sp34_TK_2017:0.24163):0.00414, # (((((((C_briggsae:0.00002, # cb4:0.00002):0.16315, # C_nigoni:0.16245):0.05419, # (C_sp26_LS_2015:0.21016, # caeSp51:0.18114):0.00394):0.00419, # C_sp40_LS_2015:0.22197):0.00718, # ((C_latens:0.16492, # caeRem4:0.16628):0.03344, # caePb3:0.18746):0.00227):0.00131, # caeSp111:0.21681):0.00954, # C_sp31_LS_2015:0.22637):0.00406):0.00682, # C_sp21_LS_2015:0.26631):0.00948, # (((C_sp32_LS_2015:0.27624, # C_sp39_LS_2015:0.22676):0.02114, # (((Pristionchus_arcanus:0.16743, # ((Pristionchus_exspectatus:0.00016, # priExs1:0.00014):0.16989, # (Pristionchus_pacificus:0.11371, # priPac3:0.10729):0.07789):0.00861):0.01285, # Pristionchus_maxplancki:0.20982):0.01106, # Pristionchus_entomophagus:0.21553):0.02141):0.00924, # (((Bursaphelenchus_xylophilus:0.00007, # burXyl1:0.00003):0.27975, # Subanguina_moxae:0.28645):0.00915, # ((Oscheius_MCB:0.28777, # Oscheius_TEL_2014:0.20523):0.02181, # (((Angiostrongylus_cantonensis:0.21437, # (Necator_americanus:0.00001, # necAme1:0.00001):0.19423):0.01349, # Nippostrongylus_brasiliensis:0.20346):0.00307, # ((((((((Ancylostoma_caninum:0.1313, # Ancylostoma_duodenale:0.1306):0.02082, # ancCey1:0.16663):0.02164, # ((((Ascaris_suum:0.09443, # ascSuu1:0.09237):0.04203, # Parascaris_univalens:0.13822):0.01148, # Toxocara_canis:0.1453):0.0276, # Oesophagostomum_dentatum:0.17173):0.0078):0.00349, # ((Haemonchus_contortus:0.03815, # haeCon2:0.03405):0.14271, # Teladorsagia_circumcincta:0.17684):0.00349):0.00412, # (((Clonorchis_sinensis:0.10618, # Opisthorchis_viverrini:0.10532):0.05136, # Dicrocoelium_dendriticum:0.15974):0.01767, # ((Elaeophora_elaphi:0.36168, # Macrostomum_lignano:0.17412):0.0443, # ((Fasciola_gigantica:0.07413, # Fasciola_hepatica:0.07877):0.08767, # Spirometra_erinaceieuropaei:0.16398):0.01365):0.00991):0.01239):0.00505, # (Heligmosomoides_polygyrus_bakeri:0.19711, # (Oscheius_tipulae:0.26899, # (((Steinernema_carpocapsae:0.22328, # Steinernema_scapterisci:0.24282):0.00848, # Steinernema_feltiae:0.24787):0.00279, # Steinernema_glaseri:0.24679):0.0226):0.00248):0.00472):0.00159, # ((((Echinococcus_canadensis:0.15777, # Echinococcus_granulosus:0.15503):0.01106, # Echinococcus_multilocularis:0.17359):0.03289, # (((Taenia_asiatica:0.12893, # Taenia_saginata:0.12857):0.04182, # Taenia_multiceps:0.21433):0.0125, # Taenia_solium:0.1829):0.01861):0.03022, # (Trichuris_muris:0.27467, # (Trichuris_trichiura:0.2441, # triSui1:0.2601):0.00688):0.02227):0.0025):0.00087, # ((Plectus_sambesii:0.21658, # Steinernema_monticolum:0.24282):0.00218, # panRed1:0.27657):0.00274):0.00374):0.0057):0.00164):0.00498):0.00527):0.00553, # (Diploscapter_coronatus:0.11839, # Diploscapter_pachys:0.12001):0.10129):0.00434, # (((((((((((((Brugia_malayi:0.11227, # bruMal2:0.10213):0.05133, # Brugia_pahangi:0.13982):0.01705, # Wuchereria_bancrofti:0.15495):0.01451, # ((Dirofilaria_immitis:0.00005, # dirImm1:0.00005):0.16673, # (Onchocerca_ochengi:0.10614, # (Onchocerca_volvulus:0.00301, # oncVol1:0.00259):0.10746):0.04445):0.00803):0.01321, # (Loa_loa:0.00005, # loaLoa1:0.00005):0.16702):0.00946, # (Setaria_digitata:0.1589, # Setaria_equina:0.1669):0.0085):0.00925, # Onchocerca_flexuosa:0.18845):0.02742, # ((((Dugesia_japonica:0.18172, # Girardia_tigrina:0.15628):0.02484, # Schmidtea_mediterranea:0.20336):0.0345, # (((((Meloidogyne_arenaria:0.10438, # Meloidogyne_javanica:0.09152):0.02008, # Meloidogyne_incognita:0.11352):0.05932, # (Meloidogyne_floridensis:0.14372, # melInc2:0.15048):0.0222):0.03897, # (Meloidogyne_graminicola:0.21265, # melHap1:0.19085):0.02897):0.02436, # (Parastrongyloides_trichosuri:0.19421, # ((Strongyloides_papillosus:0.15902, # Strongyloides_venezuelensis:0.15898):0.00505, # (Strongyloides_stercoralis:0.14855, # strRat2:0.15135):0.04547):0.02928):0.04474):0.01568):0.0184, # ((Schistosoma_haematobium:0.12796, # Schistosoma_mansoni:0.13694):0.04046, # Schistosoma_japonicum:0.16624):0.05892):0.00833):0.0038, # Dictyocaulus_viviparus:0.19775):0.00547, # Hymenolepis_microstoma:0.19255):0.00226, # (Rhabditophanes_KR3021:0.24082, # (ci3:0.25005, # hetBac1:0.23125):0.00218):0.00756):0.0063, # Gyrodactylus_salaris:0.26339):0.00168, # Parapristionchus_giblindavisi:0.25315):0.00807):0.00549, # Ditylenchus_destructor:0.23569):0.00352, # (((((Trichinella_T6:0.15065, # Trichinella_patagoniensis:0.14955):0.00371, # ((Trichinella_T8:0.13865, # (Trichinella_T9:0.1349, # Trichinella_britovi:0.141):0.00505):0.00653, # Trichinella_murrelli:0.18582):0.0044):0.00421, # Trichinella_nativa:0.15121):0.01512, # (Trichinella_nelsoni:0.16399, # (Trichinella_spiralis:0.00011, # triSpi1:0.00009):0.17216):0.00247):0.03872, # ((Trichinella_papuae:0.15043, # Trichinella_zimbabwensis:0.15967):0.03714, # Trichinella_pseudospiralis:0.19326):0.01703):0.05484):0.00848, # Romanomermis_culicivorax:0.2083):0.01192, # ((((Globodera_ellingtonae:0.16265, # Globodera_rostochiensis:0.16405):0.01402, # Globodera_pallida:0.17268):0.05421, # Heterodera_glycines:0.25512):0.02294, # Rotylenchulus_reniformis:0.22684):0.01417):0.1, # Acrobeloides_nanus:0.25531):0.1:0.1; # a name translation file was made up from scanning the sequences and # databases at UCSC or files used to build UCSC browsers. Result in the # file: taxId.GC.sequenceName.accAsmId.sciName.name.tab Extract the equivalence files from that list: cut -f1,3 taxId.GC.sequenceName.accAsmId.sciName.name.tab \ | awk '{printf "%s\t%s\n", $2, $1}' > sequenceName.taxId.tab cut -f3,5 taxId.GC.sequenceName.accAsmId.sciName.name.tab \ | awk -F$'\t' '{printf "%s\t%s\n", $1, $2}' > sequenceName.sciName.tab # use those lists to make other named .nh files. For example, # the scientific names: cat ce11.135way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=sequenceName.sciName.tab -noInternal -lineOutput \ /dev/stdin > ce11.135way.sciName.nh cat ce11.135way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=sequenceName.taxId.tab -noInternal -lineOutput \ /dev/stdin > ce11.135way.taxId.nh sed -e 's/^/# /;' ce11.135way.sciName.nh # (((((((((((((Caenorhabditis_elegans:0.20748, # (Caenorhabditis_sp._38_MB-2015:0.20466, # Caenorhabditis_angaria:0.22274):0.01317):0.0068, # Caenorhabditis_japonica:0.28635):0.01267, # Caenorhabditis_sp._34_TK-2017:0.24163):0.00414, # (((((((Caenorhabditis_briggsae:0.00002, # Caenorhabditis_briggsae:0.00002):0.16315, # Caenorhabditis_nigoni:0.16245):0.05419, # (Caenorhabditis_sp._26_LS-2015:0.21016, # Caenorhabditis_sp5_ju800:0.18114):0.00394):0.00419, # Caenorhabditis_sp._40_LS-2015:0.22197):0.00718, # ((Caenorhabditis_latens:0.16492, # Caenorhabditis_remanei:0.16628):0.03344, # Caenorhabditis_brenneri:0.18746):0.00227):0.00131, # Caenorhabditis_tropicalis:0.21681):0.00954, # Caenorhabditis_sp._31_LS-2015:0.22637):0.00406):0.00682, # Caenorhabditis_sp._21_LS-2015:0.26631):0.00948, # (((Caenorhabditis_sp._32_LS-2015:0.27624, # Caenorhabditis_sp._39_LS-2015:0.22676):0.02114, # (((Pristionchus_arcanus:0.16743, # ((Pristionchus_exspectatus:0.00016, # Pristionchus_exspectatus:0.00014):0.16989, # (Pristionchus_pacificus:0.11371, # Pristionchus_pacificus:0.10729):0.07789):0.00861):0.01285, # Pristionchus_maxplancki:0.20982):0.01106, # Pristionchus_entomophagus:0.21553):0.02141):0.00924, # (((Bursaphelenchus_xylophilus:0.00007, # Bursaphelenchus_xylophilus:0.00003):0.27975, # Subanguina_moxae:0.28645):0.00915, # ((Oscheius_sp._MCB:0.28777, # Oscheius_sp._TEL-2014:0.20523):0.02181, # (((Angiostrongylus_cantonensis:0.21437, # (Necator_americanus:0.00001, # Necator_americanus:0.00001):0.19423):0.01349, # Nippostrongylus_brasiliensis:0.20346):0.00307, # ((((((((Ancylostoma_caninum:0.1313, # Ancylostoma_duodenale:0.1306):0.02082, # Ancylostoma_ceylanicum:0.16663):0.02164, # ((((Ascaris_suum:0.09443, # Ascaris_suum:0.09237):0.04203, # Parascaris_univalens:0.13822):0.01148, # Toxocara_canis:0.1453):0.0276, # Oesophagostomum_dentatum:0.17173):0.0078):0.00349, # ((Haemonchus_contortus:0.03815, # Haemonchus_contortus:0.03405):0.14271, # Teladorsagia_circumcincta:0.17684):0.00349):0.00412, # (((Clonorchis_sinensis:0.10618, # Opisthorchis_viverrini:0.10532):0.05136, # Dicrocoelium_dendriticum:0.15974):0.01767, # ((Elaeophora_elaphi:0.36168, # Macrostomum_lignano:0.17412):0.0443, # ((Fasciola_gigantica:0.07413, # Fasciola_hepatica:0.07877):0.08767, # Spirometra_erinaceieuropaei:0.16398):0.01365):0.00991):0.01239):0.00505, # (Heligmosomoides_polygyrus_bakeri:0.19711, # (Oscheius_tipulae:0.26899, # (((Steinernema_carpocapsae:0.22328, # Steinernema_scapterisci:0.24282):0.00848, # Steinernema_feltiae:0.24787):0.00279, # Steinernema_glaseri:0.24679):0.0226):0.00248):0.00472):0.00159, # ((((Echinococcus_canadensis:0.15777, # Echinococcus_granulosus:0.15503):0.01106, # Echinococcus_multilocularis:0.17359):0.03289, # (((Taenia_asiatica:0.12893, # Taenia_saginata:0.12857):0.04182, # Taenia_multiceps:0.21433):0.0125, # Taenia_solium:0.1829):0.01861):0.03022, # (Trichuris_muris:0.27467, # (Trichuris_trichiura:0.2441, # Trichuris_suis:0.2601):0.00688):0.02227):0.0025):0.00087, # ((Plectus_sambesii:0.21658, # Steinernema_monticolum:0.24282):0.00218, # Panagrellus_redivivus:0.27657):0.00274):0.00374):0.0057):0.00164):0.00498):0.00527):0.00553, # (Diploscapter_coronatus:0.11839, # Diploscapter_pachys:0.12001):0.10129):0.00434, # (((((((((((((Brugia_malayi:0.11227, # Brugia_malayi:0.10213):0.05133, # Brugia_pahangi:0.13982):0.01705, # Wuchereria_bancrofti:0.15495):0.01451, # ((Dirofilaria_immitis:0.00005, # Dirofilaria_immitis:0.00005):0.16673, # (Onchocerca_ochengi:0.10614, # (Onchocerca_volvulus:0.00301, # Onchocerca_volvulus:0.00259):0.10746):0.04445):0.00803):0.01321, # (Loa_loa:0.00005, # Loa_loa:0.00005):0.16702):0.00946, # (Setaria_digitata:0.1589, # Setaria_equina:0.1669):0.0085):0.00925, # Onchocerca_flexuosa:0.18845):0.02742, # ((((Dugesia_japonica:0.18172, # Girardia_tigrina:0.15628):0.02484, # Schmidtea_mediterranea:0.20336):0.0345, # (((((Meloidogyne_arenaria:0.10438, # Meloidogyne_javanica:0.09152):0.02008, # Meloidogyne_incognita:0.11352):0.05932, # (Meloidogyne_floridensis:0.14372, # Meloidogyne_incognita:0.15048):0.0222):0.03897, # (Meloidogyne_graminicola:0.21265, # Meloidogyne_hapla:0.19085):0.02897):0.02436, # (Parastrongyloides_trichosuri:0.19421, # ((Strongyloides_papillosus:0.15902, # Strongyloides_venezuelensis:0.15898):0.00505, # (Strongyloides_stercoralis:0.14855, # Strongyloides_ratti:0.15135):0.04547):0.02928):0.04474):0.01568):0.0184, # ((Schistosoma_haematobium:0.12796, # Schistosoma_mansoni:0.13694):0.04046, # Schistosoma_japonicum:0.16624):0.05892):0.00833):0.0038, # Dictyocaulus_viviparus:0.19775):0.00547, # Hymenolepis_microstoma:0.19255):0.00226, # (Rhabditophanes_sp._KR3021:0.24082, # (Ciona_intestinalis:0.25005, # Heterorhabditis_bacteriophora:0.23125):0.00218):0.00756):0.0063, # Gyrodactylus_salaris:0.26339):0.00168, # Parapristionchus_giblindavisi:0.25315):0.00807):0.00549, # Ditylenchus_destructor:0.23569):0.00352, # (((((Trichinella_sp._T6:0.15065, # Trichinella_patagoniensis:0.14955):0.00371, # ((Trichinella_sp._T8:0.13865, # (Trichinella_sp._T9:0.1349, # Trichinella_britovi:0.141):0.00505):0.00653, # Trichinella_murrelli:0.18582):0.0044):0.00421, # Trichinella_nativa:0.15121):0.01512, # (Trichinella_nelsoni:0.16399, # (Trichinella_spiralis:0.00011, # Trichinella_spiralis:0.00009):0.17216):0.00247):0.03872, # ((Trichinella_papuae:0.15043, # Trichinella_zimbabwensis:0.15967):0.03714, # Trichinella_pseudospiralis:0.19326):0.01703):0.05484):0.00848, # Romanomermis_culicivorax:0.2083):0.01192, # ((((Globodera_ellingtonae:0.16265, # Globodera_rostochiensis:0.16405):0.01402, # Globodera_pallida:0.17268):0.05421, # Heterodera_glycines:0.25512):0.02294, # Rotylenchulus_reniformis:0.22684):0.01417):0.1, # Acrobeloides_nanus:0.25531):0.1:0.1; # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/ce11_135way.png # all distances for sizeStats.pl below: /cluster/bin/phast/all_dists ce11.135way.nh | grep ce11 \ | sed -e "s/ce11.//" | sort -k2n > 135way.distances.txt # this is a specialized sizeStats.pl from the usual kind cat > sizeStats.pl << '_EOF_' #!/usr/bin/env perl use strict; use warnings; my %dbToName; # key is name from 135way.distances.txt, value is sciName open (FH , ") { chomp $line; my ($db, $name) = split('\s+',$line); $name =~ s/__/. /; $dbToName{$db} = $name; } close (FH); open (FH, "<135way.distances.txt") or die "can not read 135way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($D, $dist) = split('\s+', $line); my $Db = ucfirst($D); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/ce11/bed/lastz.$D/fb.ce11." . $chain . "Link.txt"; if ( $D =~ m/^G/) { $B=`ls /hive/data/genomes/ce11/bed/awsMultiz/lastzRun/lastz_$D/fb.ce11.chain.${D}*`; chomp $B; } my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $chainSynMeasure = ""; if ( $D =~ m/^G/) { $B=`ls /hive/data/genomes/ce11/bed/awsMultiz/lastzRun/lastz_$D/fb.ce11.chainSyn.${D}*`; chomp $B; } else { $B="/hive/data/genomes/ce11/bed/lastz.${D}/fb.ce11.chainSyn${Db}Link.txt"; } $chainSynMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainSynMeasure; $chainSynMeasure = 0.0 if (length($chainSynMeasure) < 1); $chainSynMeasure =~ s/\%//; my $chainRBestMeasure = ""; if ( $D =~ m/^G/) { $B=`ls /hive/data/genomes/ce11/bed/awsMultiz/lastzRun/lastz_$D/fb.ce11.chainRBest.${D}*`; chomp $B; } else { $B="/hive/data/genomes/ce11/bed/lastz.${D}/fb.ce11.chainRBest.${Db}.txt"; } $chainRBestMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainRBestMeasure; $chainRBestMeasure = 0.0 if (length($chainRBestMeasure) < 1); $chainRBestMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.ce11/fb.${D}.chainGalGal6Link.txt"; my $swapMeasure = "0"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { if (defined($dbToName{$D})) { $orgName = $dbToName{$D}; } else { $orgName="N/A"; } } ++$count; my $percentAlike = 100.0 * ($chainLinkMeasure - $chainRBestMeasure) / $chainLinkMeasure; printf "# %03d %.4f (%% %06.3f) (%% %06.3f) (%% %06.3f) %5.2f - %s %s\n", $count, $dist, $chainLinkMeasure, $chainSynMeasure, $chainRBestMeasure, $percentAlike, $orgName, $D; } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure. # the unlabeled column between rBestLink and sciName is the % ratio # of recipBest to chainLink: # 100.0 * (chainLink - rBestLink) / chainLink # N dist chainLink synLink rBestLink sciName - accession/db # # 001 0.4253 (% 43.814) (% 12.216) (% 32.785) 25.17 - Caenorhabditis_sp._38_MB-2015 C_sp38_MB_2015 # 002 0.4357 (% 59.069) (% 37.574) (% 48.812) 17.36 - C. brenneri caePb3 # 003 0.4425 (% 40.332) (% 25.464) (% 36.514) 9.47 - C. sp. 5 ju800 caeSp51 # 004 0.4434 (% 39.417) (% 06.460) (% 27.675) 29.79 - C. angaria caeAng2 # 005 0.4466 (% 57.873) (% 39.138) (% 47.416) 18.07 - Caenorhabditis_latens C_latens # 006 0.4480 (% 61.020) (% 40.374) (% 50.049) 17.98 - C. remanei caeRem4 # 007 0.4569 (% 15.004) (% 00.079) (% 12.189) 18.76 - Ancylostoma_duodenale # 008 0.4576 (% 16.527) (% 00.580) (% 13.743) 16.85 - Ancylostoma_caninum # 009 0.4606 (% 08.936) (% 00.180) (% 07.705) 13.78 - Barber pole worm haeCon2 # 010 0.4607 (% 14.484) (% 00.219) (% 11.872) 18.03 - Teladorsagia_circumcincta # 011 0.4615 (% 55.203) (% 30.009) (% 43.133) 21.86 - C. tropicalis caeSp111 # 012 0.4615 (% 41.026) (% 18.314) (% 31.414) 23.43 - Caenorhabditis_sp._31_LS-2015 C_sp31_LS_2015 # 013 0.4630 (% 03.444) (% 00.018) (% 02.414) 29.91 - Opisthorchis_viverrini # 014 0.4634 (% 14.994) (% 00.164) (% 12.019) 19.84 - Oesophagostomum_dentatum # 015 0.4638 (% 03.450) (% 00.007) (% 02.410) 30.14 - Clonorchis_sinensis # 016 0.4645 (% 11.502) (% 00.228) (% 09.476) 17.61 - Toxocara_canis # 017 0.4647 (% 14.406) (% 00.306) (% 11.734) 18.55 - Haemonchus_contortus # 018 0.4651 (% 05.722) (% 00.132) (% 04.869) 14.91 - Pig roundworm ascSuu1 # 019 0.4660 (% 00.832) (% 00.000) (% 00.495) 40.50 - Dicrocoelium_dendriticum # 020 0.4672 (% 13.175) (% 00.264) (% 10.946) 16.92 - Ascaris_suum # 021 0.4681 (% 05.601) (% 00.019) (% 04.196) 25.08 - Hymenolepis_microstoma # 022 0.4686 (% 53.670) (% 38.079) (% 44.232) 17.59 - Caenorhabditis_sp._34_TK-2017 C_sp34_TK_2017 # 023 0.4689 (% 11.720) (% 00.230) (% 09.718) 17.08 - Parascaris_univalens # 024 0.4715 (% 59.618) (% 37.200) (% 48.546) 18.57 - Caenorhabditis_sp._26_LS-2015 C_sp26_LS_2015 # 025 0.4715 (% 15.792) (% 00.540) (% 13.351) 15.46 - Nippostrongylus_brasiliensis # 026 0.4721 (% 15.964) (% 00.578) (% 13.279) 16.82 - A. ceylanicum ancCey1 # 027 0.4726 (% 24.970) (% 01.150) (% 20.772) 16.81 - Diploscapter_coronatus # 028 0.4730 (% 14.696) (% 00.369) (% 12.240) 16.71 - Heligmosomoides_polygyrus_bakeri # 029 0.4740 (% 03.502) (% 00.000) (% 02.468) 29.53 - Fasciola_gigantica # 030 0.4740 (% 62.531) (% 41.703) (% 51.026) 18.40 - Caenorhabditis_nigoni C_nigoni # 031 0.4742 (% 23.687) (% 00.814) (% 19.743) 16.65 - Diploscapter_pachys # 032 0.4747 (% 13.074) (% 00.292) (% 10.610) 18.85 - Pristionchus_arcanus # 033 0.4747 (% 60.930) (% 40.599) (% 49.316) 19.06 - Caenorhabditis_briggsae C_briggsae # 034 0.4747 (% 39.384) (% 31.995) (% 35.681) 9.40 - C. briggsae cb4 # 035 0.4752 (% 52.955) (% 36.731) (% 44.297) 16.35 - Caenorhabditis_sp._40_LS-2015 C_sp40_LS_2015 # 036 0.4758 (% 15.754) (% 00.618) (% 12.952) 17.79 - Necator_americanus # 037 0.4758 (% 08.902) (% 00.270) (% 07.682) 13.70 - N. americanus necAme1 # 038 0.4762 (% 02.894) (% 00.000) (% 02.011) 30.51 - Spirometra_erinaceieuropaei # 039 0.4786 (% 04.266) (% 00.019) (% 03.130) 26.63 - Fasciola_hepatica # 040 0.4788 (% 17.480) (% 00.631) (% 14.549) 16.77 - Dictyocaulus_viviparus # 041 0.4830 (% 08.805) (% 00.022) (% 06.772) 23.09 - Romanomermis_culicivorax # 042 0.4859 (% 06.222) (% 00.068) (% 05.303) 14.77 - P. exspectatus priExs1 # 043 0.4859 (% 12.342) (% 00.199) (% 10.056) 18.52 - Pristionchus_exspectatus # 044 0.4863 (% 15.278) (% 00.137) (% 11.986) 21.55 - Oscheius_sp._TEL-2014 Oscheius_TEL_2014 # 045 0.4889 (% 16.319) (% 00.084) (% 12.626) 22.63 - Setaria_digitata # 046 0.4902 (% 11.406) (% 00.118) (% 09.141) 19.86 - Plectus_sambesii # 047 0.4959 (% 12.624) (% 00.190) (% 10.610) 15.95 - Angiostrongylus_cantonensis # 048 0.4969 (% 14.600) (% 00.179) (% 11.254) 22.92 - Setaria_equina # 049 0.4980 (% 16.698) (% 00.276) (% 12.631) 24.36 - Loa_loa # 050 0.4980 (% 05.056) (% 00.107) (% 04.254) 15.86 - Eye worm loaLoa1 # 051 0.4984 (% 14.413) (% 00.233) (% 11.306) 21.56 - Ditylenchus_destructor # 052 0.4988 (% 12.602) (% 00.312) (% 10.270) 18.50 - Pristionchus_entomophagus # 053 0.5006 (% 45.308) (% 20.405) (% 36.074) 20.38 - C. japonica caeJap4 # 054 0.5007 (% 15.520) (% 00.269) (% 11.600) 25.26 - Onchocerca_flexuosa # 055 0.5010 (% 06.308) (% 00.187) (% 05.354) 15.12 - P. pacificus priPac3 # 056 0.5013 (% 03.234) (% 00.019) (% 02.304) 28.76 - Echinococcus_granulosus # 057 0.5028 (% 19.777) (% 00.117) (% 14.787) 25.23 - Onchocerca_ochengi # 058 0.5038 (% 03.457) (% 00.018) (% 02.482) 28.20 - Taenia_saginata # 059 0.5038 (% 03.268) (% 00.012) (% 02.310) 29.31 - Taenia_solium # 060 0.5040 (% 03.383) (% 00.019) (% 02.395) 29.20 - Echinococcus_canadensis # 061 0.5042 (% 03.396) (% 00.018) (% 02.434) 28.33 - Taenia_asiatica # 062 0.5042 (% 12.880) (% 00.310) (% 10.474) 18.68 - Pristionchus_maxplancki # 063 0.5042 (% 23.081) (% 01.821) (% 18.534) 19.70 - Caenorhabditis_sp._21_LS-2015 C_sp21_LS_2015 # 064 0.5067 (% 05.193) (% 00.151) (% 04.375) 15.75 - O. volvulus oncVol1 # 065 0.5071 (% 25.170) (% 00.609) (% 18.668) 25.83 - Onchocerca_volvulus # 066 0.5074 (% 12.987) (% 00.445) (% 10.449) 19.54 - Pristionchus_pacificus # 067 0.5088 (% 03.403) (% 00.019) (% 02.440) 28.30 - Echinococcus_multilocularis # 068 0.5098 (% 44.315) (% 17.195) (% 35.657) 19.54 - Caenorhabditis_sp._39_LS-2015 C_sp39_LS_2015 # 069 0.5136 (% 20.673) (% 00.156) (% 15.260) 26.18 - Wuchereria_bancrofti # 070 0.5143 (% 10.486) (% 00.600) (% 09.313) 11.19 - H. bacteriophora/m31e hetBac1 # 071 0.5156 (% 19.944) (% 00.441) (% 14.559) 27.00 - Brugia_pahangi # 072 0.5165 (% 12.825) (% 00.184) (% 09.853) 23.17 - Steinernema_monticolum # 073 0.5170 (% 13.379) (% 00.020) (% 10.156) 24.09 - Macrostomum_lignano # 074 0.5183 (% 07.825) (% 00.000) (% 06.305) 19.42 - Schistosoma_japonicum # 075 0.5185 (% 15.865) (% 00.344) (% 12.506) 21.17 - Parapristionchus_giblindavisi # 076 0.5190 (% 20.305) (% 00.186) (% 15.022) 26.02 - Dirofilaria_immitis # 077 0.5190 (% 05.005) (% 00.129) (% 04.188) 16.32 - Dog heartworm dirImm1 # 078 0.5205 (% 09.723) (% 00.007) (% 07.927) 18.47 - Schistosoma_haematobium # 079 0.5217 (% 18.703) (% 00.294) (% 13.109) 29.91 - Rhabditophanes_sp._KR3021 Rhabditophanes_KR3021 # 080 0.5257 (% 11.415) (% 00.065) (% 07.962) 30.25 - Trichinella_papuae # 081 0.5262 (% 10.699) (% 00.051) (% 07.442) 30.44 - Trichinella_nativa # 082 0.5263 (% 10.754) (% 00.051) (% 07.465) 30.58 - Trichinella_nelsoni # 083 0.5272 (% 37.989) (% 00.013) (% 30.954) 18.52 - Girardia_tigrina # 084 0.5277 (% 16.345) (% 00.160) (% 11.694) 28.46 - Rotylenchulus_reniformis # 085 0.5287 (% 10.902) (% 00.051) (% 07.665) 29.69 - Trichinella_sp._T8 Trichinella_T8 # 086 0.5292 (% 05.065) (% 00.102) (% 04.259) 15.91 - Filarial worm bruMal2 # 087 0.5295 (% 10.328) (% 00.018) (% 08.457) 18.12 - Schistosoma_mansoni # 088 0.5300 (% 10.885) (% 00.063) (% 07.626) 29.94 - Trichinella_sp._T9 Trichinella_T9 # 089 0.5304 (% 06.686) (% 00.020) (% 04.946) 26.02 - Gyrodactylus_salaris # 090 0.5314 (% 11.462) (% 00.066) (% 07.980) 30.38 - Trichinella_pseudospiralis # 091 0.5324 (% 10.836) (% 00.052) (% 07.561) 30.22 - Trichinella_patagoniensis # 092 0.5331 (% 02.006) (% 00.000) (% 01.536) 23.43 - C. intestinalis ci3 # 093 0.5335 (% 11.008) (% 00.066) (% 07.671) 30.31 - Trichinella_sp._T6 Trichinella_T6 # 094 0.5345 (% 02.848) (% 00.007) (% 02.239) 21.38 - Trichinella triSpi1 # 095 0.5346 (% 10.697) (% 00.052) (% 07.591) 29.04 - Trichinella_spiralis # 096 0.5350 (% 11.705) (% 00.054) (% 08.203) 29.92 - Trichinella_zimbabwensis # 097 0.5355 (% 12.640) (% 00.334) (% 09.947) 21.31 - Steinernema_carpocapsae # 098 0.5361 (% 10.944) (% 00.063) (% 07.639) 30.20 - Trichinella_britovi # 099 0.5393 (% 16.763) (% 00.185) (% 12.386) 26.11 - Brugia_malayi # 100 0.5453 (% 06.137) (% 00.064) (% 04.468) 27.20 - Trichuris_trichiura # 101 0.5466 (% 06.291) (% 00.145) (% 05.125) 18.53 - Pine wood nematode burXyl1 # 102 0.5466 (% 13.470) (% 00.169) (% 09.909) 26.44 - Bursaphelenchus_xylophilus # 103 0.5474 (% 16.438) (% 01.183) (% 13.268) 19.28 - Oscheius_tipulae # 104 0.5477 (% 03.712) (% 00.018) (% 02.756) 25.75 - Taenia_multiceps # 105 0.5478 (% 10.278) (% 00.197) (% 08.144) 20.76 - Steinernema_glaseri # 106 0.5480 (% 06.585) (% 00.136) (% 05.461) 17.07 - Microworm panRed1 # 107 0.5494 (% 39.093) (% 00.095) (% 31.402) 19.67 - Schmidtea_mediterranea # 108 0.5507 (% 10.269) (% 00.081) (% 07.827) 23.78 - Globodera_pallida # 109 0.5516 (% 11.885) (% 00.259) (% 09.311) 21.66 - Steinernema_feltiae # 110 0.5526 (% 17.785) (% 00.000) (% 14.029) 21.12 - Dugesia_japonica # 111 0.5532 (% 11.549) (% 00.085) (% 08.872) 23.18 - Subanguina_moxae # 112 0.5547 (% 11.318) (% 00.138) (% 08.695) 23.18 - Globodera_ellingtonae # 113 0.5551 (% 11.987) (% 00.319) (% 09.412) 21.48 - Steinernema_scapterisci # 114 0.5561 (% 10.968) (% 00.132) (% 08.386) 23.54 - Globodera_rostochiensis # 115 0.5565 (% 12.683) (% 00.000) (% 08.999) 29.05 - Meloidogyne_floridensis # 116 0.5593 (% 38.533) (% 24.831) (% 31.789) 17.50 - Caenorhabditis_sp._32_LS-2015 C_sp32_LS_2015 # 117 0.5613 (% 02.917) (% 00.036) (% 02.285) 21.67 - Whipworm triSui1 # 118 0.5615 (% 26.293) (% 00.097) (% 19.830) 24.58 - Meloidogyne_javanica # 119 0.5633 (% 03.516) (% 00.090) (% 02.816) 19.91 - M. incognita melInc2 # 120 0.5634 (% 26.760) (% 00.186) (% 19.868) 25.75 - Meloidogyne_incognita # 121 0.5653 (% 31.839) (% 00.889) (% 19.485) 38.80 - Strongyloides_venezuelensis # 122 0.5653 (% 31.051) (% 00.685) (% 19.457) 37.34 - Strongyloides_papillosus # 123 0.5662 (% 28.310) (% 00.661) (% 17.250) 39.07 - Parastrongyloides_trichosuri # 124 0.5689 (% 06.427) (% 00.000) (% 04.011) 37.59 - Oscheius_sp._MCB Oscheius_MCB # 125 0.5690 (% 05.651) (% 00.066) (% 04.144) 26.67 - Trichuris_muris # 126 0.5694 (% 11.930) (% 00.052) (% 08.389) 29.68 - Trichinella_murrelli # 127 0.5714 (% 03.916) (% 00.074) (% 03.186) 18.64 - M. hapla melHap1 # 128 0.5744 (% 31.303) (% 00.285) (% 23.952) 23.48 - Meloidogyne_arenaria # 129 0.5789 (% 07.254) (% 00.013) (% 05.201) 28.30 - Heterodera_glycines # 130 0.5932 (% 17.004) (% 00.081) (% 10.827) 36.33 - Meloidogyne_graminicola # 131 0.5953 (% 35.984) (% 01.356) (% 20.257) 43.71 - Strongyloides_stercoralis # 132 0.5981 (% 36.164) (% 01.850) (% 20.838) 42.38 - Threadworm strRat2 # 133 0.6420 (% 23.468) (% 00.240) (% 18.204) 22.43 - Acrobeloides_nanus # 134 0.7045 (% 00.158) (% 00.000) (% 00.090) 43.04 - Elaeophora_elaphi # None of this concern for distances matters in building the first step, the # maf files. The distances will be better calibrated later. # create species list and stripped down tree for autoMZ sed -e 's/,//g; s/:[0-9]\+.[0-9]\+//g; s/;//g;' ce11.135way.nh \ | xargs echo > tree.nh cat tree.nh | fold -s -w 78 | sed -e 's/^/# /;' # (((((((((((((ce11 (C_sp38_MB_2015 caeAng2)) caeJap4) C_sp34_TK_2017) # (((((((C_briggsae cb4) C_nigoni) (C_sp26_LS_2015 caeSp51)) C_sp40_LS_2015) # ((C_latens caeRem4) caePb3)) caeSp111) C_sp31_LS_2015)) C_sp21_LS_2015) # (((C_sp32_LS_2015 C_sp39_LS_2015) (((Pristionchus_arcanus # ((Pristionchus_exspectatus priExs1) (Pristionchus_pacificus priPac3))) # Pristionchus_maxplancki) Pristionchus_entomophagus)) # (((Bursaphelenchus_xylophilus burXyl1) Subanguina_moxae) ((Oscheius_MCB # Oscheius_TEL_2014) (((Angiostrongylus_cantonensis (Necator_americanus # necAme1)) Nippostrongylus_brasiliensis) ((((((((Ancylostoma_caninum # Ancylostoma_duodenale) ancCey1) ((((Ascaris_suum ascSuu1) # Parascaris_univalens) Toxocara_canis) Oesophagostomum_dentatum)) # ((Haemonchus_contortus haeCon2) Teladorsagia_circumcincta)) # (((Clonorchis_sinensis Opisthorchis_viverrini) Dicrocoelium_dendriticum) # ((Elaeophora_elaphi Macrostomum_lignano) ((Fasciola_gigantica # Fasciola_hepatica) Spirometra_erinaceieuropaei)))) # (Heligmosomoides_polygyrus_bakeri (Oscheius_tipulae # (((Steinernema_carpocapsae Steinernema_scapterisci) Steinernema_feltiae) # Steinernema_glaseri)))) ((((Echinococcus_canadensis Echinococcus_granulosus) # Echinococcus_multilocularis) (((Taenia_asiatica Taenia_saginata) # Taenia_multiceps) Taenia_solium)) (Trichuris_muris (Trichuris_trichiura # triSui1)))) ((Plectus_sambesii Steinernema_monticolum) panRed1))))))) # (Diploscapter_coronatus Diploscapter_pachys)) (((((((((((((Brugia_malayi # bruMal2) Brugia_pahangi) Wuchereria_bancrofti) ((Dirofilaria_immitis dirImm1) # (Onchocerca_ochengi (Onchocerca_volvulus oncVol1)))) (Loa_loa loaLoa1)) # (Setaria_digitata Setaria_equina)) Onchocerca_flexuosa) ((((Dugesia_japonica # Girardia_tigrina) Schmidtea_mediterranea) (((((Meloidogyne_arenaria # Meloidogyne_javanica) Meloidogyne_incognita) (Meloidogyne_floridensis # melInc2)) (Meloidogyne_graminicola melHap1)) (Parastrongyloides_trichosuri # ((Strongyloides_papillosus Strongyloides_venezuelensis) # (Strongyloides_stercoralis strRat2))))) ((Schistosoma_haematobium # Schistosoma_mansoni) Schistosoma_japonicum))) Dictyocaulus_viviparus) # Hymenolepis_microstoma) (Rhabditophanes_KR3021 (ci3 hetBac1))) # Gyrodactylus_salaris) Parapristionchus_giblindavisi)) Ditylenchus_destructor) # (((((Trichinella_T6 Trichinella_patagoniensis) ((Trichinella_T8 # (Trichinella_T9 Trichinella_britovi)) Trichinella_murrelli)) # Trichinella_nativa) (Trichinella_nelsoni (Trichinella_spiralis triSpi1))) # ((Trichinella_papuae Trichinella_zimbabwensis) Trichinella_pseudospiralis))) # Romanomermis_culicivorax) ((((Globodera_ellingtonae Globodera_rostochiensis) # Globodera_pallida) Heterodera_glycines) Rotylenchulus_reniformis)) # Acrobeloides_nanus) sed -e 's/:.*//; s/(//g; s/ *//;' ce11.135way.nh \ | xargs echo > species.list cat species.list | fold -s -w 78 | sed -e 's/^/# /;' # ce11 C_sp38_MB_2015 caeAng2 caeJap4 C_sp34_TK_2017 C_briggsae cb4 C_nigoni # C_sp26_LS_2015 caeSp51 C_sp40_LS_2015 C_latens caeRem4 caePb3 caeSp111 # C_sp31_LS_2015 C_sp21_LS_2015 C_sp32_LS_2015 C_sp39_LS_2015 # Pristionchus_arcanus Pristionchus_exspectatus priExs1 Pristionchus_pacificus # priPac3 Pristionchus_maxplancki Pristionchus_entomophagus # Bursaphelenchus_xylophilus burXyl1 Subanguina_moxae Oscheius_MCB # Oscheius_TEL_2014 Angiostrongylus_cantonensis Necator_americanus necAme1 # Nippostrongylus_brasiliensis Ancylostoma_caninum Ancylostoma_duodenale # ancCey1 Ascaris_suum ascSuu1 Parascaris_univalens Toxocara_canis # Oesophagostomum_dentatum Haemonchus_contortus haeCon2 # Teladorsagia_circumcincta Clonorchis_sinensis Opisthorchis_viverrini # Dicrocoelium_dendriticum Elaeophora_elaphi Macrostomum_lignano # Fasciola_gigantica Fasciola_hepatica Spirometra_erinaceieuropaei # Heligmosomoides_polygyrus_bakeri Oscheius_tipulae Steinernema_carpocapsae # Steinernema_scapterisci Steinernema_feltiae Steinernema_glaseri # Echinococcus_canadensis Echinococcus_granulosus Echinococcus_multilocularis # Taenia_asiatica Taenia_saginata Taenia_multiceps Taenia_solium # Trichuris_muris Trichuris_trichiura triSui1 Plectus_sambesii # Steinernema_monticolum panRed1 Diploscapter_coronatus Diploscapter_pachys # Brugia_malayi bruMal2 Brugia_pahangi Wuchereria_bancrofti Dirofilaria_immitis # dirImm1 Onchocerca_ochengi Onchocerca_volvulus oncVol1 Loa_loa loaLoa1 # Setaria_digitata Setaria_equina Onchocerca_flexuosa Dugesia_japonica # Girardia_tigrina Schmidtea_mediterranea Meloidogyne_arenaria # Meloidogyne_javanica Meloidogyne_incognita Meloidogyne_floridensis melInc2 # Meloidogyne_graminicola melHap1 Parastrongyloides_trichosuri # Strongyloides_papillosus Strongyloides_venezuelensis # Strongyloides_stercoralis strRat2 Schistosoma_haematobium Schistosoma_mansoni # Schistosoma_japonicum Dictyocaulus_viviparus Hymenolepis_microstoma # Rhabditophanes_KR3021 ci3 hetBac1 Gyrodactylus_salaris # Parapristionchus_giblindavisi Ditylenchus_destructor Trichinella_T6 # Trichinella_patagoniensis Trichinella_T8 Trichinella_T9 Trichinella_britovi # Trichinella_murrelli Trichinella_nativa Trichinella_nelsoni # Trichinella_spiralis triSpi1 Trichinella_papuae Trichinella_zimbabwensis # Trichinella_pseudospiralis Romanomermis_culicivorax Globodera_ellingtonae # Globodera_rostochiensis Globodera_pallida Heterodera_glycines # Rotylenchulus_reniformis Acrobeloides_nanus # take an N50 survery to see if the quality of these genomes makes a mark mkdir /hive/data/genomes/ce11/bed/multiz135way/n50Survey cd /hive/data/genomes/ce11/bed/multiz135way/n50Survey ln -s ln -s ../taxId.GC.sequenceName.accAsmId.sciName.name.tab . sed -e 's/ /\n/g;' ../species.list | grep -v ce11 | while read db do printf "# working: %s\n" "${db}" case "${db}" in [a-z][a-zA-Z]*) chromSizes="/hive/data/genomes/$db/chrom.sizes" mkdir -p "${db}" n50.pl "${chromSizes}" > "${db}/${db}.n50.txt" 2>&1 ;; *) asmAccId=`grep -w "${db}" taxId.GC.sequenceName.accAsmId.sciName.name.tab | cut -f4` chromSizes=/hive/data/genomes/ce11/bed/awsMultiz/chromSizes/${asmAccId}.chrom.sizes echo mkdir -p "${db}" mkdir -p "${db}" n50.pl "${chromSizes}" > "${db}/${db}.n50.txt" 2>&1 ;; esac echo "# $chromSizes" 1>&2 done cat > summary.pl << '_EOF_' #!/usr/bin/env perl use strict; use warnings; my $id = shift; my $file = "$id/${id}.n50.txt"; my $totalSize = 0; my $contigCount = 0; my $lastLine = ""; open (FH, "<$file") or die "can not read $file"; while (my $line = ) { chomp $line; if ($line =~ m/contig count:/) { (undef, undef, undef, $contigCount, undef, undef, $totalSize, undef) = split('\s+', $line, 8); $contigCount =~ s/,//g; $totalSize =~ s/,//g; } $lastLine = $line; } close (FH); my (undef, $n50Count, $chrName, $n50Size) = split('\s+', $lastLine); my $perCentN = 100.0 * $n50Count / $contigCount; my $perCentSize = 100.0 * $n50Size / $totalSize; printf "%d\t%d\t%d\t%d\t%.2f\t%.2f\t%s.%s\n", $totalSize, $contigCount, $n50Count, $n50Size, $perCentN, $perCentSize, $id, $chrName; '_EOF_' # << happy emacs chmod +x summary.pl ls | while read D do if [ -d "${D}" ]; then ./summary.pl "${D}" fi done > allSummary.tab cat allSummary.tab | sed -e 's/^/# /;' # 248054631 30759 3645 19614 11.85 0.01 Acrobeloides_nanus.OZJO01010157v1 # 465734117 25335 379 256738 1.50 0.06 Ancylostoma_caninum.JOJR01000443v1 # 332875813 69981 8129 10138 11.62 0.00 Ancylostoma_duodenale.KN735649v1 # 260768283 17280 1807 42141 10.46 0.02 Angiostrongylus_cantonensis.JRZF01007231v1 # 298028455 415 21 4646302 5.06 1.56 Ascaris_suum.AEUI03000021v1 # 93659149 24286 226 41308 0.93 0.04 Brugia_malayi.NW_001893204v1 # 83528896 15835 141 156844 0.89 0.19 Brugia_pahangi.JRWH01000313v1 # 73085728 10432 1065 18150 10.21 0.02 Bursaphelenchus_xylophilus.CADV01006909v1 # 108384165 367 3 17485439 0.82 16.13 C_briggsae.FR847118v2 # 117124750 1858 83 366678 4.47 0.31 C_latens.NIPN01000084v1 # 129488540 155 3 20390332 1.94 15.75 C_nigoni.CM008512v1 # 93737141 5719 639 44405 11.17 0.05 C_sp21_LS_2015.UNPB01000020v1 # 101089037 3128 328 91328 10.49 0.09 C_sp26_LS_2015.UNPC01001673v1 # 104046868 3222 145 177024 4.50 0.17 C_sp31_LS_2015.UNPD01000261v1 # 65098870 2044 139 136669 6.80 0.21 C_sp32_LS_2015.UNPH01000935v1 # 122994328 6 3 20594552 50.00 16.74 C_sp34_TK_2017.AP018151v1 # 100371645 4890 184 139430 3.76 0.14 C_sp38_MB_2015.UNPQ01000282v1 # 91416478 21203 1253 15123 5.91 0.02 C_sp39_LS_2015.UNPJ01000616v1 # 101228815 3276 129 224519 3.94 0.22 C_sp40_LS_2015.UNPG01000052v1 # 547288241 4348 408 417486 9.38 0.08 Clonorchis_sinensis.DF143171v1 # 547921014 478538 143342 1217 29.95 0.00 Dicrocoelium_dendriticum.LK536351v1 # 160963781 7157 209 225748 2.92 0.14 Dictyocaulus_viviparus.KN716344v1 # 170492099 520 51 1007652 9.81 0.59 Diploscapter_coronatus.DF939061v1 # 157719873 10775 386 124241 3.58 0.08 Diploscapter_pachys.LIAE01010435v1 # 84888114 25560 1296 15147 5.07 0.02 Dirofilaria_immitis.CAWD010002093v1 # 111138200 1761 47 561030 2.67 0.50 Ditylenchus_destructor.LSTP01000047v1 # 854187087 648594 127635 1740 19.68 0.00 Dugesia_japonica.MQRL01621902v1 # 115024622 9330 395 74555 4.23 0.06 Echinococcus_canadensis.CVKT01006254v1 # 110837706 957 39 712683 4.08 0.64 Echinococcus_granulosus.APAU02000039v1 # 114963228 1217 4 13762452 0.33 11.97 Echinococcus_multilocularis.LN902844v1 # 1480825 1 1 1480825 100.00 100.00 Elaeophora_elaphi.HG738131v1 # 1273710336 167286 24990 15613 14.94 0.00 Fasciola_gigantica.MKHB01164695v1 # 1203652780 2816 196 1903231 6.96 0.16 Fasciola_hepatica.LT997902v1 # 1428149015 255254 26441 12094 10.36 0.00 Girardia_tigrina.MQRK01194996v1 # 105964814 2246 85 327189 3.78 0.31 Globodera_ellingtonae.MEIZ01000083v1 # 123625196 6873 296 120481 4.31 0.10 Globodera_pallida.HG820627v1 # 95876286 4281 278 88688 6.49 0.09 Globodera_rostochiensis.FKKZ01000267v1 # 67380660 6075 1058 18390 17.42 0.03 Gyrodactylus_salaris.KL566788v1 # 319757902 12915 876 99131 6.78 0.03 Haemonchus_contortus.HF967872v1 # 696954138 23647 1131 179575 4.78 0.03 Heligmosomoides_polygyrus_bakeri.FMJQ01001131v1 # 81908042 34708 5786 3715 16.67 0.00 Heterodera_glycines.DS577450v1 # 182136974 3643 6 7673820 0.16 4.21 Hymenolepis_microstoma.LN906334v1 # 91365832 5764 130 174388 2.26 0.19 Loa_loa.NW_018108310v1 # 764410778 5269 845 246204 16.04 0.03 Macrostomum_lignano.NIVC01001472v1 # 284032373 2224 391 204551 17.58 0.07 Meloidogyne_arenaria.QEUI01002087v1 # 96673063 58696 6957 3698 11.85 0.00 Meloidogyne_floridensis.CCDZ01006957v1 # 38184958 4304 522 20482 12.13 0.05 Meloidogyne_graminicola.NXFT01003462v1 # 183531997 12091 1209 38588 10.00 0.02 Meloidogyne_incognita.FXSY01001209v1 # 235798407 31341 6741 10388 21.51 0.00 Meloidogyne_javanica.CEWN01006741v1 # 244075060 11864 284 211861 2.39 0.09 Necator_americanus.NW_013560381v1 # 347480154 3578 414 209980 11.57 0.06 Nippostrongylus_brasiliensis.FZSS01002572v1 # 443017110 64255 3095 26462 4.82 0.01 Oesophagostomum_dentatum.KN551766v1 # 67740367 1604 27 540294 1.68 0.80 Onchocerca_flexuosa.KZ270009v1 # 91660559 20243 1317 16199 6.51 0.02 Onchocerca_ochengi.FJNM01001317v1 # 96340582 674 2 25485961 0.30 26.45 Onchocerca_volvulus.HG738138v1 # 620452578 42216 138 1349843 0.33 0.22 Opisthorchis_viverrini.KL596756v1 # 38321349 11377 2983 4474 26.22 0.01 Oscheius_MCB.JYHL01000305v1 # 117474248 73786 10216 2785 13.85 0.00 Oscheius_TEL_2014.LNBV01001323v1 # 59468623 191 15 1203411 7.85 2.02 Oscheius_tipulae.FXWY01000015v1 # 200833897 7303 504 112408 6.90 0.06 Parapristionchus_giblindavisi.LS524347v1 # 242743000 484 41 1878535 8.47 0.77 Parascaris_univalens.NINM01000041v1 # 42486966 1810 12 836942 0.66 1.97 Parastrongyloides_trichosuri.LM523169v1 # 186672612 18975 2318 23450 12.22 0.01 Plectus_sambesii.NIUQ01002266v1 # 202757162 4263 221 270727 5.18 0.13 Pristionchus_arcanus.LS486755v1 # 264161588 72721 209 368542 0.29 0.14 Pristionchus_entomophagus.LS706761v1 # 177635997 4412 343 142227 7.77 0.08 Pristionchus_exspectatus.LS397036v1 # 265592641 69506 219 308886 0.32 0.12 Pristionchus_maxplancki.LS590559v1 # 154958356 134 12 4619064 8.96 2.98 Pristionchus_pacificus.ABKE03000122v1 # 47267908 471 22 537195 4.67 1.14 Rhabditophanes_KR3021.LK995567v1 # 322765759 62537 4624 17632 7.39 0.01 Romanomermis_culicivorax.CAQS01004624v1 # 313575666 100525 2916 22705 2.90 0.01 Rotylenchulus_reniformis.LDKF01002902v1 # 375894156 29834 350 317484 1.17 0.08 Schistosoma_haematobium.KL250836v1 # 402743189 25048 521 176869 2.08 0.04 Schistosoma_japonicum.FN331375v1 # 364538298 885 4 32115376 0.45 8.81 Schistosoma_mansoni.HE601627v1 # 942041730 1990 82 2910179 4.12 0.31 Schmidtea_mediterranea.NNSW01000082v1 # 89886140 8974 882 24961 9.83 0.03 Setaria_digitata.FKYE01000882v1 # 78096315 17604 680 25640 3.86 0.03 Setaria_equina.PQVD01001835v1 # 1258717181 482396 73893 4636 15.32 0.00 Spirometra_erinaceieuropaei.LN073885v1 # 85934671 1562 74 300606 4.74 0.35 Steinernema_carpocapsae.KN151542v1 # 82504363 5834 369 47568 6.32 0.06 Steinernema_feltiae.KN163705v1 # 92836338 7510 692 37444 9.21 0.04 Steinernema_glaseri.KN171528v1 # 89158954 14252 2223 11563 15.60 0.01 Steinernema_monticolum.KI650532v1 # 79651268 2864 202 91123 7.05 0.11 Steinernema_scapterisci.KN167822v1 # 60447969 4703 129 86359 2.74 0.14 Strongyloides_papillosus.LM525683v1 # 42674647 800 16 431128 2.00 1.01 Strongyloides_stercoralis.LL999063v1 # 52178999 587 16 715404 2.73 1.37 Strongyloides_venezuelensis.LM524983v1 # 90131014 15010 1692 14601 11.27 0.02 Subanguina_moxae.BBSZ01010103v1 # 168679183 6900 97 342420 1.41 0.20 Taenia_asiatica.LWMJ02000097v1 # 301477105 738 3 44815576 0.41 14.87 Taenia_multiceps.CM010312v1 # 169104283 3626 65 586235 1.79 0.35 Taenia_saginata.LWMK02000065v1 # 129810970 13131 192 165960 1.46 0.13 Taenia_solium.MIEF01010054v1 # 700607159 81730 3153 47089 3.86 0.01 Teladorsagia_circumcincta.KZ348361v1 # 317115901 22857 239 375067 1.05 0.12 Toxocara_canis.JPKZ01000338v1 # 50853738 5951 90 158103 1.51 0.31 Trichinella_T6.JYDK01000090v1 # 49331761 4124 61 239129 1.48 0.48 Trichinella_T8.JYDM01000061v1 # 49075249 5752 63 212690 1.10 0.43 Trichinella_T9.JYDN01000064v1 # 51504795 7991 100 147150 1.25 0.29 Trichinella_britovi.JYDI01000100v1 # 64723107 503 2 17567319 0.40 27.14 Trichinella_murrelli.CM008134v1 # 47933238 4443 108 141195 2.43 0.29 Trichinella_nativa.JYDW01000108v1 # 47436005 2969 50 293867 1.68 0.62 Trichinella_nelsoni.JYDL01000050v1 # 46844510 2542 63 222396 2.48 0.47 Trichinella_papuae.JYDO01000063v1 # 49773209 6623 90 154103 1.36 0.31 Trichinella_patagoniensis.JYDQ01000090v1 # 48106863 7462 130 112255 1.74 0.23 Trichinella_pseudospiralis.JYDS01000130v1 # 63525422 6863 4 6373445 0.06 10.03 Trichinella_spiralis.NW_003526945v1 # 50795114 10839 67 205645 0.62 0.40 Trichinella_zimbabwensis.JYDP01000067v1 # 84405302 1683 59 399802 3.51 0.47 Trichuris_muris.HG803612v1 # 75496394 4156 265 70602 6.38 0.09 Trichuris_trichiura.HG806073v1 # 90325107 5105 351 56670 6.88 0.06 Wuchereria_bancrofti.LAQH01004755v1 # 313092710 1736 108 668412 6.22 0.21 ancCey1.JARK01001444v1 # 269573965 12989 175 413062 1.35 0.15 ascSuu1.JH879081v1 # 95156665 9781 60 200860 0.61 0.21 bruMal2.Bmal_v3_scaffold59 # 74576239 5528 22 949830 0.40 1.27 burXyl1.scaffold01198 # 105997628 34621 354 79858 1.02 0.08 caeAng2.Cang_2012_03_13_00354 # 166256191 18817 429 94149 2.28 0.06 caeJap4.Scaffold17868 # 190369721 3305 120 381961 3.63 0.20 caePb3.Scfld02_161 # 145442736 3670 70 435512 1.91 0.30 caeRem4.Crem_Contig70 # 79321433 665 2 20921866 0.30 26.38 caeSp111.Scaffold630 # 131797386 15261 1291 25228 8.46 0.02 caeSp51.Csp5_scaffold_01291 # 108434085 13 3 17485439 23.08 16.13 cb4.chrIV # 115227500 1272 9 5152901 0.71 4.47 ci3.chr11 # 88323343 16062 219 71281 1.36 0.08 dirImm1.nDi_2_2_scaf00219 # 369720058 23823 1150 83299 4.83 0.02 haeCon2.scaffold_1150 # 76992477 1241 57 312328 4.59 0.41 hetBac1.GL996455v1 # 91379422 5765 130 174388 2.25 0.19 loaLoa1.JH712195v1 # 53017507 3452 372 37608 10.78 0.07 melHap1.MhA1_Contig1371 # 86079534 2996 370 62516 12.35 0.07 melInc2.MiV1ctg368 # 244088665 11865 284 211861 2.39 0.09 necAme1.KI659553v1 # 97402144 710 2 25485961 0.28 26.17 oncVol1.HG738138v1 # 65110236 941 64 262414 6.80 0.40 panRed1.KB455552 # 177536987 4369 342 142245 7.83 0.08 priExs1.scaffold305 # 172510819 18084 39 1244534 0.22 0.72 priPac3.Ppa_Contig47 # 43150242 135 2 11693564 1.48 27.10 strRat2.chr1 # 63542128 6864 4 6373445 0.06 10.03 triSpi1.GL622789v1 # 74248995 4293 44 503034 1.02 0.68 triSui1.KL363225v1 # totalSize contigCount n50count n50size percentN percentSize db.n50Name # the usual rules for selection: # 1. when n50Size over 500,000 and percentN under 5 -> syntenic net # 2. otherwise, use chainNet # are woefully inadequate for this purpose. Hardly any of them # have any syntenic net coverage. awk -F$'\t' '$4 > 500000' allSummary.tab | awk '$5 < 5' \ | cut -f7 | awk -F'.' '{print $1}' > syntenicList.txt awk -F$'\t' '$4 <= 500000 || $5 >= 5' allSummary.tab \ | cut -f7 | awk -F'.' '{print $1}' > chainNetList.txt awk '{printf "%s\t%s\t%s\n", $NF,$5,$7}' ../sizeStats.txt \ | sed -e 's/)//g;' | sort -k3,3nr | sort \ | join -t$'\t' - <(sort syntenicList.txt) | sed -e 's/^/# /;' # sequence name chainNet syntenicNet # C_briggsae 60.930 40.599 # C_nigoni 62.531 41.703 # Ditylenchus_destructor 14.413 00.233 # Echinococcus_granulosus 03.234 00.019 # Echinococcus_multilocularis 03.403 00.019 # Hymenolepis_microstoma 05.601 00.019 # Onchocerca_flexuosa 15.520 00.269 # Onchocerca_volvulus 25.170 00.609 # Opisthorchis_viverrini 03.444 00.018 # Parastrongyloides_trichosuri 28.310 00.661 # Rhabditophanes_KR3021 18.703 00.294 # Schistosoma_mansoni 10.328 00.018 # Schmidtea_mediterranea 39.093 00.095 # Strongyloides_venezuelensis 31.839 00.889 # Taenia_multiceps 03.712 00.018 # Taenia_saginata 03.457 00.018 # Trichinella_murrelli 11.930 00.052 # Trichinella_spiralis 10.697 00.052 # burXyl1 06.291 00.145 # caeSp111 55.203 30.009 # ci3 02.006 00.000 # oncVol1 05.193 00.151 # priPac3 06.308 00.187 # strRat2 36.164 01.850 # triSpi1 02.848 00.007 # triSui1 02.917 00.036 wc -l syntenicList.txt chainNetList.txt # 26 syntenicList.txt # 108 chainNetList.txt # 134 total # bash shell syntax here ... cd /hive/data/genomes/ce11/bed/multiz135way mkdir mafLinks export H="/hive/data/genomes/ce11/bed" export TOP="/hive/data/genomes/ce11/bed/multiz135way" sed -e 's/ce11 //;' species.list | sed -e 's/ /\n/g' | while read specName do cd "${TOP}" case "${specName}" in [A-Z]*) GC=`grep -w "${specName}" taxId.GC.sequenceName.accAsmId.sciName.name.tab \ | cut -f2 | sed -e 's/v/./;'` accAsmId=`grep -w "${specName}" taxId.GC.sequenceName.accAsmId.sciName.name.tab \ | cut -f4` mafNet="../awsMultiz/lastzRun/lastz_${GC}/mafNet" echo "$specName" 1>&2 mkdir -p "mafLinks/${specName}" cd mafLinks/${specName} for M in ../../$mafNet/*.maf.gz do B=`basename $M` zcat "${M}" | sed -e "s/$accAsmId/$specName/;" | gzip -c > ${B} done ;; *) mafNet="../lastz.$specName/mafNet" mkdir -p "mafLinks/${specName}" cd mafLinks/${specName} echo "ln -s ../../$mafNet/*.maf.gz ." 1>&2 ln -s ../../$mafNet/*.maf.gz . ;; esac done # real 16m37.247s # verify the symLinks are good: find ./mafLinks -type f | sed -e 's#.*/##;' | sort | uniq -c # 108 chrI.maf.gz # 108 chrII.maf.gz # 108 chrIII.maf.gz # 108 chrIV.maf.gz # 101 chrM.maf.gz # 108 chrV.maf.gz # it appears some of them did not have any matching sequence to chrM find ./mafLinks | grep chrM \ | sed -e 's#./mafLinks/##; s#/chrM.maf.gz##;' | sort > links.with.chrM.list find ./mafLinks | grep chrIII \ | sed -e 's#./mafLinks/##; s#/chrIII.maf.gz##;' | sort > all.links.list comm -13 links.with.chrM.list all.links.list | sed -e 's/^/# /;' # C_nigoni # Elaeophora_elaphi # Gyrodactylus_salaris # Oscheius_tipulae # Pristionchus_pacificus # Taenia_asiatica # Trichinella_spiralis # even though there are 26 of these, they can still be done as # full chromosomes without the splitting procedure mkdir /hive/data/genomes/ce11/bed/multiz135way/fullChromRun cd /hive/data/genomes/ce11/bed/multiz135way/fullChromRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn find ../../mafLinks -type l | grep ".maf.gz$" | xargs -L 1 basename \ | sed -e 's/.gz//;' | sort -u > maf.list wc -l maf.list # 7 maf.list # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = ce11 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/ce11/bed/multiz135way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh printf '#LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/ce11/bed/multiz135way/fullChromRun/maf/$(root1).maf} #ENDLOOP ' > template ssh ku cd /hive/data/genomes/ce11/bed/multiz135way/fullChromRun/run gensub2 maf.list single template jobList para -ram=64g create jobList # Completed: 7 of 7 jobs # CPU time in finished jobs: 295337s 4922.28m 82.04h 3.42d 0.009 y # IO & Wait Time: 226s 3.77m 0.06h 0.00d 0.000 y # Average job time: 42223s 703.72m 11.73h 0.49d # Longest finished job: 58035s 967.25m 16.12h 0.67d # Submission to last job: 58046s 967.43m 16.12h 0.67d cd /hive/data/genomes/ce11/bed/multiz135way du -hsc fullChromRun/maf # 20G fullChromRun/maf # Load into database ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz135way/fullChromRun/maf mkdir /gbdb/ce11/multiz135way ln -s `pwd`/*.maf /gbdb/ce11/multiz135way cd /dev/shm time hgLoadMaf ce11 multiz135way # Loaded 6396612 mafs in 7 files from /gbdb/ce11/multiz135way # real 2m35.511s time cat /gbdb/ce11/multiz135way/*.maf \ | hgLoadMafSummary ce11 -minSize=10000 -verbose=2 \ -mergeGap=500 -maxSize=50000 multiz135waySummary stdin # Created 1897571 summary blocks from 214521234 components and 6396612 mafs from stdin # real 5m27.726s # -rw-rw-r-- 1 312973610 Dec 10 09:06 multiz135way.tab # -rw-rw-r-- 1 103769870 Dec 10 09:15 multiz135waySummary.tab wc -l multiz135way* # 6396612 multiz135way.tab # 1897571 multiz135waySummary.tab rm multiz135way*.tab ############################################################################## # GAP ANNOTATE MULTIZ135WAY MAF AND LOAD TABLES (DONE - 2018-12-10 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/ce11/bed/multiz135way/anno cd /hive/data/genomes/ce11/bed/multiz135way/anno ln -s ../../awsMultiz/nameCorrespond/noDot.Names.tab . rm -fr nBedsDir sizesDir nBeds sizes mkdir nBedsDir sizesDir # the UCSC local databases can be done normally cat ../species.list | tr ' ' '\n' | grep "^[a-z]" | while read D do printf "%s\n" "${D}" ls -og /hive/data/genomes/${D}/${D}.N.bed /hive/data/genomes/${D}/chrom.sizes ln -s /hive/data/genomes/${D}/${D}.N.bed nBedsDir/${D}.bed echo "${D}.bed" >> nBeds ln -s /hive/data/genomes/${D}/chrom.sizes sizesDir/${D}.len echo "${D}.len" >> sizes done cat ../species.list | tr ' ' '\n' | grep -v "^[a-z]" | while read S do printf "%s\n" "${S}" accAsmId=`grep -w "${S}" \ ../taxId.GC.sequenceName.accAsmId.sciName.name.tab | cut -f4` # ls -og ../../awsMultiz/twoBit/${accAsmId}.2bit twoBitInfo -nBed ../../awsMultiz/twoBit/${accAsmId}.2bit nBedsDir/${S}.bed echo ${S}.bed >> nBeds ln -s /hive/data/genomes/ce11/bed/awsMultiz/chromSizes/${accAsmId}.chrom.sizes sizesDir/${S}.len echo ${S}.len >> sizes done # make sure they all are successful symLinks: ls -ogL nBedsDir/*.bed | wc -l # 135 screen -S ce11 # use a screen to control this longish job ssh ku mkdir /hive/data/genomes/ce11/bed/multiz135way/anno/mafNet cd /hive/data/genomes/ce11/bed/multiz135way/anno/mafNet mkdir result ln -s ../nBeds . ln -s ../sizes . ln -s ../sizesDir/*.len . ln -s ../nBedsDir/*.bed . printf '#LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/ce11/ce11.2bit {check out line+ result/$(file1)} #ENDLOOP ' > template ls ../../fullChromRun/maf/*.maf > maf.list gensub2 maf.list single template jobList # these jobs require a lot of memory para -ram=48g create jobList para try ... check ... para push # Completed: 7 of 7 jobs # CPU time in finished jobs: 3144s 52.40m 0.87h 0.04d 0.000 y # IO & Wait Time: 43s 0.72m 0.01h 0.00d 0.000 y # Average job time: 455s 7.59m 0.13h 0.01d # Longest finished job: 583s 9.72m 0.16h 0.01d # Submission to last job: 647s 10.78m 0.18h 0.01d du -hsc result # 49G result # Load into database rm -f /gbdb/ce11/multiz135way/chr*.maf cd /hive/data/genomes/ce11/bed/multiz135way/anno/mafNet/result ln -s `pwd`/*.maf /gbdb/ce11/multiz135way/ # this generates an immense multiz135way.tab file in the directory # where it is running. Best to run this over in scratch. cd /dev/shm time hgLoadMaf ce11 multiz135way # Loading multiz135way into database # Loaded 6407627 mafs in 7 files from /gbdb/ce11/multiz135way # real 5m17.184s time (cat /gbdb/ce11/multiz135way/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=10000 \ -mergeGap=500 -maxSize=50000 ce11 multiz135waySummary stdin) # Created 1897571 summary blocks from 214521234 components and 6407627 mafs from stdin # real 8m10.853s wc -l *.tab # 6407627 multiz135way.tab # 1897571 multiz135waySummary.tab # -rw-rw-r-- 1 314835985 Dec 10 13:49 multiz135way.tab # -rw-rw-r-- 1 107565012 Dec 10 13:59 multiz135waySummary.tab rm multiz135way*.tab ############################################################################## # MULTIZ7WAY MAF FRAMES (DONE - 2018-12-13 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ce11/bed/multiz135way/frames cd /hive/data/genomes/ce11/bed/multiz135way/frames # survey all the genomes to find out what kinds of gene tracks they have printf '#!/bin/csh -fe foreach db (`cat ../species.list | tr " " "\\n" | grep -v "^[A-Z]"`) echo -n "# ${db}: " set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "ncbiRefSeq" || $table == "mgcGenes" || \ $table == "ws245Genes" || \ $table == "knownGene" || $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end echo end ' > showGenes.csh chmod +x ./showGenes.csh time ./showGenes.csh # ce11: ensGene: 61109, ncbiRefSeq: 52984, refGene: 53937, ws245Genes: 57827, xenoRefGene: 32625, # caeAng2: ws245Genes: 33934, # caeJap4: ws245Genes: 38144, xenoRefGene: 158453, # cb4: ws245Genes: 23050, xenoRefGene: 130721, # caeSp51: ws245Genes: 46280, # caeRem4: ws245Genes: 32899, xenoRefGene: 155137, # caePb3: ws245Genes: 33210, xenoRefGene: 189584, # caeSp111: ws245Genes: 27721, # priExs1: ws245Genes: 24642, xenoRefGene: 153824, # priPac3: ws245Genes: 24243, xenoRefGene: 97503, # burXyl1: ws245Genes: 18074, # necAme1: ws245Genes: 19643, # ancCey1: ws245Genes: 65583, xenoRefGene: 149848, # ascSuu1: ws245Genes: 3082, xenoRefGene: 148202, # haeCon2: ws245Genes: 26359, # triSui1: ws245Genes: 14435, # panRed1: ws245Genes: 26372, # bruMal2: ws245Genes: 18709, # dirImm1: ws245Genes: 12857, # oncVol1: ws245Genes: 12385, # loaLoa1: ws245Genes: 15577, # melInc2: ws245Genes: 20365, # melHap1: ws245Genes: 14420, xenoRefGene: 88300, # ci3: ensGene: 17784, refGene: 1306, xenoRefGene: 86960, # hetBac1: ws245Genes: 20928, # triSpi1: ws245Genes: 16380, # from that summary, use these gene sets: # ensGene - ce11 ci3 # ws245Genes - caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1 # priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2 # dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1 mkdir genes # 1. ensGene: ce11 ci3 for DB in ce11 ci3 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz echo -n "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # ce11: checked: 20195 failed: 0 # ci3: checked: 16035 failed: 0 # 2. ws245Genes caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1 # priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2 # dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1 for DB in caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1 priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2 dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ws245Genes" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz echo -n "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # caeAng2: checked: 27919 failed: 0 # caeJap4: checked: 29913 failed: 0 # cb4: checked: 21737 failed: 0 # caeSp51: checked: 34679 failed: 0 # caeRem4: checked: 31433 failed: 0 # caePb3: checked: 30626 failed: 0 # caeSp111: checked: 22306 failed: 0 # priExs1: checked: 23767 failed: 0 # priPac3: checked: 22872 failed: 0 # burXyl1: checked: 18074 failed: 0 # necAme1: checked: 18984 failed: 0 # ancCey1: checked: 35775 failed: 0 # ascSuu1: checked: 3065 failed: 0 # haeCon2: checked: 21837 failed: 0 # triSui1: checked: 13600 failed: 0 # panRed1: checked: 24071 failed: 0 # bruMal2: checked: 13471 failed: 0 # dirImm1: checked: 12857 failed: 0 # oncVol1: checked: 12133 failed: 0 # loaLoa1: checked: 14903 failed: 0 # melInc2: checked: 19189 failed: 0 # melHap1: checked: 14103 failed: 0 # hetBac1: checked: 20928 failed: 0 # triSpi1: checked: 16370 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/ancCey1.gp.gz: 35775 # genes/ascSuu1.gp.gz: 3065 # genes/bruMal2.gp.gz: 13471 # genes/burXyl1.gp.gz: 18074 # genes/caeAng2.gp.gz: 27919 # genes/caeJap4.gp.gz: 29913 # genes/caePb3.gp.gz: 30626 # genes/caeRem4.gp.gz: 31433 # genes/caeSp111.gp.gz: 22306 # genes/caeSp51.gp.gz: 34679 # genes/cb4.gp.gz: 21737 # genes/ce11.gp.gz: 20195 # genes/ci3.gp.gz: 16035 # genes/dirImm1.gp.gz: 12857 # genes/haeCon2.gp.gz: 21837 # genes/hetBac1.gp.gz: 20928 # genes/loaLoa1.gp.gz: 14903 # genes/melHap1.gp.gz: 14103 # genes/melInc2.gp.gz: 19189 # genes/necAme1.gp.gz: 18984 # genes/oncVol1.gp.gz: 12133 # genes/panRed1.gp.gz: 24071 # genes/priExs1.gp.gz: 23767 # genes/priPac3.gp.gz: 22872 # genes/triSpi1.gp.gz: 16370 # genes/triSui1.gp.gz: 13600 # kluster job to annotate each maf file screen -S ce11 # manage long running procedure with screen ssh ku cd /hive/data/genomes/ce11/bed/multiz135way/frames printf '#!/bin/csh -fe set C = $1 set G = $2 cat ../anno/mafNet/result/${C}.maf | genePredToMafFrames ce11 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz ' > runOne chmod +x runOne ls ../anno/mafNet/result > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list printf '#LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP ' > template mkdir parts gensub2 chr.list gene.list template jobList para -ram=64g create jobList para try ... check ... push # Completed: 182 of 182 jobs # CPU time in finished jobs: 16011s 266.86m 4.45h 0.19d 0.001 y # IO & Wait Time: 7072s 117.86m 1.96h 0.08d 0.000 y # Average job time: 127s 2.11m 0.04h 0.00d # Longest finished job: 196s 3.27m 0.05h 0.00d # Submission to last job: 401s 6.68m 0.11h 0.00d # collect all results into one file: cd /hive/data/genomes/ce11/bed/multiz135way/frames time find ./parts -type f | while read F do echo "${F}" 1>&2 zcat ${F} done | sort -k1,1 -k2,2n > multiz135wayFrames.bed # real 0m24.365s # -rw-rw-r-- 1 287750919 Dec 13 11:50 multiz135wayFrames.bed gzip multiz135wayFrames.bed # verify there are frames on everything, should be 26 species: # (count from: ls genes | wc) zcat multiz135wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' > species.check.list wc -l species.check.list # 26 # 278211 ancCey1 # 12740 ascSuu1 # 120258 bruMal2 # 147066 burXyl1 # 314609 caeAng2 # 268169 caeJap4 # 372623 caePb3 # 450001 caeRem4 # 384830 caeSp111 # 271790 caeSp51 # 251521 cb4 # 123527 ce11 # 49580 ci3 # 119879 dirImm1 # 154460 haeCon2 # 209501 hetBac1 # 119983 loaLoa1 # 100365 melHap1 # 87175 melInc2 # 144268 necAme1 # 124476 oncVol1 # 157050 panRed1 # 86331 priExs1 # 89170 priPac3 # 94395 triSpi1 # 60978 triSui1 # load the resulting file ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz135way/frames time hgLoadMafFrames ce11 multiz135wayFrames multiz135wayFrames.bed.gz # real 0m29.214s hgsql -e 'select count(*) from multiz135wayFrames;' ce11 # +----------+ # | count(*) | # +----------+ # | 4592956 | # +----------+ time featureBits -countGaps ce11 multiz135wayFrames # 34457786 bases of 100286401 (34.359%) in intersection # real 0m22.416s # enable the trackDb entries: # frames multiz135wayFrames # irows on # zoom to base level in an exon to see codon displays # appears to work OK ######################################################################### # Phylogenetic tree from 135-way (DONE - 2018-12-13 - Hiram) mkdir /hive/data/genomes/ce11/bed/multiz135way/4d cd /hive/data/genomes/ce11/bed/multiz135way/4d # the annotated maf's are in: ../anno/mafNet/result # using knownGene for ce11, only transcribed genes and nothing # from the randoms and other misc. hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene where cdsEnd > cdsStart;" ce11 \ | egrep -E -v "chrM|chrUn|random|_alt" > ensGene.gp wc -l *.gp # 33379 ensGene.gp # verify it is only on the chroms: cut -f2 ensGene.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' # 7509 chrV # 6211 chrIV # 5588 chrII # 4997 chrI # 4646 chrIII # 4428 chrX genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp wc -l ensGeneNR.gp # 20184 ensGeneNR.gp ssh ku mkdir /hive/data/genomes/ce11/bed/multiz135way/4d/run cd /hive/data/genomes/ce11/bed/multiz135way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/ce11/bed/multiz135way" set c = $1 set infile = $r/anno/mafNet/result/$2 set outfile = $3 cd /dev/shm # 'clean' maf, removes all chrom names, leaves only the db name perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\tce11.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ../anno/mafNet/result ls -1S /hive/data/genomes/ce11/bed/multiz135way/anno/mafNet/result/*.maf \ | sed -e "s#.*multiz135way/anno/mafNet/result/##" \ | egrep -E -v "chrM|chrUn|random|_alt" > maf.list printf '#LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP ' > template gensub2 maf.list single template jobList para -ram=64g create jobList para try ... check ... push ... etc... para time # Completed: 6 of 6 jobs # CPU time in finished jobs: 1829s 30.48m 0.51h 0.02d 0.000 y # IO & Wait Time: 20s 0.34m 0.01h 0.00d 0.000 y # Average job time: 308s 5.14m 0.09h 0.00d # Longest finished job: 341s 5.68m 0.09h 0.00d # Submission to last job: 352s 5.87m 0.10h 0.00d # combine mfa files ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz135way/4d # verify no tiny files: ls -og mfa | sort -k3nr # -rw-rw-r-- 1 4660039 Dec 13 12:08 chrII.mfa # -rw-rw-r-- 1 3973699 Dec 13 12:09 chrV.mfa # -rw-rw-r-- 1 3416554 Dec 13 12:08 chrX.mfa # -rw-rw-r-- 1 2920429 Dec 13 12:08 chrI.mfa # -rw-rw-r-- 1 2589544 Dec 13 12:08 chrIII.mfa # -rw-rw-r-- 1 2183464 Dec 13 12:09 chrIV.mfa #want comma-less species.list time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # real 0m1.255s # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 135 # remove the :0.1234 distances from the nh tree: sed -e 's/:[0-9.]\+//g;' ../ce11.135way.nh > tree-commas.nh # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree-commas.nh 4d.all.mfa # real 50m55.630s mv phyloFit.mod all.mod # after rearranging the tree to be more display friendly: # (((((((((((((Caenorhabditis_elegans:0.425799, # (Caenorhabditis_sp._38_MB-2015:0.448904, # Caenorhabditis_angaria:0.438865):0.38477):0.0632893, # Caenorhabditis_japonica:0.631705):0.00332863, # Caenorhabditis_sp._34_TK-2017:0.670052):0.00266019, # (((((((Caenorhabditis_briggsae:0.00761568, # Caenorhabditis_briggsae:0.00617763):0.0901744, # Caenorhabditis_nigoni:0.0878926):0.37284, # (Caenorhabditis_sp._26_LS-2015:0.266258, # Caenorhabditis_sp5_ju800:0.250871):0.0573557):0.00272983, # Caenorhabditis_sp._40_LS-2015:0.311033):0.160657, # ((Caenorhabditis_latens:0.0739274, # Caenorhabditis_remanei:0.0771516):0.343608, # Caenorhabditis_brenneri:0.439498):0.0439004):0.0374774, # Caenorhabditis_tropicalis:0.479027):0.0929836, # Caenorhabditis_sp._31_LS-2015:0.844993):0.0466798):0.0886724, # Caenorhabditis_sp._21_LS-2015:0.81046):0.159439, # (((Caenorhabditis_sp._32_LS-2015:0.461375, # Caenorhabditis_sp._39_LS-2015:0.42763):0.0699299, # (((Pristionchus_arcanus:0.12529, # ((Pristionchus_exspectatus:0.0115759, # Pristionchus_exspectatus:0.00987795):0.092637, # (Pristionchus_pacificus:0.0135936, # Pristionchus_pacificus:0.0151984):0.0917107):0.0582013):0.146807, # Pristionchus_maxplancki:0.231402):0.108203, # Pristionchus_entomophagus:0.314155):0.615931):0.00264449, # (((Bursaphelenchus_xylophilus:0.0169712, # Bursaphelenchus_xylophilus:0.0121836):0.664516, # Subanguina_moxae:1.19812):0.279427, # ((Oscheius_sp._MCB:0.310932, # Oscheius_sp._TEL-2014:0.314067):0.387814, # (((Angiostrongylus_cantonensis:0.76614, # (Necator_americanus:0.0069094, # Necator_americanus:0.00709169):0.62038):0.22255, # Nippostrongylus_brasiliensis:0.585445):0.128625, # ((((((((Ancylostoma_caninum:0.109552, # Ancylostoma_duodenale:0.128167):0.0645197, # Ancylostoma_ceylanicum:0.138249):0.302453, # ((((Ascaris_suum:0.0264911, # Ascaris_suum:0.0266353):0.0718425, # Parascaris_univalens:0.111167):0.412171, # Toxocara_canis:0.435243):1.03133, # Oesophagostomum_dentatum:0.444431):0.133431):0.231588, # ((Haemonchus_contortus:0.0237971, # Haemonchus_contortus:0.0104187):0.40375, # Teladorsagia_circumcincta:0.313931):0.415593):0.132113, # (((Clonorchis_sinensis:0.102068, # Opisthorchis_viverrini:0.0948009):0.672677, # Dicrocoelium_dendriticum:0.641121):0.366763, # ((Elaeophora_elaphi:0.229754, # Macrostomum_lignano:0.315971):0.260862, # ((Fasciola_gigantica:0.0728112, # Fasciola_hepatica:0.0784224):0.849193, # Spirometra_erinaceieuropaei:0.717335):0.238022):0.175856):0.306152):0.0247175, # (Heligmosomoides_polygyrus_bakeri:0.639113, # (Oscheius_tipulae:0.772979, # (((Steinernema_carpocapsae:0.288945, # Steinernema_scapterisci:0.285651):0.224527, # Steinernema_feltiae:0.606142):0.0853374, # Steinernema_glaseri:0.525076):0.152498):0.168142):0.00279645):0.00267557, # ((((Echinococcus_canadensis:0.0305467, # Echinococcus_granulosus:0.0307877):0.00949128, # Echinococcus_multilocularis:0.039056):0.157973, # (((Taenia_asiatica:0.00962609, # Taenia_saginata:0.0164412):0.0108728, # Taenia_multiceps:0.0270195):0.034031, # Taenia_solium:0.0458656):0.187618):0.687903, # (Trichuris_muris:0.289869, # (Trichuris_trichiura:0.184836, # Trichuris_suis:0.125543):0.291242):0.610672):0.454711):0.00261495, # ((Plectus_sambesii:0.668591, # Steinernema_monticolum:0.874638):0.0801328, # Panagrellus_redivivus:0.667253):0.218884):0.00718934):0.198615):0.0754372):0.16779):0.00370935):0.149757, # (Diploscapter_coronatus:0.0396115, # Diploscapter_pachys:0.0410107):0.965863):0.0835229, # (((((((((((((Brugia_malayi:0.0147423, # Brugia_malayi:0.0184123):0.0136278, # Brugia_pahangi:0.0347893):0.0496926, # Wuchereria_bancrofti:0.0607276):0.203732, # ((Dirofilaria_immitis:0.021486, # Dirofilaria_immitis:0.00721013):0.206272, # (Onchocerca_ochengi:0.0198343, # (Onchocerca_volvulus:0.00726612, # Onchocerca_volvulus:0.0153111):0.0105149):0.0986467):0.0222077):0.00274305, # (Loa_loa:0.0109839, # Loa_loa:0.00898028):0.213308):0.00261442, # (Setaria_digitata:0.149493, # Setaria_equina:0.120979):0.317084):0.0425831, # Onchocerca_flexuosa:0.0978734):0.742036, # ((((Dugesia_japonica:0.514157, # Girardia_tigrina:1.22545):0.240555, # Schmidtea_mediterranea:0.619031):0.356539, # (((((Meloidogyne_arenaria:0.0340878, # Meloidogyne_javanica:0.0426825):0.0272648, # Meloidogyne_incognita:0.0308494):0.0157159, # (Meloidogyne_floridensis:0.0840962, # Meloidogyne_incognita:0.0530444):0.00989274):0.18009, # (Meloidogyne_graminicola:0.50525, # Meloidogyne_hapla:0.171223):0.0509786):0.654631, # (Parastrongyloides_trichosuri:0.389745, # ((Strongyloides_papillosus:0.0783108, # Strongyloides_venezuelensis:0.0916494):0.198375, # (Strongyloides_stercoralis:0.177646, # Strongyloides_ratti:0.150624):0.0896129):0.165222):0.220136):0.131527):0.137744, # ((Schistosoma_haematobium:0.0960504, # Schistosoma_mansoni:0.10575):0.224188, # Schistosoma_japonicum:0.29067):0.484258):0.152071):0.239138, # Dictyocaulus_viviparus:0.814104):0.248629, # Hymenolepis_microstoma:0.705812):0.234011, # (Rhabditophanes_sp._KR3021:0.753609, # (Ciona_intestinalis:0.864953, # Heterorhabditis_bacteriophora:0.889764):0.341081):0.233997):0.163448, # Gyrodactylus_salaris:0.912435):0.232379, # Parapristionchus_giblindavisi:0.738653):0.156608):0.28746, # Ditylenchus_destructor:0.885526):0.238847, # (((((Trichinella_sp._T6:0.0168947, # Trichinella_patagoniensis:0.0187164):0.00398067, # ((Trichinella_sp._T8:0.0102628, # (Trichinella_sp._T9:0.0118704, # Trichinella_britovi:0.00645468):0.00273378):0.00273053, # Trichinella_murrelli:0.0111672):0.00482982):0.00272028, # Trichinella_nativa:0.0154206):0.0136128, # (Trichinella_nelsoni:0.0210349, # (Trichinella_spiralis:0.00697283, # Trichinella_spiralis:0.0168217):0.0169346):0.00352233):0.0510198, # ((Trichinella_papuae:0.0217595, # Trichinella_zimbabwensis:0.0207942):0.0233717, # Trichinella_pseudospiralis:0.0610335):0.0574885):0.947802):0.183355, # Romanomermis_culicivorax:0.680971):0.164498, # ((((Globodera_ellingtonae:0.035005, # Globodera_rostochiensis:0.0362764):0.0267624, # Globodera_pallida:0.0659637):0.259339, # Heterodera_glycines:0.354234):0.225215, # Rotylenchulus_reniformis:0.468661):0.395303):0.491, # Acrobeloides_nanus:0.491):0.1:0.1; grep TREE all.mod | sed -e 's/TREE: //;' \ | /cluster/bin/phast/all_dists /dev/stdin | grep ce11 \ | sed -e "s/ce11.//;" | sort > ce11.dists /cluster/bin/phast/all_dists ../ce11.135way.nh | grep ce11 \ | sed -e "s/ce11.//;" | sort > original.dists # printing out the 'original', the 'new' the 'difference' and # percent difference/delta join original.dists ce11.dists | awk '{ printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' | sort -k4n ######################################################################### # phastCons 135-way (DONE - 2018-12-14 - Hiram) # split 135way mafs into 1M chunks and generate sufficient statistics # files for phastCons ssh ku mkdir -p /hive/data/genomes/ce11/bed/multiz135way/cons/ss mkdir -p /hive/data/genomes/ce11/bed/multiz135way/cons/msa.split cd /hive/data/genomes/ce11/bed/multiz135way/cons/msa.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/ce11/bed/multiz135way/anno/mafNet/result/$c.maf set WINDOWS = /hive/data/genomes/ce11/bed/multiz135way/cons/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh printf '#LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP ' > template # do the easy ones first to see some immediate results ls -1S -r ../../anno/mafNet/result | sed -e "s/.maf//;" > maf.list # all can finish OK at a 64Gb memory limit gensub2 maf.list single template jobList para -ram=64g create jobList para try ... check ... etc para push # Completed: 7 of 7 jobs # CPU time in finished jobs: 2431s 40.52m 0.68h 0.03d 0.000 y # IO & Wait Time: 278s 4.63m 0.08h 0.00d 0.000 y # Average job time: 387s 6.45m 0.11h 0.00d # Longest finished job: 540s 9.00m 0.15h 0.01d # Submission to last job: 548s 9.13m 0.15h 0.01d # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/ce11/bed/multiz135way/cons/run.cons cd /hive/data/genomes/ce11/bed/multiz135way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. Running: # all and vertebrates cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/ce11/bed/multiz135way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons/ss set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$c/$f.ss $tmp else ln -s $ssSrc/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix printf '#LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP ' > template ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list wc -l ss.list # 104 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/ce11/bed/multiz135way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList # beware overwhelming the cluster with these fast running high I/O jobs para -ram=32g create jobList para try ... check ... para -maxJob=16 push # Completed: 104 of 104 jobs # CPU time in finished jobs: 6198s 103.30m 1.72h 0.07d 0.000 y # IO & Wait Time: 718s 11.97m 0.20h 0.01d 0.000 y # Average job time: 67s 1.11m 0.02h 0.00d # Longest finished job: 84s 1.40m 0.02h 0.00d # Submission to last job: 216s 3.60m 0.06h 0.00d # create Most Conserved track cd /hive/data/genomes/ce11/bed/multiz135way/cons/all time cut -f1 ../../../../chrom.sizes | while read C do echo $C 1>&2 ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m39.720s # -rw-rw-r-- 1 74615007 Dec 14 14:15 tmpMostConserved.bed time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ > mostConserved.bed # real 0m10.119s # -rw-rw-r-- 1 76283801 Dec 14 14:16 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz135way/cons/all time hgLoadBed ce11 phastConsElements135way mostConserved.bed # Read 2275482 elements of size 5 from mostConserved.bed # real 0m10.328s # most interesting high measurement for this coverage # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits ce11 -enrichment ncbiRefSeq:cds phastConsElements135way # ncbiRefSeq:cds 25.009%, phastConsElements135way 43.266%, both 16.084%, cover 64.31%, enrich 1.49x # real 0m12.804s # Try for 5% overall cov, and 70% CDS cov time featureBits ce11 -enrichment refGene:cds phastConsElements135way # refGene:cds 25.013%, phastConsElements135way 43.266%, both 16.087%, cover 64.31%, enrich 1.49x # real 0m13.519s time featureBits ce11 -enrichment ensGene:cds phastConsElements135way # ensGene:cds 25.144%, phastConsElements135way 43.266%, both 16.129%, cover 64.15%, enrich 1.48x # real 0m10.267s # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/ce11/bed/multiz135way/cons/all mkdir downloads time for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" 1>&2 find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons135way.wigFix.gz done # real 0m51.373s # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons135way.wig phastCons135way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 0m26.755s du -hsc *.wi? # 91M phastCons135way.wib # 9.1M phastCons135way.wig # -rw-rw-r-- 1 95051167 Dec 14 14:24 phastCons135way.wib # -rw-rw-r-- 1 9487394 Dec 14 14:24 phastCons135way.wig # encode into a bigWig file: # (warning wigToBigWig process may be too large for memory limits # in bash, to avoid the 32 Gb memory limit, set 180 Gb here: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin \ ../../../../chrom.sizes phastCons135way.bw) > bigWig.log 2>&1 egrep "VmPeak|real" bigWig.log # pid=74267: VmPeak: 1108636 kB # real 0m51.112s # -rw-rw-r-- 1 244041045 Dec 14 14:27 phastCons135way.bw bigWigInfo phastCons135way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 182,564,643 primaryIndexSize: 3,214,556 zoomLevels: 10 chromCount: 7 basesCovered: 95,051,167 mean: 0.467683 min: 0.000000 max: 1.000000 std: 0.439150 # if you wanted to use the bigWig file, loading bigWig table: # but we don't use the bigWig file ln -s `pwd`/phastCons135way.bw /gbdb/ce11/multiz135way hgsql ce11 -e 'drop table if exists phastCons135way; \ create table phastCons135way (fileName varchar(255) not null); \ insert into phastCons135way values ("/gbdb/ce11/multiz135way/phastCons135way.bw");' # Load gbdb and database with wiggle. ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz135way/cons/all ln -s `pwd`/phastCons135way.wib /gbdb/ce11/multiz135way/phastCons135way.wib time hgLoadWiggle -pathPrefix=/gbdb/ce11/multiz135way ce11 \ phastCons135way phastCons135way.wig # real 0m0.722s time wigTableStats.sh ce11 phastCons135way # db.table min max mean count sumData # ce11.phastCons135way 0 1 0.467683 95051167 4.44538e+07 # stdDev viewLimits # 0.43915 viewLimits=0:1 # real 0m0.505s # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/ce11/bed/multiz135way/cons/all time hgWiggle -doHistogram -db=ce11 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons135way > ce11.phastCons135way.histogram.data 2>&1 # real 0m4.020s # the pid VmPeak measurements to stderr are getting into the middle of # a line in this data output. Either do *not* use 2>&1 to get all # into the output file, or edit it to fix it after done # create plot of histogram: # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change) printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" set output "ce11.phastCons135.histo.png" set size 1.0, 1.0 set style line 1 lt 2 lc rgb "#ff88ff" lw 2 set style line 2 lt 2 lc rgb "#66ff66" lw 2 set style line 3 lt 2 lc rgb "#ffff00" lw 2 set style line 4 lt 2 lc rgb "#ffffff" lw 2 set border lc rgb "#ffff00" set key left box ls 3 set key tc variable set grid noxtics set grid y2tics ls 4 set grid ytics ls 4 set title " C. elegans/ce11 Histogram phastCons135way track" \ tc rgb "#ffffff" set xlabel " phastCons135way score" tc rgb "#ffffff" set ylabel " Relative Frequency" tc rgb "#ff88ff" set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" set y2range [0:1] set yrange [0:0.02] plot "ce11.phastCons135way.histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ "ce11.phastCons135way.histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 ' | gnuplot # take a look to see if it is sane: display histo.png & ######################################################################### # phyloP for 135-way (DONE - 2018-12-14 - Hiram) # # can use the same split 1M chunk SS files into that were used # for phastCons # run phyloP with score=LRT ssh ku mkdir /cluster/data/ce11/bed/multiz135way/consPhyloP cd /cluster/data/ce11/bed/multiz135way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACK ../../4d/all.mod # BACKGROUND: 0.338800 0.205656 0.164894 0.290649 grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.371 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../4d/all.mod 0.371 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.314500 0.185500 0.185500 0.314500 cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set d = $f:h set file1 = $f:t set out = $2 set cName = $f:t:r set grp = $cwd:t set cons = /hive/data/genomes/ce11/bed/multiz135way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "/hive/data/genomes/ce11/bed/multiz135way/cons/ss/$f" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$file1.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../../cons/ss -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/ss/##" > ss.list # make sure the list looks good wc -l ss.list # 104 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template printf '#LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP ' > template ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/ce11/bed/multiz135way/consPhyloP/all cd /hive/data/genomes/ce11/bed/multiz135way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overloading the cluster with these quick and high I/O jobs para -ram=32g create jobList para try ... check ... para -maxJob=16 push para time > run.time # Completed: 104 of 104 jobs # CPU time in finished jobs: 420481s 7008.02m 116.80h 4.87d 0.013 y # IO & Wait Time: 1037s 17.28m 0.29h 0.01d 0.000 y # Average job time: 4053s 67.55m 1.13h 0.05d # Longest finished job: 5455s 90.92m 1.52h 0.06d # Submission to last job: 5629s 93.82m 1.56h 0.07d mkdir downloads time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" 1>&2 find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP135way.wigFix.gz done # real 1m24.231s du -hsc downloads # 195M downloads # check integrity of data with wigToBigWig time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/ce11/chrom.sizes \ phyloP135way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=180522: VmPeak: 1108636 kB # real 0m51.720s # -rw-rw-r-- 1 356259120 Dec 14 17:17 phyloP135way.bw bigWigInfo phyloP135way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 281,654,520 # primaryIndexSize: 3,214,556 # zoomLevels: 10 # chromCount: 7 # basesCovered: 95,051,167 # mean: 1.130380 # min: -20.000000 # max: 20.000000 # std: 2.927330 # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phyloP135way.wig phyloP135way.wib) # Converted stdin, upper limit 20.00, lower limit -20.00 # real 0m25.705s # -rw-rw-r-- 1 95051167 Dec 14 17:19 phyloP135way.wib # -rw-rw-r-- 1 10006648 Dec 14 17:19 phyloP135way.wig du -hsc *.wi? # 91M phyloP135way.wib # 9.6M phyloP135way.wig # Load gbdb and database with wiggle. ln -s `pwd`/phyloP135way.wib /gbdb/ce11/multiz135way/phyloP135way.wib time hgLoadWiggle -pathPrefix=/gbdb/ce11/multiz135way ce11 \ phyloP135way phyloP135way.wig # real 0m0.761s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh ce11 phyloP135way # db.table min max mean count sumData # ce11.phyloP135way -20 20 1.13038 95051167 1.07444e+08 # stdDev viewLimits # 2.92733 viewLimits=-13.5063:15.767 # that range is: 20+20= 40 for hBinSize=0.04 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.04 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=ce11 phyloP135way > ce11.phyloP135way.histo.data 2>&1 # real 0m4.117s # xaxis range: grep -v chrom ce11.phyloP135way.histo.data | grep "^[0-9]" \ | ave -col=2 stdin | sed -e 's/^/# /;' # Q1 -7.500000 # median 0.320000 # Q3 8.140000 # average 0.309272 # min -20.000000 # max 20.000000 # count 783 # total 242.159998 # standard deviation 9.085841 # find out the range for the 2:5 graph grep -v chrom ce11.phyloP135way.histo.data | grep "^[0-9]" \ | ave -col=5 stdin | sed -e 's/^/# /;' # Q1 0.000002 # median 0.000108 # Q3 0.000374 # average 0.001277 # min 0.000000 # max 0.038921 # count 783 # total 0.999993 # standard deviation 0.003638 # create plot of histogram: # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change) printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" set output "ce11.phyloP135.histo.png" set size 1.0, 1.0 set style line 1 lt 2 lc rgb "#ff88ff" lw 2 set style line 2 lt 2 lc rgb "#66ff66" lw 2 set style line 3 lt 2 lc rgb "#ffff00" lw 2 set style line 4 lt 2 lc rgb "#ffffff" lw 2 set border lc rgb "#ffff00" set key left box ls 3 set key tc variable set grid noxtics set grid y2tics ls 4 set grid ytics ls 4 set title " C. elegans/ce11 Histogram phyloP135way track" \ tc rgb "#ffffff" set xlabel " phyloP135way score" tc rgb "#ffffff" set ylabel " Relative Frequency" tc rgb "#ff88ff" set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" set xrange [-1:1.5] set yrange [0:0.04] plot "ce11.phyloP135way.histo.data" using 2:5 title " RelFreq" with impulses ls 1, \ "ce11.phyloP135way.histo.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 ' | gnuplot # verify it looks sane display ce11.phyloP135.histo.png & ############################################################################# # determine taxonomy for everything (DONE - 2018-12-14 - Hiram) # to set up group divisions # use this perl script to extra the taxonomy strings from NCBI entrez mkdir /hive/data/genomes/ce11/bed/multiz135way/taxonomy cd /hive/data/genomes/ce11/bed/multiz135way/taxonomy cat > taxon.pl << '_EOF_' #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc < 1) { printf STDERR "usage: taxon.pl [more taxIds]\n"; exit 255; } while (my $taxId = shift) { my $taxonomyString = ""; open (TX, "wget -O /dev/stdout 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=$taxId&retmode=xml' 2> /dev/null | grep -i ScientificName | headRest 1 stdin|") or die "can not wget entrez/eutils"; while (my $sciName = ) { chomp $sciName; $sciName =~ s/.*//; $sciName =~ s###; $taxonomyString .= sprintf("%s;", $sciName); } close (TX); printf "%d\t%s\n", $taxId, $taxonomyString; } '_EOF_' # << happy emacs chmod +x taxon.pl ./taxon.pl `sed -e 's/:.*//; s/(//g; s/ //g;' ../ce11.135way.taxId.nh | sort -u | xargs echo` > taxonomy.txt # survey the names in those sed -e 's/.*organisms;//;' taxonomy.txt | tr ';' '\n' | sort | uniq -c | sort -rn | head -20 122 Opisthokonta 122 Metazoa 122 Eumetazoa 122 Eukaryota 122 Bilateria 122 99 Protostomia 99 Nematoda 99 Ecdysozoa 83 Chromadorea 76 Rhabditida 35 Rhabditina 29 Rhabditomorpha 27 Tylenchina 23 Rhabditoidea 23 Rhabditidae 22 Platyhelminthes 18 Peloderinae 18 Caenorhabditis 16 Enoplea sed -e 's/.*;Bilateria;//;' taxonomy.txt | cut -d';' -f1-4 \ | sort | uniq -c | sort -n 1 Deuterostomia;Chordata;Tunicata;Ascidiacea 1 Platyhelminthes;Cestoda;Eucestoda;Diphyllobothriidea 1 Platyhelminthes;Monogenea;Monopisthocotylea;Gyrodactylidea 1 Platyhelminthes;Rhabditophora;Macrostomorpha;Macrostomida 2 Platyhelminthes;Trematoda;Digenea;Opisthorchiida 3 Platyhelminthes;Rhabditophora;Seriata;Tricladida 3 Platyhelminthes;Trematoda;Digenea;Plagiorchiida 3 Platyhelminthes;Trematoda;Digenea;Strigeidida 8 Platyhelminthes;Cestoda;Eucestoda;Cyclophyllidea 16 Protostomia;Ecdysozoa;Nematoda;Enoplea 83 Protostomia;Ecdysozoa;Nematoda;Chromadorea sed -e 's/.*;Bilateria;//;' taxonomy.txt | grep Protostomia \ | cut -d';' -f1-6 | sort | uniq -c | sort -n 1 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Plectida;Plectina 1 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Strongylida;Metastrongyloidea 1 Protostomia;Ecdysozoa;Nematoda;Enoplea;Dorylaimia;Mermithida 5 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Strongylida;Trichostrongyloidea 14 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Spirurina 15 Protostomia;Ecdysozoa;Nematoda;Enoplea;Dorylaimia;Trichinellida 27 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Tylenchina 35 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Rhabditina # with those numbers in hand, break them up like this # the 'orderByChain.db.txt' list is from the output of sizeStats.pl # as done earlier, just the sequence name/db names from that output # in their featureBits chainLink order for N in Deuterostomia Platyhelminthes Plectida Rhabditina Spirurina Tylenchina Strongylida Dorylaimia do grep ";$N;" taxonomy.txt | cut -f1 > $N.taxId.list join -t$'\t' <(sort $N.taxId.list) <(sort taxId.GC.sequenceName.accAsmId.sciName.name.tab) | cut -f3 | sort -u > $N.db.list done cat orderByChain.db.txt | while read n do grep -w "${n}" *.list done | sed -e 's/.db.list:/\t/;' > division.dbName.ordered.tab for N in Deuterostomia Platyhelminthes Plectida Rhabditina Spirurina Tylenchina Strongylida Dorylaimia do grep "$N" division.dbName.ordered.tab | cut -f2 > ${N}.ordered done # those *.ordered listings are used in trackDb to establish # the chainNet composite: cd ~/kent/src/hg/makeDb/trackDb/worm/ce11 ~/kent/src/hg/utils/phyloTrees/chainNetCompositeTrackDb.pl net nematodes \ ce11 Rhabditina.ordered Spirurina.ordered Tylenchina.ordered \ Strongylida.ordered Plectida.ordered Dorylaimia.ordered \ Platyhelminthes.ordered Deuterostomia.ordered \ > chainNetNematodes.ra # edit that .ra file to remove the .ordered from each name in this list: subGroup3 clade Clade c00=Rhabditina c01=Spirurina c02=Tylenchina c03=Strongylida c04=Plectida c05=Dorylaimia c06=Platyhelminthes c07=Deuterostomia # can avoid this edit by calling those files *.list instead ############################################################################# # construct download files for 135-way (TBD - 2018-11-27 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phastCons135way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phyloP135way mkdir /hive/data/genomes/ce11/bed/multiz135way/downloads cd /hive/data/genomes/ce11/bed/multiz135way/downloads mkdir multiz135way phastCons135way phyloP135way ######################################################################### ## create upstream refGene maf files cd /hive/data/genomes/ce11/bed/multiz135way/downloads/multiz135way # bash script #!/bin/bash export geneTbl="ensGene" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ce11 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags ce11 multiz135way \ stdin stdout \ -orgs=/hive/data/genomes/ce11/bed/multiz135way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 107m51.664s # -rw-rw-r-- 1 289630077 Dec 14 18:00 upstream1000.ensGene.maf.gz # -rw-rw-r-- 1 545877544 Dec 14 18:30 upstream2000.ensGene.maf.gz # -rw-rw-r-- 1 1334065100 Dec 14 19:16 upstream5000.ensGene.maf.gz ###################################################################### ## compress the maf files cd /hive/data/genomes/ce11/bed/multiz135way/downloads/multiz135way time rsync -a -P ../../anno/mafNet/result/ ./maf/ # real 3m21.807s du -hsc maf/ # 49G maf/ cd maf time gzip *.maf & # real 21m44.263s du -hscL maf ../../anno/maf/result/ # 5.4G maf # 63G ../../anno/maf/result/ cd maf md5sum *.maf.gz > md5sum.txt # collect the other sequences here: cd /hive/data/genomes/ce11/bed/multiz135way/downloads/multiz135way mkdir sequences grep -v "^[a-z]" nameCrossReference.tsv | while read L do seqName=`printf "%s" "${L}" | awk -F$'\t' '{print $1}'` acc=`printf "%s" "${L}" | awk -F$'\t' '{print $2}'` asmId=`printf "%s" "${L}" | awk -F$'\t' '{print $4}'` src="../../../awsMultiz/sequences/${acc}_${asmId}" twoBit="../../../awsMultiz/twoBit/${acc}_${asmId}.2bit" destName=`printf "%s.%s.%s" "${seqName}" "${acc}" "${asmId}"` printf "sequences/%s\n" "${destName}" mkdir -p "sequences/${destName}" cp -p "${twoBit}" "sequences/${destName}/" cp -p ${src}/*_assembly_report.txt "sequences/${destName}/" done du -hsc sequences # 7.2G sequences/ cd sequences time md5sum */*.2bit */*.txt > md5sum.txt # real 0m30.828s mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way/maf cd maf ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way/maf/ mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way/sequences cd ../sequences ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way/sequences/ cd .. # obtain the README.txt from the multiz26way and fix it up with appropriate # listings ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way/ ########################################################################### cd /hive/data/genomes/ce11/bed/multiz135way/downloads/multiz135way grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.135way.nh cat ce11.135way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=../../sequenceName.sciName.tab -noInternal \ -lineOutput /dev/stdin > ce11.135way.scientificName.nh cat ce11.135way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=../../sequenceName.taxId.tab -noInternal \ -lineOutput /dev/stdin > ce11.135way.taxId.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh ce11.135way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.135way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh ce11.135way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > ce11.135way.scientificNames.nh time md5sum *.nh *.maf.gz > md5sum.txt # real 0m3.147s ln -s `pwd`/*.maf.gz `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way du -hsc ./maf ../../anno/result # 18G ./maf # 156G ../../anno/result # obtain the README.txt from ce11/multiz20way and update for this # situation ln -s `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz135way/ ##################################################################### cd /hive/data/genomes/ce11/bed/multiz135way/downloads/phastCons135way ln -s ../../cons/all/downloads/*.wigFix.gz . ln -s ../../cons/all/phastCons135way.bw ./ce11.phastCons135way.bw ln -s ../../cons/all/all.mod ./ce11.phastCons135way.mod time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m20.354s # obtain the README.txt from ce11/phastCons26way and update for this # situation, and edit accordingly. mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phastCons135way ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phastCons135way ##################################################################### cd /hive/data/genomes/ce11/bed/multiz135way/downloads/phyloP135way ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz . md5sum *.wigFix.gz > md5sum.txt ln -s ../../consPhyloP/run.phyloP/all.mod ce11.phyloP135way.mod ln -s ../../consPhyloP/all/phyloP135way.bw ce11.phyloP135way.bw md5sum *.mod *.bw > md5sum.txt # obtain the README.txt from ce11/phyloP20way and update for this mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phyloP135way ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phyloP135way/ ############################################################################# # hgPal downloads (DONE - 2018-12-17 - Hiram) # FASTA from 135-way for knownGene, refGene and knownCanonical ssh hgwdev screen -S ce11HgPal mkdir /hive/data/genomes/ce11/bed/multiz135way/pal cd /hive/data/genomes/ce11/bed/multiz135way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list # this for loop can take hours on a high contig count assembly # it is just fine on human/ce11, just a few seconds export mz=multiz135way export gp=ensGene export db=ce11 export I=0 export D=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` D=`echo $D | awk '{print $1+1}'` dNum=`echo $D | awk '{printf "%03d", int($1/1350)}'` mkdir -p exonNuc/${dNum} > /dev/null mkdir -p exonAA/${dNum} > /dev/null echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &" if [ $I -gt 16 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time (sh -x ./$gp.jobs) > $gp.jobs.log 2>&1 # real 41m45.902s export mz=multiz135way export gp=ensGene time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m26.225s time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonNuc.fa.gz # real 5m55.586s # -rw-rw-r-- 1 546125992 Dec 17 10:24 ensGene.multiz135way.exonAA.fa.gz # -rw-rw-r-- 1 1049129671 Dec 17 10:35 ensGene.multiz135way.exonNuc.fa.gz export mz=multiz135way export gp=ensGene export db=ce11 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # wiki page for 135-way (TBD - 2017-11-06 - Hiram) mkdir /hive/users/hiram/bigWays/ce11.135way cd /hive/users/hiram/bigWays sed -e 's/ /\n/g;' /hive/data/genomes/ce11/bed/multiz135way/species.list \ > ce11.135way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. ./sizeStats.sh ce11.135way/ordered.list # dbDb.sh constructs ce11.135way/XenTro9_135-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list ./dbDb.sh ce11 135way # sizeStats.pl constructs ce11.135way/XenTro9_135-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl ce11 135way # defCheck.pl constructs XenTro9_135-way_conservation_lastz_parameters.html ./defCheck.pl ce11 135way # this constructs the html pages in ce11.135way/: # -rw-rw-r-- 1 6247 May 2 17:07 XenTro9_135-way_conservation_alignment.html # -rw-rw-r-- 1 84135 May 2 17:09 XenTro9_135-way_Genome_size_statistics.html # -rw-rw-r-- 1 5033 May 2 17:10 XenTro9_135-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # XenTro9_135-way_conservation_alignment # XenTro9_135-way_Genome_size_statistics # XenTro9_135-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. ############################################################################ # pushQ readmine (TBD - 2017-11-07 - Hiram) cd /usr/local/apache/htdocs-hgdownload/goldenPath/ce11 find -L `pwd`/multiz135way `pwd`/phastCons135way `pwd`/phyloP135way \ /gbdb/ce11/multiz135way -type f \ > /hive/data/genomes/ce11/bed/multiz135way/downloads/redmine.20216.fileList wc -l /hive/data/genomes/ce11/bed/multiz135way/downloads/redmine.20216.fileList # 1450 /hive/data/genomes/ce11/bed/multiz135way/downloads/redmine.20216.fileList cd /hive/data/genomes/ce11/bed/multiz135way/downloads hgsql -e 'show tables;' ce11 | grep 135way \ | sed -e 's/^/ce11./;' > redmine.20216.table.list ############################################################################