############################################################################# ## 77-Way Multiz (DONE - 2018-11-27 - Hiram) ssh hgwdev mkdir /hive/data/genomes/galGal6/bed/multiz77way cd /hive/data/genomes/galGal6/bed/multiz77way # determine list of databases available from alignments: ls ../lastz.*/fb.galGal6* | egrep -v "RBest|chainSyn" \ | awk -F'/' '{print $2}' | sed -e 's/lastz.//;' | egrep -v tetNig1 \ > db.77.list # this list happens to also fortunately include the galGal6 since # there does exist a ../lastz.galGal6/fb.galGal6* result from the self # alignment # from the 220-way in the source tree, select out the 77 used here: /cluster/bin/phast/tree_doctor \ --prune-all-but `cat db.77.list | xargs echo | tr ' ' ','` \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/220way.nh \ > t.nh # using TreeGraph2 tree editor on the Mac, rearrange to get galGal6 # at the top, and attempt to get the others in phylo order: /cluster/bin/phast/all_dists t.nh | grep galGal6 \ | sed -e "s/galGal6.//" | sort -k2n | sed -e 's/^/#\t/;' # cotJap1 0.066254 # melGal5 0.126972 # tytAlb1 0.152299 # bucRhi1 0.162299 # anaPla1 0.172299 # apaVit1 0.182299 # calAnn1 0.192299 # cucCan1 0.202299 # chaVoc2 0.212299 # fulGla1 0.222299 # tauEry1 0.232299 # opiHoa1 0.242299 # phoRub1 0.252299 # strCam1 0.262299 # tinGut2 0.262299 # colLiv1 0.312299 # lepDis1 0.342299 # merNub1 0.342299 # pelCri1 0.342299 # phaCar1 0.352299 # phaLep1 0.352299 # pteGut1 0.352299 # egrGar1 0.362299 # nipNip1 0.362299 # aptFor1 0.372299 # pygAde1 0.372299 # chlUnd1 0.382299 # colStr1 0.382299 # gavSte1 0.397002 # aquChr2 0.402299 # balPav1 0.402299 # falChe1 0.402299 # falPer1 0.402299 # halAlb1 0.402299 # halLeu1 0.402299 # picPub1 0.402299 # capCar1 0.407002 # amaVit1 0.413002 # araMac1 0.413002 # pseHum1 0.422002 # eurHel1 0.422299 # melUnd1 0.423987 # carCri1 0.442299 # mesUni1 0.442299 # corBra1 0.454068 # corCor1 0.454068 # ficAlb2 0.454068 # acaChl1 0.464068 # serCan1 0.473068 # zonAlb1 0.473525 # geoFor1 0.480329 # taeGut2 0.484068 # apaSpi1 0.617442 # cheMyd1 0.617442 # chrPic2 0.617442 # pelSin1 0.617442 # allMis1 0.632299 # allSin1 0.632299 # nanPar1 0.829442 # pytBiv1 0.887442 # xenTro9 0.907386 # xenLae2 0.929442 # anoCar2 0.934442 # thaSir1 0.987442 # hg38 1.115503 # mm10 1.326078 # gasAcu1 1.758169 # oreNil3 1.824346 # tetNig2 1.846095 # angJap1 1.891116 # fr3 1.925783 # letCam1 1.946543 # petMar3 1.946543 # danRer11 1.971868 # mayZeb1 1.974346 # oryLat2 2.008726 ######################################################################### # It appears that tree is not correct when compared to the featureBits # measurements of the alignments: #query chain synTen rBest chainSyn rBest total # hours hours hours melGal5 94.495% 67.751% 86.165% 1.520 4.184 5.704 Turkey aquChr2 87.687% 78.755% 80.369% 7.449 3.867 11.316 Golden eagle halLeu1 86.923% 77.776% 79.411% 6.407 4.120 10.527 Bald eagle nipNip1 86.903% 77.318% 79.492% 7.802 4.310 12.112 Crested ibis aptFor1 86.834% 78.454% 79.601% 8.047 3.701 11.748 Emperor penguin falPer1 86.500% 76.966% 78.677% 10.423 4.570 14.993 Peregrine falcon falChe1 86.342% 76.498% 78.516% 10.403 4.553 14.956 Saker falcon chaVoc2 86.267% 77.101% 78.919% 7.031 4.043 11.074 Killdeer pygAde1 85.892% 77.734% 78.841% 6.260 3.759 10.019 Adelie penguin egrGar1 85.589% 76.499% 78.042% 6.340 4.037 10.377 Little egret anaPla1 85.509% 74.674% 76.829% 10.400 5.386 15.786 Mallard duck pelCri1 85.344% 72.186% 78.202% 7.479 5.364 12.843 Dalmatian pelican fulGla1 85.332% 73.091% 78.108% 9.987 5.013 15.000 Northern fulmar carCri1 85.094% 72.651% 77.569% 7.956 5.680 13.636 Red-legged seriema lepDis1 85.021% 72.372% 77.495% 8.063 5.776 13.839 Cuckoo roller halAlb1 84.997% 73.173% 77.620% 7.086 4.990 12.076 White-tailed eagle phoRub1 84.793% 70.837% 77.774% 7.551 5.334 12.885 American flamingo gavSte1 84.717% 72.183% 77.534% 10.203 5.240 15.443 Red-throated loon balPav1 84.687% 72.017% 77.336% 8.674 5.294 13.968 Crowned crain opiHoa1 84.579% 74.775% 76.791% 8.151 4.723 12.874 Hoatzin phaLep1 84.519% 70.871% 77.263% 8.705 5.806 14.511 White-tailed tropicbird tauEry1 84.485% 71.434% 76.784% 7.988 5.261 13.249 Red crested turaco phaCar1 84.182% 70.890% 76.937% 8.695 5.616 14.311 Great cormorant tytAlb1 84.122% 70.407% 76.219% 8.504 5.036 13.540 Barn owl capCar1 83.573% 68.596% 75.378% 10.311 6.741 17.052 Chuck-will's-widow cucCan1 83.150% 72.094% 74.353% 8.265 5.510 13.775 Common cuckoo colLiv1 83.073% 72.569% 74.370% 7.953 5.237 13.190 Rock pigeon chlUnd1 83.041% 69.052% 75.109% 7.402 5.265 12.667 Houbara bustard mesUni1 82.649% 67.519% 73.976% 8.597 6.372 14.969 Brown roatelo pteGut1 82.523% 68.882% 74.326% 8.069 5.744 13.813 Yellow-throated sandgrouse eurHel1 82.426% 67.655% 74.252% 11.532 6.463 17.995 Sunbittern strCam1 82.389% 74.592% 75.802% 6.060 3.424 9.484 Ostrich melUnd1 81.694% 69.821% 72.543% 9.650 7.075 16.725 Budgerigar merNub1 81.018% 65.336% 72.239% 10.165 7.687 17.852 Northern carmine bee-eater apaVit1 81.010% 67.063% 72.793% 11.106 6.586 17.692 Bar tailed trogon colStr1 80.987% 64.311% 72.020% 9.799 7.358 17.157 Speckled mousebird corBra1 80.978% 69.377% 71.439% 9.428 6.535 15.963 American crow bucRhi1 80.812% 66.557% 72.242% 10.362 6.817 17.179 Rhinoceros hornbill pseHum1 80.622% 67.986% 70.770% 10.963 7.976 18.939 Tibetan ground jay amaVit1 80.577% 54.029% 72.003% 11.949 8.570 20.519 Parrot corCor1 80.574% 68.064% 71.052% 9.863 7.091 16.954 Hooded crow serCan1 79.903% 66.300% 69.457% 11.959 8.717 20.676 Canary calAnn1 79.733% 67.104% 69.779% 10.762 6.710 17.472 Anna's hummingbird acaChl1 79.489% 64.470% 70.154% 10.695 7.848 18.543 Rifleman ficAlb2 79.354% 66.501% 69.517% 15.282 11.456 26.738 Collared flycatcher geoFor1 79.299% 66.972% 69.271% 12.756 7.057 19.813 Medium ground finch zonAlb1 78.645% 65.813% 68.277% 6.904 6.904 13.808 White-throated sparrow araMac1 76.180% 43.932% 67.099% 14.834 10.677 25.511 Scarlet macaw picPub1 74.258% 60.446% 63.830% 11.784 9.288 21.072 Downy woodpecker taeGut2 73.989% 65.637% 68.089% 8.065 5.552 13.617 Zebra finch tinGut2 73.577% 62.604% 65.205% 6.488 4.756 11.244 White throated tinamou cotJap1 56.121% 40.600% 40.908% 6.704 5.528 12.232 Japanese quail chrPic2 51.724% 41.868% 46.369% 8.319 6.549 14.868 Painted turtle cheMyd1 51.019% 41.642% 45.898% 8.140 5.934 14.074 Green seaturtle allSin1 46.056% 38.551% 41.235% 6.564 3.924 10.488 Chinese alligator allMis1 44.801% 37.363% 40.998% 6.142 3.851 9.993 American alligator pelSin1 41.985% 31.150% 35.931% 10.105 7.710 17.815 Chinese softshell turtle apaSpi1 38.375% 6.975% 32.776% 14.136 10.592 24.728 Spiny softshell turtle galGal6 16.842% n/a n/a 0.000 0.000 0.000 Chicken pytBiv1 15.806% 6.802% 11.843% 5.409 4.474 9.883 Burmese python anoCar2 14.527% 8.896% 11.537% 8.322 6.316 14.638 Lizard thaSir1 11.754% 3.944% 8.850% 4.388 2.991 7.379 Garter snake xenTro9 8.102% 3.975% 6.270% 5.994 4.180 10.174 X. tropicalis xenLae2 7.927% 3.814% 6.549% 6.520 4.040 10.560 African clawed frog nanPar1 7.564% 3.096% 5.757% 4.114 1.901 6.015 Tibetan frog danRer11 6.352% 1.442% 5.199% 4.814 3.025 7.839 Zebrafish angJap1 5.408% 0.824% 4.100% 2.620 1.428 4.048 Japanese eel oreNil3 4.925% 1.433% 4.068% 2.422 1.826 4.248 Nile tilapia gasAcu1 4.623% 1.277% 3.605% 2.175 1.329 3.504 Stickleback oryLat2 4.565% 1.147% 3.596% 1.928 1.450 3.378 Medaka mayZeb1 4.539% 1.354% 3.720% 1.595 1.131 2.726 Zebra mbuna tetNig2 4.148% 1.011% 3.119% 1.717 1.150 2.867 Tetraodon fr3 4.007% 1.122% 3.149% 1.497 0.934 2.431 Fugu petMar3 3.658% 0.159% 2.467% 2.052 1.022 3.074 Lamprey letCam1 3.335% 0.147% 2.186% 1.467 0.958 2.425 Arctic lamprey # what that looks like: ~/kent/src/hg/utils/phyloTrees/asciiTree.pl galGal6.77way.nh | sed -e 's/^/# /;' # ((((((((((((((((((((((((galGal6:0.031254, # cotJap1:0.035):0.01, # melGal5:0.085718):0.011045, # tytAlb1:0.1):0.01, # bucRhi1:0.1):0.01, # anaPla1:0.1):0.01, # apaVit1:0.1):0.01, # calAnn1:0.1):0.01, # cucCan1:0.1):0.01, # chaVoc2:0.1):0.01, # fulGla1:0.1):0.01, # tauEry1:0.1):0.01, # opiHoa1:0.1):0.01, # phoRub1:0.1):0.01, # (colLiv1:0.1, # ((lepDis1:0.1, # merNub1:0.1):0.02, # ((pelCri1:0.1, # (phaCar1:0.1, # phaLep1:0.1):0.01):0.01, # ((pteGut1:0.1, # (nipNip1:0.1, # egrGar1:0.1):0.01):0.01, # ((pygAde1:0.1, # aptFor1:0.1):0.02, # (((((carCri1:0.1, # mesUni1:0.1):0.02, # eurHel1:0.1):0.02, # balPav1:0.1):0.02, # chlUnd1:0.1):0.02, # (((falChe1:0.1, # falPer1:0.1):0.01, # (aquChr2:0.1, # (halAlb1:0.05, # halLeu1:0.05):0.05):0.01):0.02, # ((((corBra1:0.035, # corCor1:0.035):0.035, # (acaChl1:0.06, # (ficAlb2:0.04, # (((serCan1:0.024, # zonAlb1:0.024457):0.01, # geoFor1:0.041261):0.015, # taeGut2:0.06):0.01):0.01):0.02):0.022066, # pseHum1:0.06):0.025, # (gavSte1:0.04, # (capCar1:0.04, # (melUnd1:0.046985, # (amaVit1:0.026, # araMac1:0.026):0.01):0.01):0.01):0.02):0.064703):0.01):0.01):0.01):0.01):0.01):0.01):0.05):0.02, # colStr1:0.2):0.02, # picPub1:0.2):0.02, # (strCam1:0.02, # tinGut2:0.02):0.02):0.16, # (allMis1:0.1, # allSin1:0.1):0.15):0.045143, # ((cheMyd1:0.1, # chrPic2:0.1):0.05, # (pelSin1:0.1, # apaSpi1:0.1):0.05):0.04):0.01, # ((pytBiv1:0.1, # thaSir1:0.2):0.1, # anoCar2:0.247):0.25):0.072, # (hg38:0.145908, # mm10:0.356483):0.460153):0.05, # ((xenTro9:0.177944, # xenLae2:0.2):0.1, # nanPar1:0.2):0.07):0.211354, # (((((tetNig2:0.124159, # fr3:0.203847):0.09759, # (oreNil3:0.1, # mayZeb1:0.25):0.1):0.09759, # oryLat2:0.48197):0.015, # gasAcu1:0.246413):0.27064, # (angJap1:0.45, # danRer11:0.530752):0.2):0.47032):0.2, # (petMar3:0.4, # letCam1:0.4):0.575747); # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ galGal6.77way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e 's/-/_/g;' | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ | sed -e "s#'#_x_#g;" > db.to.name.txt /cluster/bin/phast/tree_doctor --rename "`cat db.to.name.txt`" \ galGal6.77way.nh \ | sed -e 's/0\+)/)/g; s/0\+,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ | sed -e "s#_x_#'#g;" > galGal6.77way.commonNames.nh # verify names translate correctly, scan this output: sed -e 's/(//g; s/:.*//;' galGal6.77way.commonNames.nh \ | sed -e 's/ //g;' | sort | less cat galGal6.77way.commonNames.nh | sed -e 's/^/# /;' # ((((((((((((((((((((((((Chicken:0.031254, # Japanese_quail:0.035):0.01, # Turkey:0.085718):0.011045, # Barn_owl:0.1):0.01, # Rhinoceros_hornbill:0.1):0.01, # Mallard_duck:0.1):0.01, # Bar_tailed_trogon:0.1):0.01, # Anna's_hummingbird:0.1):0.01, # Common_cuckoo:0.1):0.01, # Killdeer:0.1):0.01, # Northern_fulmar:0.1):0.01, # Red_crested_turaco:0.1):0.01, # Hoatzin:0.1):0.01, # American_flamingo:0.1):0.01, # (Rock_pigeon:0.1, # ((Cuckoo_roller:0.1, # Northern_carmine_bee_eater:0.1):0.02, # ((Dalmatian_pelican:0.1, # (Great_cormorant:0.1, # White_tailed_tropicbird:0.1):0.01):0.01, # ((Yellow_throated_sandgrouse:0.1, # (Crested_ibis:0.1, # Little_egret:0.1):0.01):0.01, # ((Adelie_penguin:0.1, # Emperor_penguin:0.1):0.02, # (((((Red_legged_seriema:0.1, # Brown_roatelo:0.1):0.02, # Sunbittern:0.1):0.02, # Crowned_crain:0.1):0.02, # Houbara_bustard:0.1):0.02, # (((Saker_falcon:0.1, # Peregrine_falcon:0.1):0.01, # (Golden_eagle:0.1, # (White_tailed_eagle:0.05, # Bald_eagle:0.05):0.05):0.01):0.02, # ((((American_crow:0.035, # Hooded_crow:0.035):0.035, # (Rifleman:0.06, # (Collared_flycatcher:0.04, # (((Canary:0.024, # White_throated_sparrow:0.024457):0.01, # Medium_ground_finch:0.041261):0.015, # Zebra_finch:0.06):0.01):0.01):0.02):0.022066, # Tibetan_ground_jay:0.06):0.025, # (Red_throated_loon:0.04, # (Chuck_will's_widow:0.04, # (Budgerigar:0.046985, # (Parrot:0.026, # Scarlet_macaw:0.026):0.01):0.01):0.01):0.02):0.064703):0.01):0.01):0.01):0.01):0.01):0.01):0.05):0.02, # Speckled_mousebird:0.2):0.02, # Downy_woodpecker:0.2):0.02, # (Ostrich:0.02, # White_throated_tinamou:0.02):0.02):0.16, # (American_alligator:0.1, # Chinese_alligator:0.1):0.15):0.045143, # ((Green_seaturtle:0.1, # Painted_turtle:0.1):0.05, # (Chinese_softshell_turtle:0.1, # Spiny_softshell_turtle:0.1):0.05):0.04):0.01, # ((Burmese_python:0.1, # Garter_snake:0.2):0.1, # Lizard:0.247):0.25):0.072, # (Human:0.145908, # Mouse:0.356483):0.460153):0.05, # ((X__tropicalis:0.177944, # African_clawed_frog:0.2):0.1, # Tibetan_frog:0.2):0.07):0.211354, # (((((Tetraodon:0.124159, # Fugu:0.203847):0.09759, # (Nile_tilapia:0.1, # Zebra_mbuna:0.25):0.1):0.09759, # Medaka:0.48197):0.015, # Stickleback:0.246413):0.27064, # (Japanese_eel:0.45, # Zebrafish:0.530752):0.2):0.47032):0.2, # (Lamprey:0.4, # Arctic_lamprey:0.4):0.575747); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/galGal6_77way.png ~/kent/src/hg/utils/phyloTrees/scientificNames.sh galGal6.77way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > galGal6.77way.scientificNames.nh cat galGal6.77way.scientificNames.nh | sed -e 's/^/# /;' # ((((((((((((((((((((((((Gallus_gallus:0.031254, # Coturnix_japonica:0.035):0.01, # Meleagris_gallopavo:0.085718):0.011045, # Tyto_alba:0.1):0.01, # Buceros_rhinoceros_silvestris:0.1):0.01, # Anas_platyrhynchos:0.1):0.01, # Apaloderma_vittatum:0.1):0.01, # Calypte_anna:0.1):0.01, # Cuculus_canorus:0.1):0.01, # Charadrius_vociferus:0.1):0.01, # Fulmarus_glacialis:0.1):0.01, # Tauraco_erythrolophus:0.1):0.01, # Opisthocomus_hoazin:0.1):0.01, # Phoenicopterus_ruber_ruber:0.1):0.01, # (Columba_livia:0.1, # ((Leptosomus_discolor:0.1, # Merops_nubicus:0.1):0.02, # ((Pelecanus_crispus:0.1, # (Phalacrocorax_carbo:0.1, # Phaethon_lepturus:0.1):0.01):0.01, # ((Pterocles_gutturalis:0.1, # (Nipponia_nippon:0.1, # Egretta_garzetta:0.1):0.01):0.01, # ((Pygoscelis_adeliae:0.1, # Aptenodytes_forsteri:0.1):0.02, # (((((Cariama_cristata:0.1, # Mesitornis_unicolor:0.1):0.02, # Eurypyga_helias:0.1):0.02, # Balearica_pavonina_gibbericeps:0.1):0.02, # Chlamydotis_undulata_macqueenii:0.1):0.02, # (((Falco_cherrug:0.1, # Falco_peregrinus:0.1):0.01, # (Aquila_chrysaetos_canadensis:0.1, # (Haliaeetus_albicilla:0.05, # Haliaeetus_leucocephalus:0.05):0.05):0.01):0.02, # ((((Corvus_brachyrhynchos:0.035, # Corvus_cornix_cornix:0.035):0.035, # (Acanthisitta_chloris:0.06, # (Ficedula_albicollis:0.04, # (((Serinus_canaria:0.024, # Zonotrichia_albicollis:0.024457):0.01, # Geospiza_fortis:0.041261):0.015, # Taeniopygia_guttata:0.06):0.01):0.01):0.02):0.022066, # Pseudopodoces_humilis:0.06):0.025, # (Gavia_stellata:0.04, # (Caprimulgus_carolinensis:0.04, # (Melopsittacus_undulatus:0.046985, # (Amazona_vittata:0.026, # Ara_macao:0.026):0.01):0.01):0.01):0.02):0.064703):0.01):0.01):0.01):0.01):0.01):0.01):0.05):0.02, # Colius_striatus:0.2):0.02, # Picoides_pubescens:0.2):0.02, # (Struthio_camelus_australis:0.02, # Tinamus_guttatus:0.02):0.02):0.16, # (Alligator_mississippiensis:0.1, # Alligator_sinensis:0.1):0.15):0.045143, # ((Chelonia_mydas:0.1, # Chrysemys_picta_bellii:0.1):0.05, # (Pelodiscus_sinensis:0.1, # Apalone_spinifera:0.1):0.05):0.04):0.01, # ((Python_bivittatus:0.1, # Thamnophis_sirtalis:0.2):0.1, # Anolis_carolinensis:0.247):0.25):0.072, # (Homo_sapiens:0.145908, # Mus_musculus:0.356483):0.460153):0.05, # ((Xenopus_tropicalis:0.177944, # Xenopus_laevis:0.2):0.1, # Nanorana_parkeri:0.2):0.07):0.211354, # (((((Tetraodon_nigroviridis:0.124159, # Takifugu_rubripes:0.203847):0.09759, # (Oreochromis_niloticus:0.1, # Maylandia_zebra:0.25):0.1):0.09759, # Oryzias_latipes:0.48197):0.015, # Gasterosteus_aculeatus:0.246413):0.27064, # (Anguilla_japonica:0.45, # Danio_rerio:0.530752):0.2):0.47032):0.2, # (Petromyzon_marinus:0.4, # Lethenteron_camtschaticum:0.4):0.575747); /cluster/bin/phast/all_dists galGal6.77way.nh | grep galGal6 \ | sed -e "s/galGal6.//" | sort -k2n > 77way.distances.txt # Use this output to create the table below cat 77way.distances.txt | sed -e 's/^/# /;' # cotJap2 0.066254 # melGal5 0.126972 # tytAlb1 0.152299 # bucRhi1 0.162299 # anaPla1 0.172299 # apaVit1 0.182299 # calAnn1 0.192299 # cucCan1 0.202299 # chaVoc2 0.212299 # fulGla1 0.222299 # tauEry1 0.232299 # opiHoa1 0.242299 # phoRub1 0.252299 # strCam1 0.262299 # tinGut2 0.262299 # colLiv1 0.312299 # lepDis1 0.342299 # merNub1 0.342299 # pelCri1 0.342299 # phaCar1 0.352299 # phaLep1 0.352299 # pteGut1 0.352299 # egrGar1 0.362299 # nipNip1 0.362299 # aptFor1 0.372299 # pygAde1 0.372299 # chlUnd1 0.382299 # colStr1 0.382299 # gavSte1 0.397002 # aquChr2 0.402299 # balPav1 0.402299 # falChe1 0.402299 # falPer1 0.402299 # halAlb1 0.402299 # halLeu1 0.402299 # picPub1 0.402299 # capCar1 0.407002 # amaVit1 0.413002 # araMac1 0.413002 # pseHum1 0.422002 # eurHel1 0.422299 # melUnd1 0.423987 # carCri1 0.442299 # mesUni1 0.442299 # corBra1 0.454068 # corCor1 0.454068 # ficAlb2 0.454068 # acaChl1 0.464068 # serCan1 0.473068 # zonAlb1 0.473525 # geoFor1 0.480329 # taeGut2 0.484068 # apaSpi1 0.617442 # cheMyd1 0.617442 # chrPic2 0.617442 # pelSin1 0.617442 # allMis1 0.632299 # allSin1 0.632299 # nanPar1 0.829442 # pytBiv1 0.887442 # xenTro9 0.907386 # xenLae2 0.929442 # anoCar2 0.934442 # thaSir1 0.987442 # hg38 1.115503 # mm10 1.326078 # gasAcu1 1.758169 # oreNil3 1.824346 # tetNig2 1.846095 # angJap1 1.891116 # fr3 1.925783 # letCam1 1.946543 # petMar3 1.946543 # danRer11 1.971868 # mayZeb1 1.974346 # oryLat2 2.008726 printf '#!/usr/bin/env perl use strict; use warnings; open (FH, "<77way.distances.txt") or die "can not read 77way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($D, $dist) = split('"'"'\\s+'"'"', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/galGal6/bed/lastz.$D/fb.galGal6." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\\%%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.galGal6/fb.${D}.chainGalGal6Link.txt"; my $swapMeasure = "0"; if ( -s $swapFile ) { $swapMeasure = `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\\%%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='"'"'$D'"'"';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %%02d %%.4f (%%%% %%06.3f) (%%%% %%06.3f) - %%s %%s\\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); ' > sizeStats.pl chmod +x ./sizeStats.pl ./sizeStats.pl # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # Have not yet done the swaps for this situation # featureBits chainLink measures # chainLink # N distance on galGal6 on other other species # 01 0.0663 (% 90.071) (% 96.366) - Japanese quail cotJap2 # 02 0.1270 (% 94.495) (% 92.491) - Turkey melGal5 # 03 0.1523 (% 84.122) (% 79.540) - Barn owl tytAlb1 # 04 0.1623 (% 80.812) (% 78.727) - Rhinoceros hornbill bucRhi1 # 05 0.1723 (% 85.509) (% 83.814) - Mallard duck anaPla1 # 06 0.1823 (% 81.010) (% 79.322) - Bar tailed trogon apaVit1 # 07 0.1923 (% 79.733) (% 79.955) - Anna's hummingbird calAnn1 # 08 0.2023 (% 83.150) (% 78.921) - Common cuckoo cucCan1 # 09 0.2123 (% 86.267) (% 78.316) - Killdeer chaVoc2 # 10 0.2223 (% 85.332) (% 78.234) - Northern fulmar fulGla1 # 11 0.2323 (% 84.485) (% 77.102) - Red crested turaco tauEry1 # 12 0.2423 (% 84.579) (% 78.069) - Hoatzin opiHoa1 # 13 0.2523 (% 84.793) (% 78.057) - American flamingo phoRub1 # 14 0.2623 (% 82.389) (% 74.035) - Ostrich strCam1 # 15 0.2623 (% 73.577) (% 75.473) - White throated tinamou tinGut2 # 16 0.3123 (% 83.073) (% 79.480) - Rock pigeon colLiv1 # 17 0.3423 (% 85.021) (% 78.924) - Cuckoo roller lepDis1 # 18 0.3423 (% 81.018) (% 79.394) - Northern carmine bee-eater merNub1 # 19 0.3423 (% 85.344) (% 77.377) - Dalmatian pelican pelCri1 # 20 0.3523 (% 84.182) (% 78.405) - Great cormorant phaCar1 # 21 0.3523 (% 84.519) (% 78.068) - White-tailed tropicbird phaLep1 # 22 0.3523 (% 82.523) (% 79.317) - Yellow-throated sandgrouse pteGut1 # 23 0.3623 (% 85.589) (% 77.936) - Little egret egrGar1 # 24 0.3623 (% 86.903) (% 78.539) - Crested ibis nipNip1 # 25 0.3723 (% 86.834) (% 77.790) - Emperor penguin aptFor1 # 26 0.3723 (% 85.892) (% 77.561) - Adelie penguin pygAde1 # 27 0.3823 (% 83.041) (% 78.986) - Houbara bustard chlUnd1 # 28 0.3823 (% 80.987) (% 78.528) - Speckled mousebird colStr1 # 29 0.3970 (% 84.717) (% 78.429) - Red-throated loon gavSte1 # 30 0.4023 (% 87.687) (% 78.786) - Golden eagle aquChr2 # 31 0.4023 (% 84.687) (% 78.619) - Crowned crain balPav1 # 32 0.4023 (% 86.342) (% 79.770) - Saker falcon falChe1 # 33 0.4023 (% 86.500) (% 79.824) - Peregrine falcon falPer1 # 34 0.4023 (% 84.997) (% 78.352) - White-tailed eagle halAlb1 # 35 0.4023 (% 86.923) (% 78.856) - Bald eagle halLeu1 # 36 0.4023 (% 74.258) (% 69.321) - Downy woodpecker picPub1 # 37 0.4070 (% 83.573) (% 77.901) - Chuck-will's-widow capCar1 # 38 0.4130 (% 80.577) (% 77.539) - Parrot amaVit1 # 39 0.4130 (% 76.180) (% 77.878) - Scarlet macaw araMac1 # 40 0.4220 (% 80.622) (% 80.898) - Tibetan ground jay pseHum1 # 41 0.4223 (% 82.426) (% 78.684) - Sunbittern eurHel1 # 42 0.4240 (% 81.694) (% 78.351) - Budgerigar melUnd1 # 43 0.4423 (% 85.094) (% 78.856) - Red-legged seriema carCri1 # 44 0.4423 (% 82.649) (% 79.029) - Brown roatelo mesUni1 # 45 0.4541 (% 80.978) (% 80.022) - American crow corBra1 # 46 0.4541 (% 80.574) (% 80.730) - Hooded crow corCor1 # 47 0.4541 (% 79.354) (% 75.638) - Collared flycatcher ficAlb2 # 48 0.4641 (% 79.489) (% 79.366) - Rifleman acaChl1 # 49 0.4731 (% 79.903) (% 74.870) - Canary serCan1 # 50 0.4735 (% 78.645) (% 79.381) - White-throated sparrow zonAlb1 # 51 0.4803 (% 79.299) (% 79.439) - Medium ground finch geoFor1 # 52 0.4841 (% 73.989) (% 73.358) - Zebra finch taeGut2 # 53 0.6174 (% 38.375) (% 25.398) - Spiny softshell turtle apaSpi1 # 54 0.6174 (% 51.019) (% 33.557) - Green seaturtle cheMyd1 # 55 0.6174 (% 51.724) (% 33.105) - Painted turtle chrPic2 # 56 0.6174 (% 41.985) (% 25.358) - Chinese softshell turtle pelSin1 # 57 0.6323 (% 44.801) (% 24.041) - American alligator allMis1 # 58 0.6323 (% 46.056) (% 23.607) - Chinese alligator allSin1 # 59 0.8294 (% 07.564) (% 04.273) - Tibetan frog nanPar1 # 60 0.8874 (% 15.806) (% 11.426) - Burmese python pytBiv1 # 61 0.9074 (% 08.102) (% 06.961) - X. tropicalis xenTro9 # 62 0.9294 (% 07.927) (% 05.220) - African clawed frog xenLae2 # 63 0.9344 (% 14.527) (% 08.505) - Lizard anoCar2 # 64 0.9874 (% 11.754) (% 09.922) - Garter snake thaSir1 # 65 1.1155 (% 11.459) (% 04.977) - Human hg38 # 66 1.3261 (% 08.388) (% 03.813) - Mouse mm10 # 67 1.7582 (% 04.623) (% 12.925) - Stickleback gasAcu1 # 68 1.8243 (% 04.925) (% 06.828) - Nile tilapia oreNil3 # 69 1.8461 (% 04.148) (% 15.780) - Tetraodon tetNig2 # 70 1.8911 (% 05.408) (% 07.077) - Japanese eel angJap1 # 71 1.9258 (% 04.007) (% 13.581) - Fugu fr3 # 72 1.9465 (% 03.335) (% 04.611) - Arctic lamprey letCam1 # 73 1.9465 (% 03.658) (% 04.837) - Lamprey petMar3 # 74 1.9719 (% 06.352) (% 05.913) - Zebrafish danRer11 # 75 1.9743 (% 04.539) (% 07.874) - Zebra mbuna mayZeb1 # 76 2.0087 (% 04.565) (% 07.741) - Medaka oryLat2 # None of this concern for distances matters in building the first step, the # maf files. The distances will be better calibrated later. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ galGal6.77way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list cat species.list | fold -s -w 77 | sed -e 's/^/# /;' # galGal6 cotJap2 melGal5 tytAlb1 bucRhi1 anaPla1 apaVit1 calAnn1 cucCan1 # chaVoc2 fulGla1 tauEry1 opiHoa1 phoRub1 colLiv1 lepDis1 merNub1 pelCri1 # phaCar1 phaLep1 pteGut1 nipNip1 egrGar1 pygAde1 aptFor1 carCri1 mesUni1 # eurHel1 balPav1 chlUnd1 falChe1 falPer1 aquChr2 halAlb1 halLeu1 corBra1 # corCor1 acaChl1 ficAlb2 serCan1 zonAlb1 geoFor1 taeGut2 pseHum1 gavSte1 # capCar1 melUnd1 amaVit1 araMac1 colStr1 picPub1 strCam1 tinGut2 allMis1 # allSin1 cheMyd1 chrPic2 pelSin1 apaSpi1 pytBiv1 thaSir1 anoCar2 hg38 mm10 # xenTro9 xenLae2 nanPar1 tetNig2 fr3 oreNil3 mayZeb1 oryLat2 gasAcu1 angJap1 # danRer11 petMar3 letCam1 # take an N50 survery to see if the quality of these genomes makes a mark rm -f /dev/shm/n50.txt printf "# db\tN50\tctg count\tN50 size\n" > n50.survey.txt cat species.list.txt | while read db do n50.pl /hive/data/genomes/$db/chrom.sizes > /dev/shm/n50.txt 2>&1 ctgCount=`grep "contig count" /dev/shm/n50.txt | sed -e 's/,//g;' | awk '{print $4}'` n50=`tail -1 /dev/shm/n50.txt | awk '{print $2}'` n50Size=`tail -1 /dev/shm/n50.txt | awk '{print $4}'` printf "%s\t%d\t%d\t%d\n" "$db" "$n50" "$ctgCount" "$n50Size" rm -f /dev/shm/n50.txt done >> n50.survey.txt # joining that with the chainLink featureBits measurements cd /hive/data/genomes/galGal6/bed/multiz75way/lastzRun ./joinChainBits.sh > \ /hive/data/genomes/galGal6/bed/multiz77way/chainBits.txt sort chainBits.txt | join -t$'\t' - <(sort n50.survey.txt) \ | awk -F$'\t' '{printf "%s\t%7s\t%7s\t%7s\t%5d\t%6d\t%8d %s\n", $1,$2,$3,$4,$9,$10,$11, $8}' melGal5 94.495% 67.751% 86.165% 6 231286 59006440 Turkey cotJap2 90.071% 79.633% 80.159% 4 2012 82189922 Japanese quail aquChr2 87.687% 78.755% 80.369% 40 1141 9230743 Golden eagle halLeu1 86.923% 77.776% 79.411% 40 1023 9145499 Bald eagle nipNip1 86.903% 77.318% 79.492% 66 59555 5211696 Crested ibis aptFor1 86.834% 78.454% 79.601% 71 10672 5071598 Emperor penguin falPer1 86.500% 76.966% 78.677% 90 7021 3935757 Peregrine falcon falChe1 86.342% 76.498% 78.516% 84 5863 4154532 Saker falcon chaVoc2 86.267% 77.101% 78.919% 98 15167 3657050 Killdeer pygAde1 85.892% 77.734% 78.841% 72 19264 5118896 Adelie penguin egrGar1 85.589% 76.499% 78.042% 113 11791 3067157 Little egret anaPla1 85.509% 74.674% 76.829% 268 78488 1233631 Mallard duck pelCri1 85.344% 72.186% 78.202% 7852 63982 43364 Dalmatian pelican fulGla1 85.332% 73.091% 78.108% 7016 57389 47208 Northern fulmar carCri1 85.094% 72.651% 77.569% 5958 53474 55197 Red-legged seriema lepDis1 85.021% 72.372% 77.495% 5157 57160 62640 Cuckoo roller halAlb1 84.997% 73.173% 77.620% 5758 50905 57319 White-tailed eagle phoRub1 84.793% 70.837% 77.774% 8592 76190 38071 American flamingo gavSte1 84.717% 72.183% 77.534% 7095 61831 45523 Red-throated loon balPav1 84.687% 72.017% 77.336% 6181 53491 52178 Crowned crain opiHoa1 84.579% 74.775% 76.791% 122 10256 2937227 Hoatzin phaLep1 84.519% 70.871% 77.263% 6817 66785 47896 White-tailed tropicbird tauEry1 84.485% 71.434% 76.784% 5784 59587 56334 Red crested turaco phaCar1 84.182% 70.890% 76.937% 6715 64312 48427 Great cormorant tytAlb1 84.122% 70.407% 76.219% 5943 62122 52818 Barn owl capCar1 83.573% 68.596% 75.378% 6762 70122 46345 Chuck-will's-widow cucCan1 83.150% 72.094% 74.353% 119 14930 2989832 Common cuckoo colLiv1 83.073% 72.569% 74.370% 82 14923 3148738 Rock pigeon chlUnd1 83.041% 69.052% 75.109% 6888 59693 45221 Houbara bustard mesUni1 82.649% 67.519% 73.976% 6540 67520 47102 Brown roatelo pteGut1 82.523% 68.882% 74.326% 6164 58607 49530 Yellow-throated sandgrouse eurHel1 82.426% 67.655% 74.252% 6386 62699 47243 Sunbittern strCam1 82.389% 74.592% 75.802% 101 6915 3593425 Ostrich melUnd1 81.694% 69.821% 72.543% 28 25212 10614383 Budgerigar merNub1 81.018% 65.336% 72.239% 6132 53499 48089 Northern carmine bee-eater apaVit1 81.010% 67.063% 72.793% 5440 54728 56673 Bar tailed trogon colStr1 80.987% 64.311% 72.020% 6512 70188 46063 Speckled mousebird corBra1 80.978% 69.377% 71.439% 49 10547 6953989 American crow bucRhi1 80.812% 66.557% 72.242% 5769 62257 53203 Rhinoceros hornbill pseHum1 80.622% 67.986% 70.770% 16 5406 16337386 Tibetan ground jay amaVit1 80.577% 54.029% 72.003% 16505 182974 19239 Parrot corCor1 80.574% 68.064% 71.052% 19 1298 16358221 Hooded crow serCan1 79.903% 66.300% 69.457% 13 304397 17815079 Canary calAnn1 79.733% 67.104% 69.779% 81 54736 4052191 Anna's hummingbird acaChl1 79.489% 64.470% 70.154% 4486 53876 64469 Rifleman ficAlb2 79.354% 66.501% 69.517% 6 21428 64724594 Collared flycatcher geoFor1 79.299% 66.972% 69.271% 54 27239 5255844 Medium ground finch zonAlb1 78.645% 65.813% 68.277% 52 6018 4866725 White-throated sparrow araMac1 76.180% 43.932% 67.099% 21024 192790 15974 Scarlet macaw picPub1 74.258% 60.446% 63.830% 168 31254 2093929 Downy woodpecker taeGut2 73.989% 65.637% 68.089% 7 37096 62374962 Zebra finch tinGut2 73.577% 62.604% 65.205% 1147 82514 246268 White throated tinamou chrPic2 51.724% 41.868% 46.369% 76 78595 7072151 Painted turtle cheMyd1 51.019% 41.642% 45.898% 158 140023 3864108 Green seaturtle allSin1 46.056% 38.551% 41.235% 325 9317 2188296 Chinese alligator allMis1 44.801% 37.363% 40.998% 1206 14645 508966 American alligator pelSin1 41.985% 31.150% 35.931% 189 19904 3350749 Chinese softshell turtle apaSpi1 38.375% 6.975% 32.776% 225 286620 2306994 Spiny softshell turtle galGal6 16.842% n/a n/a 4 464 91315245 Chicken pytBiv1 15.806% 6.802% 11.843% 1939 39113 213970 Burmese python anoCar2 14.527% 8.896% 11.537% 5 6457 150641573 Lizard thaSir1 11.754% 3.944% 8.850% 639 7930 647592 Garter snake xenTro9 8.102% 3.975% 6.270% 5 6822 135134832 X. tropicalis xenLae2 7.927% 3.814% 6.549% 9 108033 136570856 African clawed frog nanPar1 7.564% 3.096% 5.757% 546 25187 1069101 Tibetan frog danRer11 6.352% 1.442% 5.199% 14 1923 52186027 Zebrafish angJap1 5.408% 0.824% 4.100% 4463 323727 52849 Japanese eel oreNil3 4.925% 1.433% 4.068% 12 2567 37007722 Nile tilapia gasAcu1 4.623% 1.277% 3.605% 8 23 20083130 Stickleback oryLat2 4.565% 1.147% 3.596% 14 7189 29908082 Medaka mayZeb1 4.539% 1.354% 3.720% 62 3725 3702874 Zebra mbuna tetNig2 4.148% 1.011% 3.119% 5 27 13390619 Tetraodon fr3 4.007% 1.122% 3.149% 14 6835 11516971 Fugu petMar3 3.658% 0.159% 2.467% 34 12062 11932523 Lamprey letCam1 3.335% 0.147% 2.186% 203 86126 1051965 Arctic lamprey # rules for selection: # 1. contig count < 100 and chain bits over 30 == syntenic net # 2. contig count >= 100 and recipBest chain bits > syntenic chain bits # 3. all others # bash shell syntax here ... cd /hive/data/genomes/galGal6/bed/multiz77way export H=/hive/data/genomes/galGal6/bed mkdir mafLinks # this did not work for melGal5, it needs to be recip best # good, phylo close assemblies can use syntenic net: for G in chaVoc2 falPer1 falChe1 colLiv1 calAnn1 chrPic2 pygAde1 aptFor1 nipNip1 geoFor1 zonAlb1 corBra1 aquChr2 halLeu1 melUnd1 corCor1 pseHum1 serCan1 taeGut2 ficAlb2 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/axtChain/galGal6.${G}.synNet.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/axtChain/galGal6.${G}.synNet.maf.gz ./mafLinks/$G done # assemblies using recip best net: # for G in melGal5 araMac1 amaVit1 phoRub1 pelCri1 gavSte1 fulGla1 chlUnd1 phaLep1 capCar1 phaCar1 mesUni1 colStr1 eurHel1 balPav1 pteGut1 merNub1 carCri1 tytAlb1 tauEry1 bucRhi1 halAlb1 apaVit1 lepDis1 acaChl1 cotJap2 allMis1 tinGut2 allSin1 anaPla1 apaSpi1 pelSin1 picPub1 cheMyd1 opiHoa1 cucCan1 egrGar1 strCam1 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/mafRBestNet/galGal6.${G}.rbest.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/mafRBestNet/galGal6.${G}.rbest.maf.gz ./mafLinks/$G done # all other assemblies # for G in angJap1 anoCar2 danRer11 fr3 gasAcu1 hg38 letCam1 mayZeb1 mm10 nanPar1 oreNil3 oryLat2 petMar3 pytBiv1 tetNig2 thaSir1 xenLae2 xenTro9 do mkdir mafLinks/$G rm -f mafLinks/$G/galGal6.${G}.maf.gz echo ln -s ${H}/lastz.$G/mafNet/galGal6.${G}.net.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/mafNet/galGal6.${G}.net.maf.gz ./mafLinks/$G done # verify the symLinks are good: ls -ogL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' # 550909609 Oct 17 08:50 mafLinks/acaChl1/galGal6.acaChl1.rbest.maf.gz # 340992517 Oct 17 13:55 mafLinks/allMis1/galGal6.allMis1.rbest.maf.gz # 342667846 Oct 17 21:37 mafLinks/allSin1/galGal6.allSin1.rbest.maf.gz # 563271633 Oct 18 03:00 mafLinks/amaVit1/galGal6.amaVit1.rbest.maf.gz # 580046372 Oct 18 03:47 mafLinks/anaPla1/galGal6.anaPla1.rbest.maf.gz # 43105346 Oct 17 00:36 mafLinks/angJap1/galGal6.angJap1.net.maf.gz # 117472326 Oct 17 13:18 mafLinks/anoCar2/galGal6.anoCar2.net.maf.gz # 282784817 Oct 18 07:33 mafLinks/apaSpi1/galGal6.apaSpi1.rbest.maf.gz # 568597883 Oct 18 04:31 mafLinks/apaVit1/galGal6.apaVit1.rbest.maf.gz # 581862384 Oct 17 21:48 mafLinks/aptFor1/galGal6.aptFor1.synNet.maf.gz # 584764623 Oct 17 18:39 mafLinks/aquChr2/galGal6.aquChr2.synNet.maf.gz # 528701986 Oct 18 08:15 mafLinks/araMac1/galGal6.araMac1.rbest.maf.gz # 588747734 Oct 18 06:28 mafLinks/balPav1/galGal6.balPav1.rbest.maf.gz # 564247198 Oct 18 08:13 mafLinks/bucRhi1/galGal6.bucRhi1.rbest.maf.gz # 515661194 Oct 18 01:42 mafLinks/calAnn1/galGal6.calAnn1.synNet.maf.gz # 579655272 Oct 18 08:10 mafLinks/capCar1/galGal6.capCar1.rbest.maf.gz # 590069847 Oct 18 14:26 mafLinks/carCri1/galGal6.carCri1.rbest.maf.gz # 576325192 Oct 18 11:12 mafLinks/chaVoc2/galGal6.chaVoc2.synNet.maf.gz # 391365585 Oct 18 16:21 mafLinks/cheMyd1/galGal6.cheMyd1.rbest.maf.gz # 577326896 Oct 18 21:50 mafLinks/chlUnd1/galGal6.chlUnd1.rbest.maf.gz # 343405712 Oct 18 17:01 mafLinks/chrPic2/galGal6.chrPic2.synNet.maf.gz # 549943546 Oct 18 19:07 mafLinks/colLiv1/galGal6.colLiv1.synNet.maf.gz # 565420162 Oct 19 07:38 mafLinks/colStr1/galGal6.colStr1.rbest.maf.gz # 530934943 Oct 19 02:27 mafLinks/corBra1/galGal6.corBra1.synNet.maf.gz # 520184560 Oct 19 03:07 mafLinks/corCor1/galGal6.corCor1.synNet.maf.gz # 528131533 Oct 27 03:23 mafLinks/cotJap2/galGal6.cotJap2.rbest.maf.gz # 580850919 Oct 19 17:12 mafLinks/cucCan1/galGal6.cucCan1.rbest.maf.gz # 49151285 Oct 17 11:33 mafLinks/danRer11/galGal6.danRer11.net.maf.gz # 595947114 Oct 19 16:31 mafLinks/egrGar1/galGal6.egrGar1.rbest.maf.gz # 574655612 Oct 20 01:21 mafLinks/eurHel1/galGal6.eurHel1.rbest.maf.gz # 576260250 Oct 19 19:43 mafLinks/falChe1/galGal6.falChe1.synNet.maf.gz # 580031609 Oct 19 19:43 mafLinks/falPer1/galGal6.falPer1.synNet.maf.gz # 514039362 Oct 17 15:15 mafLinks/ficAlb2/galGal6.ficAlb2.synNet.maf.gz # 31624791 Oct 17 10:24 mafLinks/fr3/galGal6.fr3.net.maf.gz # 589724584 Oct 19 23:48 mafLinks/fulGla1/galGal6.fulGla1.rbest.maf.gz # 36032098 Oct 17 00:19 mafLinks/gasAcu1/galGal6.gasAcu1.net.maf.gz # 586015776 Oct 20 00:01 mafLinks/gavSte1/galGal6.gavSte1.rbest.maf.gz # 516211209 Oct 19 19:33 mafLinks/geoFor1/galGal6.geoFor1.synNet.maf.gz # 585880621 Oct 20 07:20 mafLinks/halAlb1/galGal6.halAlb1.rbest.maf.gz # 578164364 Oct 20 03:41 mafLinks/halLeu1/galGal6.halLeu1.synNet.maf.gz # 89571721 Oct 12 11:13 mafLinks/hg38/galGal6.hg38.net.maf.gz # 593570207 Oct 20 10:38 mafLinks/lepDis1/galGal6.lepDis1.rbest.maf.gz # 26394724 Oct 20 07:50 mafLinks/letCam1/galGal6.letCam1.net.maf.gz # 36546409 Oct 20 08:13 mafLinks/mayZeb1/galGal6.mayZeb1.net.maf.gz # 561934526 Oct 17 16:37 mafLinks/melGal5/galGal6.melGal5.rbest.maf.gz # 531860577 Oct 20 11:25 mafLinks/melUnd1/galGal6.melUnd1.synNet.maf.gz # 564779170 Oct 20 19:31 mafLinks/merNub1/galGal6.merNub1.rbest.maf.gz # 572813818 Oct 20 19:14 mafLinks/mesUni1/galGal6.mesUni1.rbest.maf.gz # 66128061 Oct 12 11:14 mafLinks/mm10/galGal6.mm10.net.maf.gz # 59879758 Oct 20 18:33 mafLinks/nanPar1/galGal6.nanPar1.net.maf.gz # 574542147 Oct 20 19:56 mafLinks/nipNip1/galGal6.nipNip1.synNet.maf.gz # 588192552 Oct 21 00:33 mafLinks/opiHoa1/galGal6.opiHoa1.rbest.maf.gz # 39391134 Oct 20 19:02 mafLinks/oreNil3/galGal6.oreNil3.net.maf.gz # 36204778 Oct 20 19:43 mafLinks/oryLat2/galGal6.oryLat2.net.maf.gz # 591870531 Oct 21 03:00 mafLinks/pelCri1/galGal6.pelCri1.rbest.maf.gz # 309458982 Oct 21 06:38 mafLinks/pelSin1/galGal6.pelSin1.rbest.maf.gz # 28516579 Oct 17 00:21 mafLinks/petMar3/galGal6.petMar3.net.maf.gz # 587063102 Oct 21 05:34 mafLinks/phaCar1/galGal6.phaCar1.rbest.maf.gz # 590674551 Oct 21 05:52 mafLinks/phaLep1/galGal6.phaLep1.rbest.maf.gz # 587486272 Oct 21 07:45 mafLinks/phoRub1/galGal6.phoRub1.rbest.maf.gz # 515840610 Oct 21 12:20 mafLinks/picPub1/galGal6.picPub1.rbest.maf.gz # 524268007 Oct 21 06:02 mafLinks/pseHum1/galGal6.pseHum1.synNet.maf.gz # 570738640 Oct 21 13:38 mafLinks/pteGut1/galGal6.pteGut1.rbest.maf.gz # 576330919 Oct 21 08:25 mafLinks/pygAde1/galGal6.pygAde1.synNet.maf.gz # 127546060 Oct 21 07:27 mafLinks/pytBiv1/galGal6.pytBiv1.net.maf.gz # 510037529 Oct 21 11:02 mafLinks/serCan1/galGal6.serCan1.synNet.maf.gz # 590908607 Oct 21 18:05 mafLinks/strCam1/galGal6.strCam1.rbest.maf.gz # 496310398 Oct 17 12:22 mafLinks/taeGut2/galGal6.taeGut2.synNet.maf.gz # 588405382 Oct 21 20:07 mafLinks/tauEry1/galGal6.tauEry1.rbest.maf.gz # 32933480 Oct 21 12:56 mafLinks/tetNig2/galGal6.tetNig2.net.maf.gz # 95906779 Oct 21 15:00 mafLinks/thaSir1/galGal6.thaSir1.net.maf.gz # 524666652 Oct 17 20:11 mafLinks/tinGut2/galGal6.tinGut2.rbest.maf.gz # 581192491 Oct 21 22:28 mafLinks/tytAlb1/galGal6.tytAlb1.rbest.maf.gz # 62256127 Oct 21 16:29 mafLinks/xenLae2/galGal6.xenLae2.net.maf.gz # 63053844 Oct 17 09:31 mafLinks/xenTro9/galGal6.xenTro9.net.maf.gz # 507711592 Oct 16 21:54 mafLinks/zonAlb1/galGal6.zonAlb1.synNet.maf.gz # need to split these things up into smaller pieces for # efficient kluster run. mkdir /hive/data/genomes/galGal6/bed/multiz77way/mafSplit cd /hive/data/genomes/galGal6/bed/multiz77way/mafSplit # mafSplitPos splits on gaps or repeat areas that will not have # any chains, approx 5 Mbp intervals, gaps at least 10,000 mafSplitPos -minGap=10000 galGal6 5 stdout | sort -u \ | sort -k1,1 -k2,2n > mafSplit.bed # see also multiz77way.txt for more discussion of this procedure # run a kluster job to split them all ssh ku cd /hive/data/genomes/galGal6/bed/multiz77way/mafSplit printf '#!/bin/csh -ef set G = $1 set M = $2 mkdir -p $G pushd $G > /dev/null if ( -s galGal6_${M}.00.maf ) then /bin/rm -f galGal6_${M}.*.maf endif /cluster/bin/x86_64/mafSplit ../mafSplit.bed galGal6_ ../../mafLinks/${G}/${M}.maf.gz /bin/gzip galGal6_*.maf popd > /dev/null ' > runOne chmod +x runOne printf '#LOOP runOne $(dir1) $(file1) {check out exists+ $(dir1)/galGal6_chr1.00.maf.gz} #ENDLOOP ' > template find ../mafLinks -type l | awk -F'/' '{printf "%s/%s\n", $3,$4}' \ | sed -e 's/.maf.gz//;' > maf.list gensub2 maf.list single template jobList para -ram=16g create jobList para try ... check ... push ... etc... # Completed: 76 of 76 jobs # CPU time in finished jobs: 22801s 380.02m 6.33h 0.26d 0.001 y # IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y # Average job time: 294s 4.89m 0.08h 0.00d # Longest finished job: 437s 7.28m 0.12h 0.01d # Submission to last job: 1648s 27.47m 0.46h 0.02d # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | grep "maf.gz" | wc -l # 11509 find . -type f | grep ".maf.gz$" | xargs -L 1 basename | sort -u \ > run.maf.list wc -l run.maf.list # 280 run.maf.list # number of chroms with data: awk -F'.' '{print $1}' run.maf.list | sed -e 's/galGal6_//;' \ | sort | uniq -c | sort -n | wc -l # 247 mkdir /hive/data/genomes/galGal6/bed/multiz77way/splitRun cd /hive/data/genomes/galGal6/bed/multiz77way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = galGal6 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/galGal6/bed/multiz77way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /dev/shm/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh printf '#LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/galGal6/bed/multiz77way/splitRun/maf/$(root1)} #ENDLOOP ' > template ln -s ../../mafSplit/run.maf.list maf.list ssh ku cd /hive/data/genomes/galGal6/bed/multiz77way/splitRun/run gensub2 maf.list single template jobList # some of these jobs get to be pretty large in memory para -ram=48g create jobList para try ... check ... push ... etc... # Completed: 280 of 280 jobs # CPU time in finished jobs: 6297642s 104960.70m 1749.35h 72.89d 0.200 y # IO & Wait Time: 7278s 121.30m 2.02h 0.08d 0.000 y # Average job time: 22518s 375.29m 6.25h 0.26d # Longest finished job: 464353s 7739.22m 128.99h 5.37d # Submission to last job: 464428s 7740.47m 129.01h 5.38d # one of these jobs took a really really long time and # made a gigantic result file: # -rw-rw-r-- 1 12482236715 Nov 18 03:27 galGal6_chr4.01.maf # put the split maf results back together into a single per-chrom maf file # eliminate duplicate comments ssh hgwdev cd /hive/data/genomes/galGal6/bed/multiz77way/splitRun mkdir ../maf # no need to save the comments since they are lost with mafAddIRows cat << '_EOF_' >> runOne printf '#!/bin/csh -fe set C = $1 if ( -s ../mafRBest/${C}.maf.gz ) then rm -f ../mafRBest/${C}.maf.gz endif if ( -s maf/galGal6_${C}.00.maf ) then head -q -n 1 maf/galGal6_${C}.00.maf | sort -u > ../mafRBest/${C}.maf grep -h -v "^#" `ls maf/galGal6_${C}.*.maf | sort -t. -k2,2n` >> ../mafRBest/${C}.maf tail -q -n 1 maf/galGal6_${C}.00.maf | sort -u >> ../mafRBest/${C}.maf else touch ../mafRBest/${C}.maf endif ' > runOne chmod +x runOne printf '#LOOP runOne $(root1) {check out exists ../mafRBest/$(root1).maf} #ENDLOOP ' > template cut -f1 ../../../chrom.sizes > chr.list ssh ku cd /hive/data/genomes/galGal6/bed/multiz77way/splitRun gensub2 chr.list single template jobList para -ram=16g create jobList para try ... check ... push ... etc ... para -maxJob=32 push # Completed: 464 of 464 jobs # CPU time in finished jobs: 1076s 17.93m 0.30h 0.01d 0.000 y # IO & Wait Time: 1711s 28.52m 0.48h 0.02d 0.000 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest finished job: 226s 3.77m 0.06h 0.00d # Submission to last job: 231s 3.85m 0.06h 0.00d cd /hive/data/genomes/galGal6/bed/multiz77way/maf # about 219 of them have empty results, they can be removed ls -ogrt | awk '$3 == 0' | awk '{print $NF}' | xargs rm -f # Load into database mkdir -p /gbdb/galGal6/multiz77way/maf cd /hive/data/genomes/galGal6/bed/multiz77way/maf ln -s `pwd`/*.maf /gbdb/galGal6/multiz77way/maf/ # this generates an immense multiz77way.tab file in the directory # where it is running. Best to run this over in scratch. # This is going to take all day. cd /dev/shm time hgLoadMaf -pathPrefix=/gbdb/galGal6/multiz77way/maf galGal6 multiz77way # needed to run this twice, first time had an error: # Indexing and tabulating /gbdb/galGal6/multiz77way/maf/chrZ.maf # Mysql error during sqlTableExists(extFile) 2013: Lost connection to MySQL server during query # real 23m24.464s # second time success: # Loaded 57517733 mafs in 245 files from /gbdb/galGal6/multiz77way/maf # real 25m50.757s time (cat /gbdb/galGal6/multiz77way/maf/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 galGal6 multiz77waySummary stdin) # Created 3725053 summary blocks from 2028742538 components and 58099551 mafs from stdin # real 64m55.663s # -rw-rw-r-- 1 3002803804 Nov 27 10:07 multiz77way.tab # -rw-rw-r-- 1 178303217 Nov 27 14:09 multiz77waySummary.tab wc -l multiz77*.tab # 58099551 multiz77way.tab # 3725053 multiz77waySummary.tab rm multiz77way*.tab ####################################################################### # GAP ANNOTATE MULTIZ77WAY MAF AND LOAD TABLES (DONE - 2017-11-26 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/galGal6/bed/multiz77way/anno cd /hive/data/genomes/galGal6/bed/multiz77way/anno # check for N.bed files everywhere: for DB in `cat ../species.list` do if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi cd /hive/data/genomes/galGal6/bed/multiz77way/anno done cd /hive/data/genomes/galGal6/bed/multiz77way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL *.bed | wc -l # 77 screen -S galGal6 # use a screen to control this longish job ssh ku cd /hive/data/genomes/galGal6/bed/multiz77way/anno mkdir result printf '#LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/galGal6/galGal6.2bit {check out line+ result/$(file1)} #ENDLOOP ' > template ls ../maf/*.maf > maf.list gensub2 maf.list single template jobList # no need to limit these jobs, there are only 358 of them para -ram=64g create jobList para try ... check ... para -maxJob=10 push # Completed: 245 of 247 jobs # Crashed: 2 jobs # CPU time in finished jobs: 10717s 178.62m 2.98h 0.12d 0.000 y # IO & Wait Time: 883s 14.71m 0.25h 0.01d 0.000 y # Average job time: 47s 0.79m 0.01h 0.00d # Longest finished job: 1612s 26.87m 0.45h 0.02d # Submission to last job: 1652s 27.53m 0.46h 0.02d # and the last two, chr1 and chr2, completed on hgwdev due to memory limits: # real 48m26.151s du -hsc result # 334G result # Load into database rm -f /gbdb/galGal6/multiz77way/maf/* cd /hive/data/genomes/galGal6/bed/multiz77way/anno/result ln -s `pwd`/*.maf /gbdb/galGal6/multiz77way/maf/ # this generates an immense multiz77way.tab file in the directory # where it is running. Best to run this over in scratch. cd /dev/shm time hgLoadMaf -pathPrefix=/gbdb/galGal6/multiz77way/maf galGal6 multiz77way # Loading multiz77way into database # Loaded 58099551 mafs in 247 files from /gbdb/galGal6/multiz77way/maf # real 42m15.604s # -rw-rw-r-- 1 3002803804 Nov 27 10:07 multiz77way.tab time (cat /gbdb/galGal6/multiz77way/maf/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 galGal6 multiz77waySummary stdin) # Created 4657433 summary blocks from 2028742538 components and 58099551 mafs from stdin # real 75m53.456s # -rw-rw-r-- 1 3002803804 Nov 27 10:07 multiz77way.tab # -rw-rw-r-- 1 223274420 Dec 10 14:48 multiz77waySummary.tab wc -l multiz77*.tab # 58099551 multiz77way.tab # 4657433 multiz77waySummary.tab rm multiz77way*.tab ############################################################################## # MULTIZ7WAY MAF FRAMES (DONE - 2017-11-04 - Hiram) ssh hgwdev mkdir /hive/data/genomes/galGal6/bed/multiz77way/frames cd /hive/data/genomes/galGal6/bed/multiz77way/frames # survey all the genomes to find out what kinds of gene tracks they have printf '#!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "# ${db}: " set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "ncbiRefSeq" || $table == "mgcGenes" || \ $table == "knownGene" || $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end echo end ' > showGenes.csh chmod +x ./showGenes.csh time ./showGenes.csh # galGal6: ncbiRefSeq: 62183, refGene: 7355, xenoRefGene: 146002, # cotJap2: ncbiRefSeq: 47008, refGene: 64, xenoRefGene: 149937, # melGal5: ncbiRefSeq: 48270, refGene: 108, xenoRefGene: 249461, # anaPla1: ensGene: 17169, # aquChr2: xenoRefGene: 360692, # geoFor1: ncbiRefSeq: 16927, xenoRefGene: 363515, # taeGut2: ensGene: 19225, ncbiRefSeq: 20747, refGene: 1272, xenoRefGene: 248420, # melUnd1: ncbiRefSeq: 17125, xenoRefGene: 382914, # allMis1: ncbiRefSeq: 22447, refGene: 54, xenoRefGene: 404750, # chrPic2: xenoRefGene: 187072, # pelSin1: ensGene: 21808, # real 0m1.684s # from that summary, use these gene sets: # ncbiRefSeq - galGal6 cotJap2 melGal5 geoFor1 taeGut2 melUnd1 allMis1 # ensGene - anaPla1 pelSin1 # xenoRefGene - aquChr2 chrPic2 mkdir genes # 1. ncbiRefSeq: galGal6 cotJap2 melGal5 geoFor1 taeGut2 melUnd1 allMis1 for DB in galGal6 cotJap2 melGal5 geoFor1 taeGut2 melUnd1 allMis1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/${DB}.gp.gz genePredCheck -db=${DB} genes/${DB}.gp.gz 2>&1 | sed -e "s/^/ # $DB /;" done # galGal6 checked: 17422 failed: 0 # cotJap2 checked: 15977 failed: 0 # melGal5 checked: 18395 failed: 0 # geoFor1 checked: 14124 failed: 0 # taeGut2 checked: 16280 failed: 0 # melUnd1 checked: 14174 failed: 0 # allMis1 checked: 18106 failed: 0 # 2. ensGene: anaPla1 pelSin1 for DB in anaPla1 pelSin1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz genePredCheck -db=${DB} genes/${DB}.gp.gz 2>&1 | sed -e "s/^/ # $DB /;" done # anaPla1 checked: 15482 failed: 0 # pelSin1 checked: 18093 failed: 0 # 3. xenoRefGene for aquChr2 chrPic2 for DB in aquChr2 chrPic2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz genePredCheck -db=${DB} genes/${DB}.gp.gz 2>&1 | sed -e "s/^/ # $DB /;" done # aquChr2 checked: 15300 failed: 0 # chrPic2 checked: 15228 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/allMis1.gp.gz: 18106 # genes/anaPla1.gp.gz: 15482 # genes/aquChr2.gp.gz: 15006 # genes/chrPic2.gp.gz: 15071 # genes/cotJap2.gp.gz: 15977 # genes/galGal6.gp.gz: 17414 # genes/geoFor1.gp.gz: 14124 # genes/melGal5.gp.gz: 18387 # genes/melUnd1.gp.gz: 14174 # genes/pelSin1.gp.gz: 18093 # genes/taeGut2.gp.gz: 16276 # kluster job to annotate each maf file screen -S galGal6 # manage long running procedure with screen ssh ku cd /hive/data/genomes/galGal6/bed/multiz77way/frames printf '#!/bin/csh -fe set C = $1 set G = $2 cat ../anno/result/${C}.maf | genePredToMafFrames galGal6 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz ' > runOne chmod +x runOne ls ../anno/result/*.maf | sed -e "s#.*result/##; s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list printf '#LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP ' > template mkdir parts gensub2 chr.list gene.list template jobList para -ram=64g create jobList para try ... check ... push # Completed: 2717 of 2717 jobs # CPU time in finished jobs: 53070s 884.50m 14.74h 0.61d 0.002 y # IO & Wait Time: 5177s 86.28m 1.44h 0.06d 0.000 y # Average job time: 21s 0.36m 0.01h 0.00d # Longest finished job: 933s 15.55m 0.26h 0.01d # Submission to last job: 1300s 21.67m 0.36h 0.02d # collect all results into one file: cd /hive/data/genomes/galGal6/bed/multiz77way/frames time find ./parts -type f | while read F do echo "${F}" 1>&2 zcat ${F} done | sort -k1,1 -k2,2n > multiz77wayFrames.bed # real 0m28.716s # -rw-rw-r-- 1 155257854 Dec 4 11:06 multiz77wayFrames.bed gzip multiz77wayFrames.bed # verify there are frames on everything, should be 11 species: # (count from: ls genes | wc) zcat multiz77wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' > species.check.list wc -l species.check.list # 11 # 364955 allMis1 # 182148 anaPla1 # 176580 aquChr2 # 193676 chrPic2 # 193164 cotJap2 # 177356 galGal6 # 211550 geoFor1 # 191008 melGal5 # 209757 melUnd1 # 265679 pelSin1 # 222413 taeGut2 # load the resulting file ssh hgwdev cd /hive/data/genomes/galGal6/bed/multiz77way/frames time hgLoadMafFrames galGal6 multiz77wayFrames multiz77wayFrames.bed.gz # real 0m14.572s hgsql -e 'select count(*) from multiz77wayFrames;' galGal6 # +----------+ # | count(*) | # +----------+ # | 2388286 | # +----------+ time featureBits -countGaps galGal6 multiz77wayFrames # 36890829 bases of 1065365425 (3.463%) in intersection # real 0m12.344s # enable the trackDb entries: # frames multiz77wayFrames # irows on # zoom to base level in an exon to see codon displays # appears to work OK ######################################################################### # Phylogenetic tree from 77-way (DONE - 2018-11-05 - Hiram) mkdir /hive/data/genomes/galGal6/bed/multiz77way/4d cd /hive/data/genomes/galGal6/bed/multiz77way/4d # the annotated maf's are in: ../anno/result/*.maf # using ncbiRefSeq for galGal6, only transcribed genes and nothing # from the randoms and other misc. hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq where cdsEnd > cdsStart;" galGal6 \ | egrep -E -v "chrM|chrUn|random|_alt" > ncbiRefSeq.gp wc -l *.gp # 49187 ncbiRefSeq.gp # verify it is only on the chroms: cut -f2 ncbiRefSeq.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' # 6305 chr1 # 4341 chr2 # 3682 chr3 # 3554 chr4 # 3101 chr5 # 2296 chrZ # 1926 chr7 # 1883 chr6 # 1736 chr8 # 1523 chr9 # 1293 chr14 # 1262 chr10 # 1184 chr12 # 1163 chr15 # 1045 chr18 # 1041 chr20 # 1008 chr17 # 1003 chr11 # 978 chr19 # 969 chr13 # 889 chr27 # 837 chr26 # 826 chr33 # 794 chr28 # 772 chr21 # 722 chr25 # 687 chr23 # 652 chr24 # 475 chr22 # 420 chr16 # 339 chr31 # 234 chr30 # 125 chrW # 122 chr32 genePredSingleCover ncbiRefSeq.gp stdout | sort > ncbiRefSeqNR.gp wc -l ncbiRefSeqNR.gp # 17121 ncbiRefSeqNR.gp ssh ku mkdir /hive/data/genomes/galGal6/bed/multiz77way/4d/run cd /hive/data/genomes/galGal6/bed/multiz77way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/galGal6/bed/multiz77way" set c = $1 set infile = $r/anno/result/$2 set outfile = $3 cd /dev/shm # 'clean' maf, removes all chrom names, leaves only the db name perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/ncbiRefSeqNR.gp | sed -e "s/\t$c\t/\tgalGal6.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/galGal6/bed/multiz77way/anno/result/*.maf \ | sed -e "s#.*multiz77way/anno/result/##" \ | egrep -E -v "chrM|chrUn|random|_alt" > maf.list printf '#LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP ' > template gensub2 maf.list single template jobList para -ram=64g create jobList para try ... check ... push ... etc... para time # CPU time in finished jobs: 9637s 160.62m 2.68h 0.11d 0.000 y # IO & Wait Time: 90s 1.50m 0.02h 0.00d 0.000 y # Average job time: 304s 5.07m 0.08h 0.00d # Longest finished job: 1564s 26.07m 0.43h 0.02d # Submission to last job: 2786s 46.43m 0.77h 0.03d # the two failures need more memory than 64g, finish on hgwdev: ./4d.csh chr1 chr1.maf ../mfa/chr1.mfa & ./4d.csh chr2 chr2.maf ../mfa/chr2.mfa wait # real 27m59.700s # combine mfa files ssh hgwdev cd /hive/data/genomes/galGal6/bed/multiz77way/4d # verify no tiny files: ls -og mfa | sort -k3nr | tail -2 # -rw-rw-r-- 1 235884 Nov 3 11:25 chrY.mfa #want comma-less species.list time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # real 0m3.430s # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 77 sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../galGal6.77way.nh sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../galGal6.77way.nh > tree-commas.nh # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree-commas.nh 4d.all.mfa # real 82m14.661s mv phyloFit.mod all.mod grep TREE all.mod # ((((((((((((((((((((((((galGal6:0.0407994,cotJap2:0.075032):0.00678692, # melGal5:0.0460563):0.152361,tytAlb1:0.0674945):0.00362352, # bucRhi1:0.123125):0.00197824,anaPla1:0.155414):0.00324622, # apaVit1:0.120774):0.00236511,calAnn1:0.154521):0.00405339, # cucCan1:0.124946):0.00558146,chaVoc2:0.0681884):0.00341058, # fulGla1:0.0444483):0.00206427,tauEry1:0.0874257):0.00215936, # opiHoa1:0.086691):0.00214164,phoRub1:0.053355):0.00214444, # (colLiv1:0.130697,((lepDis1:0.0683669,merNub1:0.145762):0.0147436, # ((pelCri1:0.0517251,(phaCar1:0.0682455, # phaLep1:0.0630482):0.00464673):0.00345114,((pteGut1:0.09113, # (nipNip1:0.0461867,egrGar1:0.0615126):0.00680979):0.0024641, # ((pygAde1:0.015753,aptFor1:0.0127504):0.0294345,(((((carCri1:0.0598096, # mesUni1:0.11204):0.00462025,eurHel1:0.112293):0.00213836, # balPav1:0.0651192):0.00215447,chlUnd1:0.0848454):0.00267882, # (((falChe1:0.0024831,falPer1:0.00259406):0.0798053,(aquChr2:0.0142362, # (halAlb1:0.00214356,halLeu1:0.00214934):0.0176244):0.034005):0.00624518, # ((((corBra1:0.00212182,corCor1:0.00240367):0.0502936,(acaChl1:0.180434, # (ficAlb2:0.0640719,(((serCan1:0.0405358,zonAlb1:0.0478928):0.00254573, # geoFor1:0.0304481):0.0247191, # taeGut2:0.0446712):0.0146035):0.0148619):0.00208483):0.00205379, # pseHum1:0.0540727):0.156263,(gavSte1:0.0505629,(capCar1:0.0761202, # (melUnd1:0.04375,(amaVit1:0.0284858, # araMac1:0.0268072):0.0151638):0.0778525):0.00921769):0.00196119):0.00317845):0.00333798):0.0032279):0.00209549):0.00217019):0.00413516):0.00218366):0.00218773):0.0169635, # colStr1:0.131992):0.00618949,picPub1:0.178329):0.0578306, # (strCam1:0.0691605,tinGut2:0.162739):0.0403754):0.186966, # (allMis1:0.0115232,allSin1:0.0100885):0.186207):0.0235685, # ((cheMyd1:0.0387783,chrPic2:0.0359402):0.0315313, # (pelSin1:0.0266918,apaSpi1:0.0301989):0.0938125):0.0851605):0.109974, # ((pytBiv1:0.0908161,thaSir1:0.210469):0.201079, # anoCar2:0.28687):0.240369):0.193154,(hg38:0.153209, # mm10:0.288885):0.335161):0.195426, # ((xenTro9:0.131605,xenLae2:0.135359):0.444445, # nanPar1:0.586412):0.336097):0.462138, # (((((tetNig2:0.205709,fr3:0.191549):0.248318, # (oreNil3:0.0361466,mayZeb1:0.044998):0.298866):0.0164327, # oryLat2:0.452326):0.0531957,gasAcu1:0.24726):0.29165, # (angJap1:0.453715,danRer11:0.786127):0.0950112):0.129939):0.343984, # (petMar3:0.0600265,letCam1:0.0573159):0.343984); # compare these calculated lengths to what we started with /cluster/bin/phast/all_dists ../galGal6.77way.nh | grep galGal6 \ | sed -e "s/galGal6.//;" | sort > original.dists grep TREE all.mod | sed -e 's/TREE: //;' \ | /cluster/bin/phast/all_dists /dev/stdin | grep galGal6 \ | sed -e "s/galGal6.//;" | sort > galGal6.dists # printing out the 'original', the 'new' the 'difference' and # percent difference/delta join original.dists galGal6.dists | awk '{ printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' | sort -k4n # melGal5 0.126972 0.093643 0.033329 35.591555 # cotJap2 0.066254 0.115831 -0.049577 -42.801150 # tytAlb1 0.152299 0.267442 -0.115143 -43.053447 # fulGla1 0.222299 0.268654 -0.046355 -17.254536 # phoRub1 0.252299 0.283926 -0.031627 -11.139170 # aptFor1 0.372299 0.287673 0.084626 29.417429 # chaVoc2 0.212299 0.288984 -0.076685 -26.536071 # pygAde1 0.372299 0.290675 0.081624 28.080846 # pelCri1 0.342299 0.296398 0.045901 15.486272 # nipNip1 0.362299 0.298853 0.063446 21.229835 # aquChr2 0.402299 0.306540 0.095759 31.238664 # gavSte1 0.397002 0.307756 0.089246 28.998947 # halAlb1 0.402299 0.312072 0.090227 28.912238 # halLeu1 0.402299 0.312078 0.090221 28.909760 # phaLep1 0.352299 0.312368 0.039931 12.783320 # tauEry1 0.232299 0.313696 -0.081397 -25.947733 # egrGar1 0.362299 0.314179 0.048120 15.316110 # opiHoa1 0.242299 0.315120 -0.072821 -23.108974 # phaCar1 0.352299 0.317565 0.034734 10.937603 # balPav1 0.402299 0.318668 0.083631 26.243928 # carCri1 0.442299 0.320117 0.122182 38.167920 # lepDis1 0.342299 0.320197 0.022102 6.902626 # bucRhi1 0.162299 0.326696 -0.164397 -50.321094 # apaVit1 0.182299 0.329569 -0.147270 -44.685635 # chlUnd1 0.382299 0.336240 0.046059 13.698251 # pteGut1 0.352299 0.336986 0.015313 4.544106 # cucCan1 0.202299 0.340160 -0.137861 -40.528281 # falChe1 0.402299 0.340587 0.061712 18.119306 # falPer1 0.402299 0.340698 0.061601 18.080822 # capCar1 0.407002 0.342531 0.064471 18.821946 # anaPla1 0.172299 0.360963 -0.188664 -52.266853 # colLiv1 0.312299 0.365600 -0.053301 -14.579048 # calAnn1 0.192299 0.365681 -0.173382 -47.413456 # eurHel1 0.422299 0.367980 0.054319 14.761400 # mesUni1 0.442299 0.372348 0.069951 18.786458 # colStr1 0.382299 0.381671 0.000628 0.164540 # araMac1 0.413002 0.386234 0.026768 6.930514 # amaVit1 0.413002 0.387913 0.025089 6.467687 # melUnd1 0.423987 0.388013 0.035974 9.271339 # merNub1 0.342299 0.397593 -0.055294 -13.907186 # strCam1 0.262299 0.423235 -0.160936 -38.025211 # picPub1 0.402299 0.434198 -0.031899 -7.346648 # pseHum1 0.422002 0.465568 -0.043566 -9.357602 # corBra1 0.454068 0.465964 -0.011896 -2.552987 # corCor1 0.454068 0.466246 -0.012178 -2.611926 # taeGut2 0.484068 0.489770 -0.005702 -1.164220 # ficAlb2 0.454068 0.494568 -0.040500 -8.188965 # geoFor1 0.480329 0.500266 -0.019937 -3.985280 # serCan1 0.473068 0.512900 -0.039832 -7.766036 # tinGut2 0.262299 0.516814 -0.254515 -49.246924 # zonAlb1 0.473525 0.520257 -0.046732 -8.982484 # acaChl1 0.464068 0.596068 -0.132000 -22.145124 # chrPic2 0.617442 0.676866 -0.059424 -8.779286 # cheMyd1 0.617442 0.679704 -0.062262 -9.160164 # allSin1 0.632299 0.696961 -0.064662 -9.277707 # allMis1 0.632299 0.698395 -0.066096 -9.463985 # pelSin1 0.617442 0.729898 -0.112456 -15.407084 # apaSpi1 0.617442 0.733406 -0.115964 -15.811706 # anoCar2 0.934442 1.161447 -0.227005 -19.545016 # pytBiv1 0.887442 1.166472 -0.279030 -23.920849 # thaSir1 0.987442 1.286125 -0.298683 -23.223481 # hg38 1.115503 1.315732 -0.200229 -15.218069 # mm10 1.326078 1.451408 -0.125330 -8.635063 # xenTro9 0.907386 1.934935 -1.027549 -53.105091 # xenLae2 0.929442 1.938689 -1.009247 -52.058221 # nanPar1 0.829442 1.945297 -1.115855 -57.361678 # gasAcu1 1.758169 2.153775 -0.395606 -18.368028 # angJap1 1.891116 2.163591 -0.272475 -12.593646 # letCam1 1.946543 2.230210 -0.283667 -12.719295 # petMar3 1.946543 2.232920 -0.286377 -12.825224 # oreNil3 1.824346 2.311156 -0.486810 -21.063485 # mayZeb1 1.974346 2.320007 -0.345661 -14.899136 # oryLat2 2.008726 2.412036 -0.403310 -16.720729 # fr3 1.925783 2.416010 -0.490227 -20.290769 # tetNig2 1.846095 2.430170 -0.584075 -24.034327 # danRer11 1.971868 2.496003 -0.524135 -20.998973 ######################################################################### # phastCons 77-way (DONE - 2018-12-14 - Hiram) # split 77way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh ku mkdir -p /hive/data/genomes/galGal6/bed/multiz77way/cons/ss mkdir -p /hive/data/genomes/galGal6/bed/multiz77way/cons/msa.split cd /hive/data/genomes/galGal6/bed/multiz77way/cons/msa.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/galGal6/bed/multiz77way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/galGal6/bed/multiz77way/cons/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh printf '#LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP ' > template # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list # all can finish OK at a 64Gb memory limit gensub2 maf.list single template jobList para -ram=64g create jobList para try ... check ... etc para push # Completed: 247 of 247 jobs # CPU time in finished jobs: 49455s 824.25m 13.74h 0.57d 0.002 y # IO & Wait Time: 2892s 48.20m 0.80h 0.03d 0.000 y # Average job time: 212s 3.53m 0.06h 0.00d # Longest finished job: 18111s 301.85m 5.03h 0.21d # Submission to last job: 18151s 302.52m 5.04h 0.21d # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/galGal6/bed/multiz77way/cons/run.cons cd /hive/data/genomes/galGal6/bed/multiz77way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. Running: # all and vertebrates cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/galGal6/bed/multiz77way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons/ss set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$c/$f.ss $tmp else ln -s $ssSrc/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix printf '#LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP ' > template ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list wc -l ss.list # 229 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/galGal6/bed/multiz77way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList # beware overwhelming the cluster with these fast running high I/O jobs para -ram=32g create jobList para try ... check ... para -maxJob=16 push # Completed: 299 of 299 jobs # CPU time in finished jobs: 35432s 590.54m 9.84h 0.41d 0.001 y # IO & Wait Time: 2145s 35.74m 0.60h 0.02d 0.000 y # Average job time: 126s 2.09m 0.03h 0.00d # Longest finished job: 387s 6.45m 0.11h 0.00d # Submission to last job: 14019s 233.65m 3.89h 0.16d # create Most Conserved track cd /hive/data/genomes/galGal6/bed/multiz77way/cons/all time cut -f1 ../../../../chrom.sizes | while read C do echo $C 1>&2 ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m55.745s # -rw-rw-r-- 1 246650784 Dec 14 13:59 tmpMostConserved.bed time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ > mostConserved.bed # real 0m40.460s # -rw-rw-r-- 1 253184497 Dec 14 14:01 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/galGal6/bed/multiz77way/cons/all time hgLoadBed galGal6 phastConsElements77way mostConserved.bed # Read 7438714 elements of size 5 from mostConserved.bed # real 0m55.443s # Try for at least 5% overall cov, and 70% CDS cov # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits galGal6 -enrichment ncbiRefSeq:cds phastConsElements77way # ncbiRefSeq:cds 2.917%, phastConsElements77way 13.012%, both 2.015%, cover 69.06%, enrich 5.31x # real 0m41.525s time featureBits galGal6 -enrichment refGene:cds phastConsElements77way # refGene:cds 0.855%, phastConsElements77way 13.012%, both 0.677%, cover 79.21%, enrich 6.09x # real 0m37.093s # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/galGal6/bed/multiz77way/cons/all mkdir downloads time for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" 1>&2 find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons77way.wigFix.gz done # real 5m16.918s # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons77way.wig phastCons77way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 3m15.097s du -hsc *.wi? # 975M phastCons77way.wib # 102M phastCons77way.wig # encode into a bigWig file: # (warning wigToBigWig process may be too large for memory limits # in bash, to avoid the 32 Gb memory limit, set 180 Gb here: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin \ ../../../../chrom.sizes phastCons77way.bw) > bigWig.log 2>&1 egrep "VmPeak|real" bigWig.log # pid=72262: VmPeak: 11716084 kB # real 8m41.950s # real 8m16.091s # -rw-rw-r-- 1 6688955 Dec 14 14:28 bigWig.log bigWigInfo phastCons77way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 1,254,221,116 # primaryIndexSize: 35,327,900 # zoomLevels: 10 # chromCount: 209 # basesCovered: 1,021,715,761 # mean: 0.149853 # min: 0.000000 # max: 1.000000 # std: 0.324058 # if you wanted to use the bigWig file, loading bigWig table: # but we don't use the bigWig file mkdir /gbdb/galGal6/bbi ln -s `pwd`/phastCons77way.bw /gbdb/galGal6/bbi hgsql galGal6 -e 'drop table if exists phastCons77way; \ create table phastCons77way (fileName varchar(255) not null); \ insert into phastCons77way values ("/gbdb/galGal6/bbi/phastCons77way.bw");' # Load gbdb and database with wiggle. ssh hgwdev cd /hive/data/genomes/galGal6/bed/multiz77way/cons/all ln -s `pwd`/phastCons77way.wib /gbdb/galGal6/multiz77way/phastCons77way.wib time hgLoadWiggle -pathPrefix=/gbdb/galGal6/multiz77way galGal6 \ phastCons77way phastCons77way.wig # real 0m6.769s time wigTableStats.sh galGal6 phastCons77way # db.table min max mean count sumData # galGal6.phastCons77way 0 1 0.149853 1021715761 1.53107e+08 # stdDev viewLimits # 0.324058 viewLimits=0:1 # real 0m3.513s # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/galGal6/bed/multiz77way/cons/all time hgWiggle -doHistogram -db=galGal6 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons77way > galGal6.phastCons77way.histogram.data 2>&1 # real 0m36.635s # create plot of histogram: # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change) printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" set output "galGal6.phastCons77way.histo.png" set size 1.0, 1.0 set style line 1 lt 2 lc rgb "#ff88ff" lw 2 set style line 2 lt 2 lc rgb "#66ff66" lw 2 set style line 3 lt 2 lc rgb "#ffff00" lw 2 set style line 4 lt 2 lc rgb "#ffffff" lw 2 set border lc rgb "#ffff00" set key left box ls 3 set key tc variable set grid noxtics set y2tics set grid ytics ls 4 set title " Chicken/galGal6 Histogram phastCons77way track" \ tc rgb "#ffffff" set xlabel " phastCons124way score" tc rgb "#ffffff" set ylabel " Relative Frequency" tc rgb "#ff88ff" set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" set y2range [0:1] set yrange [0:0.02] plot "galGal6.phastCons77way.histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ "galGal6.phastCons77way.histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 ' | gnuplot display galGal6.phastCons77way.histo.png ######################################################################### # phyloP for 77-way (DONE - 2018-12-06 - Hiram) # # split SS files into 1M chunks, this business needs smaller files # to complete ssh ku mkdir /hive/data/genomes/galGal6/bed/multiz77way/consPhyloP cd /hive/data/genomes/galGal6/bed/multiz77way/consPhyloP mkdir ss run.split cd run.split printf '#!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/galGal6/bed/multiz77way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/galGal6/bed/multiz77way/consPhyloP/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir -p $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running ' > doSplit.csh chmod +x doSplit.csh # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list # this needs a {check out line+ $(root1.done)} test for verification: printf '#LOOP ./doSplit.csh $(root1) $(root1).done #ENDLOOP ' > template gensub2 maf.list single template jobList # all can complete successfully at the 64Gb memory limit para -ram=64g create jobList para try ... check ... push ... etc... # Completed: 247 of 247 jobs # CPU time in finished jobs: 38282s 638.03m 10.63h 0.44d 0.001 y # IO & Wait Time: 2674s 44.57m 0.74h 0.03d 0.000 y # Average job time: 166s 2.76m 0.05h 0.00d # Longest finished job: 12645s 210.75m 3.51h 0.15d # Submission to last job: 17690s 294.83m 4.91h 0.20d # run phyloP with score=LRT ssh ku mkdir /cluster/data/galGal6/bed/multiz77way/consPhyloP cd /cluster/data/galGal6/bed/multiz77way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACK ../../4d/all.mod # BACKGROUND: 0.274952 0.278952 0.175546 0.270550 grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.454 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../4d/all.mod 0.454 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.273000 0.227000 0.227000 0.273000 printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set ssFile = $1:t set out = $2 set cName = $f:h set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/galGal6/bed/multiz77way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "$cons/ss/$cName/$ssFile" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null echo source: $ssSrc.ss $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $ssFile.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$ssFile.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp ' > doPhyloP.csh chmod +x doPhyloP.csh # Create list of chunks find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list # make sure the list looks good wc -l ss.list # 1243 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix printf '#LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP ' > template ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/galGal6/bed/multiz77way/consPhyloP/all cd /hive/data/genomes/galGal6/bed/multiz77way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overloading the cluster with these quick and high I/O jobs para -ram=32g create jobList para try ... check ... para -maxJob=16 push para time > run.time # Completed: 1243 of 1243 jobs # CPU time in finished jobs: 1719561s 28659.35m 477.66h 19.90d 0.055 y # IO & Wait Time: 8564s 142.73m 2.38h 0.10d 0.000 y # Average job time: 1390s 23.17m 0.39h 0.02d # Longest finished job: 2069s 34.48m 0.57h 0.02d # Submission to last job: 8198s 136.63m 2.28h 0.09d mkdir downloads time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" 1>&2 find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP77way.wigFix.gz done # real 15m2.353s du -hsc downloads # 2.1G downloads # check integrity of data with wigToBigWig time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/galGal6/chrom.sizes \ phyloP77way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=129258: VmPeak: 11713968 kB # real 9m23.984s bigWigInfo phyloP77way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 3,133,928,233 # primaryIndexSize: 35,343,692 # zoomLevels: 10 # chromCount: 209 # basesCovered: 1,021,715,761 # mean: 0.049308 # min: -20.000000 # max: 8.265000 # std: 1.474732 # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phyloP77way.wig phyloP77way.wib) # Converted stdin, upper limit 8.27, lower limit -20.00 # real 4m35.907s # Converted stdin, upper limit 1.31, lower limit -20.00 # real 17m36.880s # -rw-rw-r-- 1 1021715761 Dec 10 09:17 phyloP77way.wib # -rw-rw-r-- 1 112855301 Dec 10 09:17 phyloP77way.wig du -hsc *.wi? # 975M phyloP77way.wib # 108M phyloP77way.wig # Load gbdb and database with wiggle. ln -s `pwd`/phyloP77way.wib /gbdb/galGal6/multiz77way/phyloP77way.wib time hgLoadWiggle -pathPrefix=/gbdb/galGal6/multiz77way galGal6 \ phyloP77way phyloP77way.wig # real 0m6.892s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh galGal6 phyloP77way # db.table min max mean count sumData # galGal6.phyloP77way -20 8.265 0.0493081 1021715761 5.03789e+07 # stdDev viewLimits # 1.47473 viewLimits=-7.32435:7.42297 # that range is: 20+8.265= 28.265 for hBinSize=0.028265 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.028265 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=galGal6 phyloP77way > histogram.data 2>&1 # real 0m43.634s # xaxis range: grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ | sed -e 's/^/# /;' # Q1 -9.994190 # median -3.917220 # Q3 2.159760 # average -3.923422 # min -20.000000 # max 8.236730 # count 861 # total -3378.066208 # standard deviation 7.041258 # find out the range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ | sed -e 's/^/# /;' # Q1 0.000005 # median 0.000081 # Q3 0.000507 # average 0.001161 # min 0.000000 # max 0.019245 # count 861 # total 0.999978 # standard deviation 0.002967 # the y2axis range grep -v chrom histogram.data | grep "^[0-9]" | ave -col=7 stdin \ | sed -e 's/^/# /;' # Q1 0.000224 # median 0.018227 # Q3 0.949630 # average 0.341045 # min 0.000001 # max 1.000000 # count 862 # total 293.980895 # standard deviation 0.441921 # create plot of histogram: # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change) printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" set output "ce11.phyloP77way.histo.png" set size 1.0, 1.0 set style line 1 lt 2 lc rgb "#ff88ff" lw 2 set style line 2 lt 2 lc rgb "#66ff66" lw 2 set style line 3 lt 2 lc rgb "#ffff00" lw 2 set style line 4 lt 2 lc rgb "#ffffff" lw 2 set border lc rgb "#ffff00" set key left box ls 3 set key tc variable set grid noxtics set y2tics set grid ytics ls 4 set title " Chicken/galGal6 Histogram phyloP77way track" \ tc rgb "#ffffff" set xlabel " phyloP77way score" tc rgb "#ffffff" set ylabel " Relative Frequency" tc rgb "#ff88ff" set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" set xrange [-5:5] set y2range [0:1] set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 ' | gnuplot # verify it looks sane display ce11.phyloP77way.histo.png & ############################################################################# # construct download files for 77-way (DONE - 2018-12-19 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/multiz77way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phastCons77way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phyloP77way mkdir /hive/data/genomes/galGal6/bed/multiz77way/downloads cd /hive/data/genomes/galGal6/bed/multiz77way/downloads mkdir multiz77way phastCons77way phyloP77way ######################################################################### ## create upstream ncbiRefSeq maf files cd /hive/data/genomes/galGal6/bed/multiz77way/downloads/multiz77way # bash script #!/bin/sh export geneTbl="ncbiRefSeq" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits galGal6 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags galGal6 multiz77way \ stdin stdout \ -orgs=/hive/data/genomes/galGal6/bed/multiz77way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 171m42.872s -rw-rw-r-- 1 52659159 Nov 6 11:46 upstream1000.knownGene.maf.gz -rw-rw-r-- 1 451126665 Nov 6 12:15 upstream2000.knownGene.maf.gz -rw-rw-r-- 1 1080533794 Nov 6 12:55 upstream5000.knownGene.maf.gz ###################################################################### ## compress the maf files cd /hive/data/genomes/galGal6/bed/multiz77way/downloads/multiz77way mkdir maf rsync -a -P ../../anno/result/ ./maf/ du -hsc maf/ # 334G maf/ cd maf time gzip *.maf & # real 135m1.784s du -hscL maf ../../anno/result/ # 35G maf # 334G ../../anno/result/ cd maf md5sum *.maf.gz *.nh > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/multiz77way/maf cd maf ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/multiz77way/maf cd -- ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/multiz77way/ ########################################################################### cd /hive/data/genomes/galGal6/bed/multiz77way/downloads/multiz77way grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > galGal6.77way.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh galGal6.77way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > galGal6.77way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh galGal6.77way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > galGal6.77way.scientificName.nh time md5sum *.nh *.maf.gz > md5sum.txt # real 0m3.147s ln -s `pwd`/*.maf.gz `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/multiz77way du -hsc ./maf ../../anno/result # 18G ./maf # 156G ../../anno/result # obtain the README.txt from galGal6/multiz20way and update for this # situation ln -s `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/multiz77way/ ##################################################################### cd /hive/data/genomes/galGal6/bed/multiz77way/downloads/phastCons77way mkdir galGal6.77way.phastCons cd galGal6.77way.phastCons ln -s ../../../cons/all/downloads/*.wigFix.gz . md5sum *.gz > md5sum.txt cd /hive/data/genomes/galGal6/bed/multiz77way/downloads/phastCons77way ln -s ../../cons/all/phastCons77way.bw ./galGal6.phastCons77way.bw ln -s ../../cons/all/all.mod ./galGal6.phastCons77way.mod time md5sum *.mod *.bw > md5sum.txt # real 0m20.354s # obtain the README.txt from galGal6/phastCons20way and update for this mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phastCons77way/galGal6.77way.phastCons cd galGal6.77way.phastCons ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phastCons77way/galGal6.77way.phastCons cd .. # situation ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phastCons77way ##################################################################### cd /hive/data/genomes/galGal6/bed/multiz77way/downloads/phyloP77way mkdir galGal6.77way.phyloP cd galGal6.77way.phyloP ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz . md5sum *.wigFix.gz > md5sum.txt cd .. ln -s ../../consPhyloP/run.phyloP/all.mod galGal6.phyloP77way.mod ln -s ../../consPhyloP/all/phyloP77way.bw galGal6.phyloP77way.bw md5sum *.mod *.bw > md5sum.txt # obtain the README.txt from galGal6/phyloP20way and update for this mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phyloP77way/galGal6.77way.phyloP cd galGal6.77way.phyloP ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phyloP77way/galGal6.77way.phyloP cd .. # situation ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6/phyloP77way ############################################################################# # hgPal downloads (DONE - 2018-12-14 - Hiram) # FASTA from 77-way for knownGene, refGene and knownCanonical ssh hgwdev screen -S galGal6HgPal mkdir /hive/data/genomes/galGal6/bed/multiz77way/pal cd /hive/data/genomes/galGal6/bed/multiz77way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list # this for loop can take hours on a high contig count assembly # it is just fine on human/galGal6, just a few seconds export mz=multiz77way export gp=ncbiRefSeq export db=galGal6 export I=0 export D=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` D=`echo $D | awk '{print $1+1}'` dNum=`echo $D | awk '{printf "%03d", int($1/1000)}'` mkdir -p exonNuc/${dNum} > /dev/null mkdir -p exonAA/${dNum} > /dev/null echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &" if [ $I -gt 16 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time (sh -x ./$gp.jobs) > $gp.jobs.log 2>&1 # real 122m41.761s export mz=multiz77way export gp=ncbiRefSeq time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonAA.fa.gz # real 2m25.192s time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonNuc.fa.gz # real 9m58.686s # -rw-rw-r-- 1 1125919919 Dec 14 17:33 ncbiRefSeq.multiz77way.exonAA.fa.gz # -rw-rw-r-- 1 1840895133 Dec 14 17:45 ncbiRefSeq.multiz77way.exonNuc.fa.gz export mz=multiz77way export gp=ncbiRefSeq export db=galGal6 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # wiki page for 77-way (TBD - 2017-11-06 - Hiram) mkdir /hive/users/hiram/bigWays/galGal6.77way cd /hive/users/hiram/bigWays echo "galGal6" > galGal6.77way/ordered.list awk '{print $1}' /hive/data/genomes/galGal6/bed/multiz77way/77way.distances.txt \ >> galGal6.77way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. ./sizeStats.sh galGal6.77way/ordered.list # dbDb.sh constructs galGal6.77way/XenTro9_77-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list ./dbDb.sh galGal6 77way # sizeStats.pl constructs galGal6.77way/XenTro9_77-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl galGal6 77way # defCheck.pl constructs XenTro9_77-way_conservation_lastz_parameters.html ./defCheck.pl galGal6 77way # this constructs the html pages in galGal6.77way/: # -rw-rw-r-- 1 6247 May 2 17:07 XenTro9_77-way_conservation_alignment.html # -rw-rw-r-- 1 8477 May 2 17:09 XenTro9_77-way_Genome_size_statistics.html # -rw-rw-r-- 1 5033 May 2 17:10 XenTro9_77-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # XenTro9_77-way_conservation_alignment # XenTro9_77-way_Genome_size_statistics # XenTro9_77-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. ############################################################################ # pushQ readmine (TBD - 2017-11-07 - Hiram) cd /usr/local/apache/htdocs-hgdownload/goldenPath/galGal6 find -L `pwd`/multiz77way `pwd`/phastCons77way `pwd`/phyloP77way \ /gbdb/galGal6/multiz77way -type f \ > /hive/data/genomes/galGal6/bed/multiz77way/downloads/redmine.20216.fileList wc -l /hive/data/genomes/galGal6/bed/multiz77way/downloads/redmine.20216.fileList # 1450 /hive/data/genomes/galGal6/bed/multiz77way/downloads/redmine.20216.fileList cd /hive/data/genomes/galGal6/bed/multiz77way/downloads hgsql -e 'show tables;' galGal6 | grep 77way \ | sed -e 's/^/galGal6./;' > redmine.20216.table.list ############################################################################# # wiki page for 77-way (TBD - 2017-05-02 - Hiram) mkdir /hive/users/hiram/bigWays/galGal6.77way cd /hive/users/hiram/bigWays echo "galGal6" > galGal6.77way/ordered.list sort -k2,2nr /hive/data/genomes/galGal6/bed/multiz77way/chainBits.txt \ | grep -v "^#" | awk '{print $1}' >> galGal6.77way/ordered.list # awk '{print $1}' /hive/data/genomes/galGal6/bed/multiz77way/77way.distances.txt \ # >> galGal6.77way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. ./sizeStats.sh galGal6.77way/ordered.list # dbDb.sh constructs galGal6.77way/XenTro9_77-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list ./dbDb.sh galGal6 77way # sizeStats.pl constructs galGal6.77way/XenTro9_77-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl galGal6 77way # defCheck.pl constructs XenTro9_77-way_conservation_lastz_parameters.html ./defCheck.pl galGal6 77way # this constructs the html pages in galGal6.77way/: # -rw-rw-r-- 1 6247 May 2 17:07 XenTro9_77-way_conservation_alignment.html # -rw-rw-r-- 1 8477 May 2 17:09 XenTro9_77-way_Genome_size_statistics.html # -rw-rw-r-- 1 5033 May 2 17:10 XenTro9_77-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # XenTro9_77-way_conservation_alignment # XenTro9_77-way_Genome_size_statistics # XenTro9_77-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. ############################################################################