######################################################################## # an alignment of 44 closely related SARS viruses (DONE - 2020-03-13 - Hiram) ######################################################################## # List of 44 sequences obtained via email from MIT/Irwin via Markd: cd /hive/data/genomes/wuhCor1/bed/lastzStrains/sequences # -rw-r--r-- 1 7074 Mar 13 13:39 SarbecovirusGenomes.44.RAxML.Mapped.nh # head -15 SarbecovirusGenomes.44.RAxML.Mapped.nh | grep "[a-z]" # NC_045512_Wuhan_seafood_market_pneumonia_virus:0.02561, # MN996532_Bat_coronavirus_RaTG13:0.02795 # MG772933_Bat_SARS_like_coronavirus_CoVZC45:0.01539, # MG772934_Bat_SARS_like_coronavirus_CoVZXC21:0.0187 # ... etc ... tail SarbecovirusGenomes.44.RAxML.Mapped.nh | grep "[a-z]" # GQ153547_Bat_SARS_coronavirus_HKU3_12:0.00659 # KY352407_SARS_related_coronavirus_strain_BtKY72:0.33662, # NC_014470_Bat_coronavirus_BM48_31_BGR_2008:0.26995 # extract the accession names from that .nh file: sed -e 's/^ \+//;' SarbecovirusGenomes.44.RAxML.Mapped.nh \ | egrep -i "[a-z]" | sort -u \ | sed -e 's/_Coronavirus.*//i; s/_Bat.*//;' \ | sed -e 's/_[a-z]\+.*//i;' | sort -u > MIT_Irwin.list44 # only 41 of those are new, we already have 3 here. Fetch the # new list: ################################################ cat MIT_Irwin.list44 | while read acc do alreadyHere=`ls fasta/${acc}* 2> /dev/null | wc -l` if [ "${alreadyHere}" -gt 0 ]; then ls -og fasta/${acc}* 1>&2 else rm -f "fasta/${acc}.fa" if [ ! -s "fasta/${acc}.fa" ]; then printf "# to be fetched $acc\n" 1>&2 wget -O fasta/${acc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$acc" betterName=`grep "^>" fasta/${acc}.fa | sed -e 's/>//; s/ .*//;'` if [ "x${betterName}" != "xy" ]; then mv fasta/${acc}.fa fasta/${betterName}.fa ls -og fasta/${acc}* 1>&2 wget -O gbk/${acc}.gbk \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=gb&sendto=on&id=$acc" mv gbk/${acc}.gbk gbk/${betterName}.gbk fi fi fi done ################################################ # Construct safe names for F in fasta/*.fa chinaFasta/*.fa do B=`basename $F | sed -e "s/\.\([0-9]\+\)/v\1/; s/-/_/g;"` if [ -s "safeNameFa/${B}" ]; then ls -og safeNameFa/${B} else sed -e 's/^>\([^ ]\+\).*/>\1/;' "${F}" | sed -e 's/\./v/; s/-/_/g;' \ > safeNameFa/${B} ls -og safeNameFa/${B} fi done ################################################ # make a list of the new sequences for lastz alignment procedure ls -ogrt safeNameFa | tail -41 | awk '{print $NF}' \ | sed -e 's/.fa//;' > ../new41.list ################################################ # generate 2bit and chrom.sizes files cd /hive/data/genomes/wuhCor1/bed/lastzStrains for asmId in `ls sequences/safeNameFa | sed -e 's/.fa//;'` do twoBit="twoBits/${asmId}.2bit" sizes="sizes/${asmId}.chrom.sizes" if [ -s "${sizes}" ]; then ls -og "${sizes}" "${twoBit}" 1>&2 else faToTwoBit sequences/safeNameFa/${asmId}.fa "${twoBit}" twoBitInfo "${twoBit}" "${sizes}" ls -og "${sizes}" "${twoBit}" 1>&2 fi done ################################################ # prepare alignment scripts for db in `cat new41.list`; do ./prepOne.pl $db; done # find those new scripts to get into a run list: find ./runDir -type f | grep runAlignment.sh \ | xargs ls -ogrt | grep "Mar 13" | awk '{print $NF}' > alignment.run41.list # and then run them time (./perlPara.pl 13 alignment.run41.list) >> alignment41.log 2>&1 & # real 8m24.503s ################################################ # calculate kmers for these new sequences cd /hive/data/genomes/wuhCor1/bed/lastzStrains/kmers for F in `cat ../new41.list` do printf "kmerOne $F {check out exists+ kmers/$F/$F.31mers.profile.txt.gz}\n" done > new41.jobList para create new41.jobList para try para push para check para time # Completed: 41 of 41 jobs # CPU time in finished jobs: 27s 0.45m 0.01h 0.00d 0.000 y # IO & Wait Time: 89s 1.48m 0.02h 0.00d 0.000 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 5s 0.08m 0.00h 0.00d # Submission to last job: 41s 0.68m 0.01h 0.00d ##################################################### # need to compute the compare matrix for 44 sequences ##################################################### ln -s ../sequences/MIT_Irwin.list44 ./ Completed: 946 of 946 jobs CPU time in finished jobs: 186s 3.09m 0.05h 0.00d 0.000 y IO & Wait Time: 2364s 39.41m 0.66h 0.03d 0.000 y Average job time: 3s 0.04m 0.00h 0.00d Longest finished job: 6s 0.10m 0.00h 0.00d Submission to last job: 113s 1.88m 0.03h 0.00d find ./compare.31 -type f | xargs cat > 44x44.kmers31.compared.txt ~/kent/src/hg/makeDb/doc/wuhCor1/matrixUp.pl 44x44.kmers31.compared.txt \ > 44x44.data31mer.csv mv matrixKey.txt matrixKey.44x44.31mer.txt mkdir /hive/data/genomes/wuhCor1/bed/lastzStrains/kmers/phylip44x44 cd /hive/data/genomes/wuhCor1/bed/lastzStrains/kmers/phylip44x44 mkTestData.pl 31 > infile 2> name.31mer.translate.txt ### After neighbor is done, rename the results, and restore ### the sequence names in the final upgma.31mer.nh tree: if [ -s outfile ]; then rm -f outfile.31mer mv outfile outfile.31mer fi if [ -s outtree ]; then rm -f outtree.31mer mv outtree outtree.31mer fi cat outtree.31mer \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=name.31mer.translate.txt -noInternal -lineOutput /dev/stdin \ > upgma.31mer.nh # fixup the last line due to bug in binaryTree.pl # NC_014470v1:0.4932):0.00132):0.1:0.1; # becomes # NC_014470v1:0.4932):0.00132); # (((NC_045512v2:0.3188,MN996532v1:0.3188):0.16942,((((DQ022305v2:0.10465, # GQ153547v1:0.10465):0.10735,GQ153542v1:0.212):0.24472,(MG772933v1:0.2271, # MG772934v1:0.2271):0.22962):0.01207,((((((DQ071615v1:0.3384, # KJ473815v1:0.3384):0.01263,((((FJ588686v1:0.23975, # KY770858v1:0.23975):0.02765,((((((KF367457v1:0.12575, # KY417144v1:0.12575):0.04487,(KY417151v1:0.10235, # KY417152v1:0.10235):0.06828):0.01948,((KY417142v1:0.12335, # MK211377v1:0.12335):0.04481,(MK211376v1:0.12075, # MK211378v1:0.12075):0.04741):0.02194):0.02759,((((KT444582v1:0.1421, # KY417143v1:0.1421):0.03015,KY417149v1:0.17225):0.01565, # KY417146v1:0.1879):0.00607,(KY417147v1:0.1504, # KY417148v1:0.1504):0.04357):0.02373):0.02307,(KJ473816v1:0.23015, # KY417145v1:0.23015):0.01062):0.00826,MK211375v1:0.24903):0.01837):0.03006, # NC_004718v3:0.29746):0.044,KP886808v1:0.34146):0.00956):0.02562, # MK211374v1:0.37664):0.02202,(KF569996v1:0.3649, # KU973692v1:0.3649):0.03377):0.01393,JX993988v1:0.41259):0.04802, # ((((DQ412042v1:0.0557,DQ648856v1:0.0557):0.11303,(KJ473812v1:0.0946, # KY770860v1:0.0946):0.07413):0.10069,KY938558v1:0.26941):0.17723, # ((DQ412043v1:0.38955,KJ473814v1:0.38955):0.03895, # JX993987v1:0.4285):0.01814):0.01397):0.00818):0.01943):0.0063, # (KY352407v1:0.4932,NC_014470v1:0.4932):0.00132); ######################################################################### ## setting up multiz mkdir /hive/data/genomes/wuhCor1/bed/multiz44way cd /hive/data/genomes/wuhCor1/bed/multiz44way ~/kent/src/hg/utils/phyloTrees/asciiTree.pl \ ../lastzStrains/kmers/phylip44x44/wuhCor1.44way.nh > wuhCor1.44way.nh # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ wuhCor1.44way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh # (((NC_045512v2 MN996532v1) ((((DQ022305v2 GQ153547v1) GQ153542v1) # (MG772933v1 MG772934v1)) ((((((DQ071615v1 KJ473815v1) ((((FJ588686v1 # KY770858v1) ((((((KF367457v1 KY417144v1) (KY417151v1 KY417152v1)) # ((KY417142v1 MK211377v1) (MK211376v1 MK211378v1))) ((((KT444582v1 # KY417143v1) KY417149v1) KY417146v1) (KY417147v1 KY417148v1))) (KJ473816v1 # KY417145v1)) MK211375v1)) NC_004718v3) KP886808v1)) MK211374v1) (KF569996v1 # KU973692v1)) JX993988v1) ((((DQ412042v1 DQ648856v1) (KJ473812v1 KY770860v1)) # KY938558v1) ((DQ412043v1 KJ473814v1) JX993987v1))))) (KY352407v1 NC_014470v1)) sed 's/[()]//g; s/,/ /g' tree.nh > species.list cat species.list | tr ' ' '\n' | sort > 44.grep.list # all the nets are the same, can use ordinary mafNet: # bash shell syntax here ... cd /hive/data/genomes/wuhCor1/bed/multiz44way export H=/hive/data/genomes/wuhCor1/bed mkdir mafLinks # the grep -v NC_045512v2 eliminates the self alignment ls -d ../lastzStrains/runDir/lastz.* | grep -v NC_045512v2 \ | grep -f 44.grep.list | while read D do if [ -d "${D}" ]; then asmId=`basename $D | sed -e 's/lastz.//;'` mafNet="${D}/mafNet/NC_045512v2.maf.gz" if [ ! -f "${mafNet}" ]; then echo "ERROR: can not find mafNet file ${mafNet}" 1>&2 exit 255 fi echo ln -s ../$mafNet mafLinks/NC_045512v2.$asmId.maf.gz ln -s ../$mafNet mafLinks/NC_045512v2.$asmId.maf.gz fi done # verify the symLinks are good: ls -ogrtL mafLinks/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' | head -4 # 18023 Mar 9 17:24 mafLinks/NC_045512v2.NC_014470v1.maf.gz # 17928 Mar 9 17:34 mafLinks/NC_045512v2.NC_004718v3.maf.gz # 18069 Mar 13 15:36 mafLinks/NC_045512v2.KF569996v1.maf.gz # 18057 Mar 13 15:36 mafLinks/NC_045512v2.DQ071615v1.maf.gz ls -ogrtL mafLinks/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' | tail -4 # 18016 Mar 13 15:40 mafLinks/NC_045512v2.MK211376v1.maf.gz # 18230 Mar 13 15:40 mafLinks/NC_045512v2.KY417149v1.maf.gz # 18076 Mar 13 15:42 mafLinks/NC_045512v2.MK211378v1.maf.gz # 17190 Mar 13 15:42 mafLinks/NC_045512v2.MN996532v1.maf.gz # leaves 43 files: ls -ogrtL mafLinks/*.maf.gz | wc -l # 43 # scan the names to verify sanity: zcat mafLinks/*.maf.gz | grep "^s " | awk '{print $2}' | sort \ | uniq -c | sort -rn | sed -e 's/^/# /;' | less # should look like: # 98 NC_045512v2.NC_045512v2 # 5 NC_004718v3.NC_004718v3 # 4 KY938558v1.KY938558v1 # 4 KY352407v1.KY352407v1 # 3 NC_014470v1.NC_014470v1 # ... # 1 MG772933v1.MG772933v1 # 1 KY417149v1.KY417149v1 # 1 KY417146v1.KY417146v1 # 1 KY417143v1.KY417143v1 # 1 JX993987v1.JX993987v1 # 1 GQ153542v1.GQ153542v1 ssh ku cd /hive/data/genomes/wuhCor1/bed/multiz44way mkdir run maf cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn ls ../mafLinks | sed -e 's/.maf.gz//; s/NC_045512v2.//' > maf.list printf '#LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/wuhCor1/bed/multiz44way/maf/$(root1).maf} #ENDLOOP ' > template printf '#!/bin/csh -ef set db = NC_045512v2 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/wuhCor1/bed/multiz44way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../tree.nh ../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //;" species.list`) set in = $pairs/$db.$s.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result.maf /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp ' > autoMultiz.csh gensub2 maf.list single template jobList para create jobList para try ... check ... push para time # Completed: 43 of 43 jobs # CPU time in finished jobs: 6254s 104.23m 1.74h 0.07d 0.000 y # IO & Wait Time: 117s 1.95m 0.03h 0.00d 0.000 y # Average job time: 148s 2.47m 0.04h 0.00d # Longest finished job: 152s 2.53m 0.04h 0.00d # Submission to last job: 170s 2.83m 0.05h 0.00d # put the results back together into a single file, and fixup the s line names # the first sed is duplicating the sequence name on the s lines, transforming # from: # s MN996528v1 0 1 + 29891 A # to # s MN996528v1.MN996528v1 0 1 + 29891 A # and the ones that belong to the reference: # s NC_045512v2 0 1 + 29903 A # become via the second sed: # s wuhCor1.NC_045512v2 cd /hive/data/genomes/wuhCor1/bed/multiz44way head -1 maf/NC_004718v3.maf > multiz44way.maf for F in maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} | sed -e 's#^s \([A-Z0-9a-z_]*\)#s \1.\1#;' done | sed -e 's#^s NC_045512v2.NC_045512v2#s wuhCor1.NC_045512v2#;' \ >> multiz44way.maf tail -1 maf/NC_004718v3.maf >> multiz44way.maf # -rw-rw-r-- 1 63426765 Mar 13 20:23 multiz44way.maf # scan names to verify sanity: grep "^s " multiz44way.maf | awk '{print $2}' | sort | uniq -c \ | sort -rn | sed -e 's/^/# /;' | less # 3956 wuhCor1.NC_045512v2 # 3784 MG772933v1.MG772933v1 # 3741 MG772934v1.MG772934v1 # 3569 MN996532v1.MN996532v1 # 3526 DQ022305v2.DQ022305v2 ... # 2666 KY770858v1.KY770858v1 # 2666 JX993987v1.JX993987v1 # 2494 JX993988v1.JX993988v1 # 2064 KJ473816v1.KJ473816v1 # 1978 KJ473815v1.KJ473815v1 # Load into database ssh hgwdev cd /hive/data/genomes/wuhCor1/bed/multiz44way mkdir /gbdb/wuhCor1/multiz44way ln -s `pwd`/multiz44way.maf /gbdb/wuhCor1/multiz44way cd /dev/shm time hgLoadMaf wuhCor1 multiz44way # Loaded 3956 mafs in 1 files from /gbdb/wuhCor1/multiz44way # real 0m0.667s # merge that into a single block: cd /hive/data/genomes/wuhCor1/bed/multiz44way time mafFrag wuhCor1 multiz44way NC_045512v2 0 29903 + mafFrag.multiz44way.maf # real 0m0.529s printf '#!/usr/bin/env perl use strict; use warnings; my $file = shift; open (FH, "<$file") or die "can not read $file"; while (my $line = ) { if ($line =~ m/^s /) { chomp $line; my @a = split('"'"'\s+'"'"', $line); if (scalar(@a) == 7) { if ($a[1] !~ m/wuhCor1/) { $a[1] = "$a[1].$a[1]"; } $a[6] =~ s/\./-/g; print join(" ", @a), "\\n"; } else { die "ERROR: s line found not 7 fields ?"; } } else { printf "%%s", $line; } } close (FH); ' > dotToDash.pl chmod +x dotToDash.pl ./dotToDash.pl mafFrag.multiz44way.maf > defraged.multiz44way.maf # and reload: rm /gbdb/wuhCor1/multiz44way/multiz44way.maf ln -s `pwd`/defraged.multiz44way.maf \ /gbdb/wuhCor1/multiz44way/multiz44way.maf cd /dev/shm time hgLoadMaf wuhCor1 multiz44way # Loaded 1 mafs in 1 files from /gbdb/wuhCor1/multiz44way # real 0m0.038s # -rw-rw-r-- 1 38 Mar 13 20:35 multiz44way.tab cat multiz44way.tab # 585 NC_045512v2 0 29903 5 29 0.500000 # do NOT need to load MafSummary, this is already small enough, # no summary level needed # construct strain-name track mkdir /cluster/data/wuhCor1/bed/multiz44way/strainName cd /cluster/data/wuhCor1/bed/multiz44way/strainName cut -f1,3 ../acc.date.description.list | sed -e 's/ /_/g;' \ | awk -F$'\t' '{printf "s#%s.%s#%s#g;\n", $1, $1, $2}' > accToName.sed sed -f accToName.sed ../defraged.multiz44way.maf \ | sed -e 's#wuhCor1.SARS-CoV-2/Wuhan-Hu-1#wuhCor1.NC_045512v2#;' \ > strainName.multiz44way.maf rm /gbdb/wuhCor1/multiz44way/strainName44way.maf ln -s `pwd`/strainName.multiz44way.maf \ /gbdb/wuhCor1/multiz44way/strainName44way.maf cd /dev/shm time hgLoadMaf -loadFile=/gbdb/wuhCor1/multiz44way/strainName44way.maf \ wuhCor1 strainName44way ############################################################################## # frames 2020-03-13 - Hiram first attempt, one gene track used genePredSingleCover ../ncbiGene/wuhCor1.ncbiGene.ucsc.genePred.gz stdout \ | genePredToMafFrames wuhCor1 mafFrag.multiz44way.maf \ frames.tab wuhCor1 /dev/stdin wc -l frames.tab # 10 frames.tab hgLoadMafFrames wuhCor1 multiz44wayFrames frames.tab ############################################################################## # braney: build more inclusive single coverage ORF annotation for MAF codon display and for mafSnp display mkdir /hive/data/genomes/wuhCor1/bed/multiz44way/mafFrames cd /hive/data/genomes/wuhCor1/bed/multiz44way/mafFrames cp -p "/hive/data/outside/ncbi/genomes/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_protein.faa.gz" . zgrep "^>" GCF_009858895.2_ASM985889v3_protein.faa.gz >YP_009724389.1 orf1ab polyprotein [Wuhan seafood market pneumonia virus] >YP_009724390.1 surface glycoprotein [Wuhan seafood market pneumonia virus] >YP_009724391.1 ORF3a protein [Wuhan seafood market pneumonia virus] >YP_009724392.1 envelope protein [Wuhan seafood market pneumonia virus] >YP_009724393.1 membrane glycoprotein [Wuhan seafood market pneumonia virus] >YP_009724394.1 ORF6 protein [Wuhan seafood market pneumonia virus] >YP_009724395.1 ORF7a protein [Wuhan seafood market pneumonia virus] >YP_009724396.1 ORF8 protein [Wuhan seafood market pneumonia virus] >YP_009724397.2 nucleocapsid phosphoprotein [Wuhan seafood market pneumonia virus] >YP_009725255.1 ORF10 protein [Wuhan seafood market pneumonia virus] >YP_009725295.1 orf1a polyprotein [Wuhan seafood market pneumonia virus] >YP_009725296.1 ORF7b [Wuhan seafood market pneumonia virus] blat -noHead -t=dnax -q=prot /hive/data/genomes/wuhCor1/wuhCor1.2bit \ GCF_009858895.2_ASM985889v3_protein.faa.gz stdout | pslToBed stdin full.bed cp full.bed hack.bed #edit hack.bed to remove overlap of YP_009725296.1 with YP_009724395.1 # change the 27758 to 27755 and the 129 to 126 diff full.bed hack.bed 12c12 < NC_045512v2 27755 27884 YP_009725296.1 1000 + 27755 27884 0 1 129, 0, --- > NC_045512v2 27758 27884 YP_009725296.1 1000 + 27758 27884 0 1 126, 0, bedToGenePred hack.bed stdout | genePredSingleCover stdin singleCover44way.gp hgLoadGenePred wuhCor1 singleCover44way singleCover44way.gp genePredToMafFrames wuhCor1 ../mafFrag.multiz44way.maf frames.tab wuhCor1 singleCover44way.gp hgLoadMafFrames wuhCor1 multiz44wayFrames frames.tab hgLoadMafFrames wuhCor1 strainName44wayFrames frames.tab ############################################################################## # constructing download files (DONE - 2020-03-13 - Hiram) ############################################################################## mkdir /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way cd /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way cat ../../4d/44way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -noInternal -lineOutput \ /dev/stdin > wuhCor1.44way.nh ln -s ../phastCons44way/nameList44.txt ./wuhCor1.44way.nameList.txt cut -f1,3 wuhCor1.44way.nameList.txt > accession.descriptiveName.tsv cat ../../4d/44way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl -quoteNames \ -nameTranslate=accession.descriptiveName.tsv -noInternal -lineOutput \ -bothNames /dev/stdin > wuhCor1.44way.descriptiveName.nh # These .nh files end up with an extra bit of distance business on # the last line: # "Wigeon CoV HKU20":0.181934):0.1:0.1; # edit that down to just: # "Wigeon CoV HKU20":0.181934); # same for: # NC_016995v1:0.181934); # probably a bug in the binaryTree.pl script # do not need this two column list: rm accession.descriptiveName.tsv cp -p ../../defraged.multiz44way.maf ./wuhCor1.multiz44way.maf gzip wuhCor1.multiz44way.maf # use the README from eboVir3 and edit for circumstances here # generate phylo distance list: /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/all_dists \ wuhCor1.44way.descriptiveName.nh | grep NC_045512v2 \ | sed -e "s#'NC_045512v2/Wuhan-Hu-1'##;" \ | awk -F$'\t' '{printf "%s\t%s\n", $3,$2}' \ > wuhCor1.44way.phyloDistance.txt # construct redmine fileList cd /hive/data/genomes/wuhCor1/bed/multiz44way/downloads find /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1 /gbdb/wuhCor1/multiz44way ! -type d \ | egrep "multiz44way|phastCons44way|phyloP44way" \ > redmine25090.file.list hgsql -e 'show tables;' wuhCor1 | grep 44 > redmine25090.table.list ############################################################################## # constructing download files (DONE - 2020-03-13 - Hiram) ############################################################################## mkdir /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way cd /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way ln -s ../../acc.date.description.list ./wuhCor1.44way.nameList.txt cut -f1,3 wuhCor1.44way.nameList.txt > accession.descriptiveName.tsv # the sed -e 's#:0.1:0.1;#;#;' removes the error output last line # of binaryTree.pl cat ../../4d/44way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl -quoteNames \ -nameTranslate=accession.descriptiveName.tsv -noInternal -lineOutput \ -bothNames /dev/stdin | sed -e 's#:0.1:0.1;#;#;' \ > wuhCor1.44way.descriptiveName.nh # do not need this two column list: rm accession.descriptiveName.tsv cp -p ../../strainName/strainName.multiz44way.maf ./wuhCor1.multiz44way.maf gzip wuhCor1.multiz44way.maf # use the README from the 119-way and edit for circumstances here # generate phylo distance list: cut -f1,3 ../../acc.date.description.list | sed -e 's/ /_/g;' \ | sed -e 's/\./v/;' \ | awk -F$'\t' '{printf "s#%s#%s.%s#g;\n", $1, $1, $2}' > accToName.sed /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/all_dists \ ../../4d/44way.nh | grep NC_045512v2 | cut -f2-3 | sort -k2,2n \ | sed -f accToName.sed > wuhCor1.44way.phyloDistance.txt # do not need to deliver this rm accToName.sed # and the sequences mkdir /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way/sequences cd /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way/sequences # need a file list: cut -f1 ../wuhCor1.44way.nameList.txt | sed -e 's/\./v/; s/$/.fa/;' | xargs echo DQ022305v2.fa DQ071615v1.fa DQ412042v1.fa DQ412043v1.fa DQ648856v1.fa FJ588686v1.fa KY352407v1.fa NC_014470v1.fa GQ153542v1.fa GQ153547v1.fa JX993987v1.fa JX993988v1.fa KF569996v1.fa KF367457v1.fa KU973692v1.fa KY417143v1.fa KY417144v1.fa KY417145v1.fa KY770860v1.fa KJ473812v1.fa KJ473814v1.fa KJ473815v1.fa KJ473816v1.fa KP886808v1.fa KT444582v1.fa KY417146v1.fa KY417147v1.fa KY417148v1.fa KY417149v1.fa KY770858v1.fa MN996532v1.fa KY417142v1.fa KY417151v1.fa KY417152v1.fa MG772934v1.fa KY938558v1.fa MK211374v1.fa MK211375v1.fa MK211376v1.fa MK211377v1.fa MK211378v1.fa MG772933v1.fa NC_004718v3.fa NC_045512v2.fa # using that file list: cd /hive/data/genomes/wuhCor1/bed/lastzStrains/sequences/safeNameFa tar cvzf /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way/sequences/dnaFasta44.tgz DQ022305v2.fa DQ071615v1.fa DQ412042v1.fa DQ412043v1.fa DQ648856v1.fa FJ588686v1.fa KY352407v1.fa NC_014470v1.fa GQ153542v1.fa GQ153547v1.fa JX993987v1.fa JX993988v1.fa KF569996v1.fa KF367457v1.fa KU973692v1.fa KY417143v1.fa KY417144v1.fa KY417145v1.fa KY770860v1.fa KJ473812v1.fa KJ473814v1.fa KJ473815v1.fa KJ473816v1.fa KP886808v1.fa KT444582v1.fa KY417146v1.fa KY417147v1.fa KY417148v1.fa KY417149v1.fa KY770858v1.fa MN996532v1.fa KY417142v1.fa KY417151v1.fa KY417152v1.fa MG772934v1.fa KY938558v1.fa MK211374v1.fa MK211375v1.fa MK211376v1.fa MK211377v1.fa MK211378v1.fa MG772933v1.fa NC_004718v3.fa NC_045512v2.fa # and the proteins cut -f1 ../wuhCor1.44way.nameList.txt | sed -e 's/\./v/; s/$/.faa.gz/;' \ | xargs echo DQ022305v2.faa.gz DQ071615v1.faa.gz DQ412042v1.faa.gz DQ412043v1.faa.gz DQ648856v1.faa.gz FJ588686v1.faa.gz KY352407v1.faa.gz NC_014470v1.faa.gz GQ153542v1.faa.gz GQ153547v1.faa.gz JX993987v1.faa.gz JX993988v1.faa.gz KF569996v1.faa.gz KF367457v1.faa.gz KU973692v1.faa.gz KY417143v1.faa.gz KY417144v1.faa.gz KY417145v1.faa.gz KY770860v1.faa.gz KJ473812v1.faa.gz KJ473814v1.faa.gz KJ473815v1.faa.gz KJ473816v1.faa.gz KP886808v1.faa.gz KT444582v1.faa.gz KY417146v1.faa.gz KY417147v1.faa.gz KY417148v1.faa.gz KY417149v1.faa.gz KY770858v1.faa.gz MN996532v1.faa.gz KY417142v1.faa.gz KY417151v1.faa.gz KY417152v1.faa.gz MG772934v1.faa.gz KY938558v1.faa.gz MK211374v1.faa.gz MK211375v1.faa.gz MK211376v1.faa.gz MK211377v1.faa.gz MK211378v1.faa.gz MG772933v1.faa.gz NC_004718v3.faa.gz NC_045512v2.faa.gz cd /hive/data/genomes/wuhCor1/bed/lastzStrains/sequences/gbkProteins tar cvzf /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way/sequences/proteinFasta44.tgz DQ022305v2.faa.gz DQ071615v1.faa.gz DQ412042v1.faa.gz DQ412043v1.faa.gz DQ648856v1.faa.gz FJ588686v1.faa.gz KY352407v1.faa.gz NC_014470v1.faa.gz GQ153542v1.faa.gz GQ153547v1.faa.gz JX993987v1.faa.gz JX993988v1.faa.gz KF569996v1.faa.gz KF367457v1.faa.gz KU973692v1.faa.gz KY417143v1.faa.gz KY417144v1.faa.gz KY417145v1.faa.gz KY770860v1.faa.gz KJ473812v1.faa.gz KJ473814v1.faa.gz KJ473815v1.faa.gz KJ473816v1.faa.gz KP886808v1.faa.gz KT444582v1.faa.gz KY417146v1.faa.gz KY417147v1.faa.gz KY417148v1.faa.gz KY417149v1.faa.gz KY770858v1.faa.gz MN996532v1.faa.gz KY417142v1.faa.gz KY417151v1.faa.gz KY417152v1.faa.gz MG772934v1.faa.gz KY938558v1.faa.gz MK211374v1.faa.gz MK211375v1.faa.gz MK211376v1.faa.gz MK211377v1.faa.gz MK211378v1.faa.gz MG772933v1.faa.gz NC_004718v3.faa.gz NC_045512v2.faa.gz # and the proteins in tab format: cut -f1 ../wuhCor1.44way.nameList.txt | sed -e 's/\./v/; s/$/.faa.tab.gz/;' \ | xargs echo tar cvzf /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way/sequences/proteinTab44.tgz DQ022305v2.faa.tab.gz DQ071615v1.faa.tab.gz DQ412042v1.faa.tab.gz DQ412043v1.faa.tab.gz DQ648856v1.faa.tab.gz FJ588686v1.faa.tab.gz KY352407v1.faa.tab.gz NC_014470v1.faa.tab.gz GQ153542v1.faa.tab.gz GQ153547v1.faa.tab.gz JX993987v1.faa.tab.gz JX993988v1.faa.tab.gz KF569996v1.faa.tab.gz KF367457v1.faa.tab.gz KU973692v1.faa.tab.gz KY417143v1.faa.tab.gz KY417144v1.faa.tab.gz KY417145v1.faa.tab.gz KY770860v1.faa.tab.gz KJ473812v1.faa.tab.gz KJ473814v1.faa.tab.gz KJ473815v1.faa.tab.gz KJ473816v1.faa.tab.gz KP886808v1.faa.tab.gz KT444582v1.faa.tab.gz KY417146v1.faa.tab.gz KY417147v1.faa.tab.gz KY417148v1.faa.tab.gz KY417149v1.faa.tab.gz KY770858v1.faa.tab.gz MN996532v1.faa.tab.gz KY417142v1.faa.tab.gz KY417151v1.faa.tab.gz KY417152v1.faa.tab.gz MG772934v1.faa.tab.gz KY938558v1.faa.tab.gz MK211374v1.faa.tab.gz MK211375v1.faa.tab.gz MK211376v1.faa.tab.gz MK211377v1.faa.tab.gz MK211378v1.faa.tab.gz MG772933v1.faa.tab.gz NC_004718v3.faa.tab.gz NC_045512v2.faa.tab.gz mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/multiz44way/sequences cd /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/multiz44way/sequences ln -s /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way/sequences/* . cd /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/multiz44way ln -s /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/multiz44way/* . # error OK: # ln: failed to create symbolic link './sequences': File exists # construct redmine fileList cd /hive/data/genomes/wuhCor1/bed/multiz44way/downloads find /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1 /gbdb/wuhCor1/multiz44way ! -type d \ | egrep "multiz44way|phastCons44way|phyloP44way" \ > redmine25187.file.list hgsql -e 'show tables;' wuhCor1 | grep 44 > redmine25187.table.list # the table.list should just be: mafSnpStrainName44way strainName44way strainName44wayFrames strainPhastCons44way strainPhastConsElements44way strainPhyloP44way Not the other 44way tables. Also the file list needed a couple multiz44way files removed.