######################################################################### ## collecting sequences - (DONE - 2020-03-06 - Hiram) ######################################################################### mkdir -p /hive/data/genomes/wuhCor1/bed/lastzStrains/sequences cd /hive/data/genomes/wuhCor1/bed/lastzStrains/sequences # The SARS-CoV-2.acc.2020-03-06.list is obtained from Entrez with search term: # SARS-CoV-2 # produces 108 sequences as of 06 March With that list, fetch sequences and gbk records: mkdir -p fasta gbk cat SARS-CoV-2.acc.2020-03-06.list | while read acc do if [ ! -s "fasta/${acc}.fa" ]; then wget -O fasta/${acc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$acc" wget -O gbk/${acc}.gbk \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=gb&sendto=on&id=$acc" fi done # The coronaviridae.list was obtained from Entrez with the search string: # Coronaviridae[Organism] AND srcdb_refseq[PROP] NOT wgs[PROP] NOT cellular organisms[ORGN] NOT AC_000001:AC_999999[PACC] # Which was created from the page: # https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=11118 # when asked to show "RefSeq nucleotides" # this is a list of 55 sequences, all RefSeq with NC_ identifiers # With that list, fetch sequences and gbk records: cat coronaviridae.list | while read acc do if [ ! -s "fasta/${acc}.fa" ]; then wget -O fasta/${acc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$acc" wget -O gbk/${acc}.gbk \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=gb&sendto=on&id=$acc" fi done # The china.all.fasta is from: the 'download all sequences' button # https://bigd.big.ac.cn/ncov/release_genome # As well as the listing, from the 'download table' button: # china.sequence.list.txt # Can select the 'complete' sequence list from that table: awk -F$'\t' '$5 == "Complete"' 'china.sequence.list.txt' | cut -f1,2,5 # split of the china.all.fasta record: sed -e 's/^>.* />/;' china.all.fasta > shortNames.china.all.fasta faSplit byname shortNames.china.all.fasta chinaFasta/ # Now, to eliminate the duplicates from those two sources: faToTwoBit fasta/*.fa chinaFasta/*.fa t.2bit twoBitDup t.2bit | awk '{print $1}' | sort > dup.list # extract the non duplicates twoBitToFa t.2bit stdout | faSomeRecords -exclude stdin dup.list nonDups.fa # the duplicates were moved to ncbiDups/ and chinaDups/ directories # need to ensure our reference is in there, it was a duplicate: # NC_045512.2 and MN908947.3 are identical # and the MN908947.3 sequence was used, move those around to get the # NC_045512.2 sequence there instead # some sequences were too short, they were just gene sequences: twoBitInfo t.2bit stdout | awk '$2 < 25000' > too.small # LC522350.1 182 # LC523807.1 357 # MN938385.1 287 # MN938387.1 107 # MN970003.1 290 # MT008022.1 322 # MT042773.1 294 # MT050414.1 562 # MT072667.1 670 # MT072668.1 810 # MT081059.1 1260 # MT081066.1 1260 # MT111895.1 770 # MT111896.1 569 # MT127113.1 615 # MT127114.1 1411 # MT127115.1 1269 # MT127116.1 459 # MT152900.1 322 # NM_001371415.1 3339 # NM_021804.3 3596 # moved to ncbiTooSmall/ directory # Construct UCSC safe sequence names: mkdir -p safeNameFa for F in fasta/*.fa chinaFasta/*.fa do B=`basename $F | sed -e "s/\.\([0-9]\+\)/v\1/; s/-/_/g;"` printf "%s\n" "${B}" # head -12 "${F}" | sed -e 's/^>\([^ ]\+\).*/>\1/;' | sed -e 's/\./v/; s/-/_/g;' sed -e 's/^>\([^ ]\+\).*/>\1/;' "${F}" | sed -e 's/\./v/; s/-/_/g;' \ > safeNameFa/${B} done # this leaves 119 sequences, mostly from NCBI, a few from China: # GWHABKP00000000.fa MT019532v1.fa NC_001846v1.fa NC_018871v1.fa # GWHABKW00000000.fa MT019533v1.fa NC_002306v3.fa NC_019843v3.fa # LC521925.fa MT020781v1.fa NC_002645v1.fa NC_022103v1.fa # LC522972.fa MT027062v1.fa NC_003045v1.fa NC_023760v1.fa # LC522973.fa MT027064v1.fa NC_003436v1.fa NC_025217v1.fa # LC522974.fa MT039873v1.fa NC_004718v3.fa NC_026011v1.fa # LC522975.fa MT039887v1.fa NC_005831v2.fa NC_028752v1.fa # LC528232v1.fa MT039888v1.fa NC_006213v1.fa NC_028806v1.fa # LC528233v1.fa MT039890v1.fa NC_006577v2.fa NC_028811v1.fa # LR757995v1.fa MT044257v1.fa NC_009019v1.fa NC_028814v1.fa # LR757996v1.fa MT044258v1.fa NC_009020v1.fa NC_028824v1.fa # LR757997v1.fa MT049951v1.fa NC_009021v1.fa NC_028833v1.fa # LR757998v1.fa MT066175v1.fa NC_009657v1.fa NC_030292v1.fa # MN938384v1.fa MT066176v1.fa NC_009988v1.fa NC_030886v1.fa # MN975262v1.fa MT072688v1.fa NC_010437v1.fa NC_032107v1.fa # MN985325v1.fa MT093571v1.fa NC_010438v1.fa NC_032730v1.fa # MN988668v1.fa MT093631v1.fa NC_010646v1.fa NC_034440v1.fa # MN988713v1.fa MT106052v1.fa NC_010800v1.fa NC_034972v1.fa # MN994467v1.fa MT106053v1.fa NC_011547v1.fa NC_035191v1.fa # MN994468v1.fa MT106054v1.fa NC_011549v1.fa NC_038294v1.fa # MN996527v1.fa MT118835v1.fa NC_011550v1.fa NC_038861v1.fa # MN996528v1.fa MT123290v1.fa NC_012936v1.fa NC_039207v1.fa # MN996529v1.fa MT123291v1.fa NC_014470v1.fa NC_039208v1.fa # MN996530v1.fa MT123292v1.fa NC_016991v1.fa NC_045512v2.fa # MN996531v1.fa MT123293v1.fa NC_016992v1.fa NMDC60013002_05.fa # MN997409v1.fa MT126808v1.fa NC_016993v1.fa NMDC60013002_06.fa # MT007544v1.fa MT135041v1.fa NC_016994v1.fa NMDC60013002_07.fa # MT019529v1.fa MT135043v1.fa NC_016995v1.fa NMDC60013002_09.fa # MT019530v1.fa MT152824v1.fa NC_016996v1.fa NMDC60013002_10.fa # MT019531v1.fa NC_001451v1.fa NC_017083v1.fa # verify no dups: faToTwoBit safeNameFa/*.fa safeName.2bit twoBitDup safeName.2bit # is silent, and sizes are reasonable: twoBitInfo safeName.2bit stdout | sort -k2,2n | head -2 # NC_039208v1 25425 # NC_035191v1 25995 twoBitInfo safeName.2bit stdout | sort -k2,2n | tail -2 # NC_025217v1 31491 # NC_010646v1 31686 ######################################################################### ## kmer calculations for phylo tree generation (DONE - 2020-03-06 - Hiram) ######################################################################### mkdir /hive/data/genomes/wuhCor1/bed/lastzStrains/kmers cd /hive/data/genomes/wuhCor1/bed/lastzStrains/kmers # construct a parasol job list: ls ../sequences/safeNameFa | sed -e 's/.fa//;' | while read acc do printf "kmerOne %s {check out exists+ kmers/%s/%s.31mers.profile.txt.gz}\n" "${acc}" "${acc}" "${acc}" done > jobList ### The kmerOne script run in that jobList is: ############################################## #!/bin/bash set -beEu -o pipefail export SORT="$HOME/bin/x86_64/gnusort -S32G --parallel=4 -T /dev/shm" acc=$1 fa="../sequences/safeNameFa/$acc.fa" if [ ! -s "kmers/${acc}/${acc}.31mers.profile.txt.gz" ]; then mkdir -p "kmers/${acc}" ./kmerPrint.pl 31 ${fa} | gzip -c > kmers/${acc}/${acc}.31mers.txt.gz zcat kmers/${acc}/${acc}.31mers.txt.gz | cut -f1 | ${SORT} | uniq -c \ | awk '{printf "%s\t%d\n", $2, $1}' | ${SORT} | gzip -c > kmers/${acc}/${acc}.31mers.profile.txt.gz rm -f kmers/${acc}/${acc}.31mers.txt.gz fi ############################################## ### Takes about 3 minutes to run that parasol job on hgwdev ### After the kmers are done, compare all by all ### Construct a parasol job list to do this with the script: ~/kent/src/hg/makeDb/doc/wuhCor1/mkCompareJobList.pl > compare.jobList ### That jobList uses the script comparePair cp -p ~/kent/src/hg/makeDb/doc/wuhCor1/comparePair ./ ### Takes 10 minutes to run on hgwdev: # Completed: 7021 of 7021 jobs # CPU time in finished jobs: 1367s 22.78m 0.38h 0.02d 0.000 y # IO & Wait Time: 17245s 287.42m 4.79h 0.20d 0.001 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 6s 0.10m 0.00h 0.00d # Submission to last job: 618s 10.30m 0.17h 0.01d ### Collect all the comparisons together into a single file: find ./compare.31 -type f | xargs cat > allByAll.kmers31.compared.txt ### That is a four column list: 1. target sequence name 2. perCent of target sequence matching to query 3. query sequence name 4. perCent of query sequence matching to target ### The number for 'matching to query' and 'matching to target' is the ### same number: basesMatched, it is just different denominators: $targetPercent = 100.0 * $basesMatched / $targetSize; $queryPercent = 100.0 * $basesMatched / $querySize; ### For example: # LR757996v1 99.7755 MT123290v1 99.6986 # LR757996v1 99.8760 MT106053v1 99.8292 # LR757996v1 99.7755 MN994468v1 99.7253 ### Using that four column file, construct an all by all matrix, using ### the matrixUp.pl script: ~/kent/src/hg/makeDb/doc/wuhCor1/matrixUp.pl ~/kent/src/hg/makeDb/doc/wuhCor1/matrixUp.pl allByAll.kmers31.compared.txt \ > data31mer.csv mv matrixKey.txt matrixKey.31mer.txt ### that matrixKey is the row and column names, the data31mer.csv matrix ### has not column or row header names. ### Now, using the phylip 'neighbor' program to do neighbor joining of that ### matrix into a phylo tree: mkdir /hive/data/genomes/wuhCor1/bed/lastzStrains/kmers/phylip cd /hive/data/genomes/wuhCor1/bed/lastzStrains/kmers/phylip ### This 'neighbor' command is very limited, it can only handle 'names' ### that are integers and not longer that 10 digits. The names are ### shortened by the mkTestData.pl script: ~/kent/src/hg/makeDb/doc/wuhCor1/mkTestData.pl ### I realize now, that script is overly complicated, it should simply assign ### sequence integers to each name and output the correspondence table rather ### than trying to shorten the names magically. Way too much work. ### Done the hard way: ~/kent/src/hg/makeDb/doc/wuhCor1/mkTestData.pl 31 > infile \ 2> name.31mer.translate.txt ### The 'neighbor' command is an interactive command, it expects to ### find the input file: 'infile' which was just made by mkTestData.pl ### it displays an interactive menu, use the 'N' selection command to select ### UPGMA type of joining, see also additional comments of how this was run ### in ../staAur2/multiz369way.txt ### After neighbor is done, rename the results, and restore the sequence names in the final upgma.31mer.nh tree: if [ -s outfile ]; then rm -f outfile.31mer mv outfile outfile.31mer fi if [ -s outtree ]; then rm -f outtree.31mer mv outtree outtree.31mer fi # Note the duplicate sequence MN908947v3 was used instead of NC_045512v2 # fixup the name with the sed cat outtree.31mer \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -nameTranslate=name.31mer.translate.txt -noInternal -lineOutput /dev/stdin \ | sed -e 's/MN908947v3/NC_045512v2/;' > upgma.31mer.nh # Using the TreeGraph editor on the Mac, rework the tree to get the # reference NC_045512v2 at the top of the tree, this makes the tree: ~/kent/src/hg/makeDb/doc/wuhCor1/wuhCor1.119way.nh ######################################################################### ## setting up multiz mkdir /hive/data/genomes/wuhCor1/bed/multiz119way cd /hive/data/genomes/wuhCor1/bed/multiz119way # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../../goldenPath/multiz119way/sequenceNames.119way.nh \ | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # all the nets are the same, can use ordinary mafNet: # bash shell syntax here ... cd /hive/data/genomes/wuhCor1/bed/multiz119way export H=/hive/data/genomes/wuhCor1/bed mkdir mafLinks # the grep -v NC_045512v2 eliminates the self alignment ls -d ../lastzStrains/runDir/lastz.* | grep -v NC_045512v2 | while read D do if [ -d "${D}" ]; then asmId=`basename $D | sed -e 's/lastz.//;'` mafNet="${D}/mafNet/NC_045512v2.maf.gz" if [ ! -f "${mafNet}" ]; then echo "ERROR: can not find mafNet file ${mafNet}" 1>&2 exit 255 fi echo ln -s ../$mafNet mafLinks/NC_045512v2.$asmId.maf.gz ln -s ../$mafNet mafLinks/NC_045512v2.$asmId.maf.gz fi done # verify the symLinks are good: ls -ogrtL mafLinks/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' | head -4 # 17193 Mar 9 17:09 mafLinks/NC_045512v2.GWHABKP00000000.maf.gz # 12617 Mar 9 17:17 mafLinks/NC_045512v2.NC_038294v1.maf.gz # 11529 Mar 9 17:17 mafLinks/NC_045512v2.NC_012936v1.maf.gz # 12046 Mar 9 17:20 mafLinks/NC_045512v2.NC_034440v1.maf.gz ls -ogrtL mafLinks/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' | tail -4 # 11733 Mar 9 17:38 mafLinks/NC_045512v2.NC_017083v1.maf.gz # 8549 Mar 9 17:38 mafLinks/NC_045512v2.NC_016991v1.maf.gz # 13176 Mar 9 17:38 mafLinks/NC_045512v2.MT039888v1.maf.gz # 13143 Mar 9 17:39 mafLinks/NC_045512v2.NMDC60013002_06.maf.gz # leaves 118 files: ls -ogrtL mafLinks/*.maf.gz | wc -l # 118 # scan the names to verify sanity: zcat mafLinks/*.maf.gz | grep "^s " | awk '{print $2}' | sort \ | uniq -c | sort -rn | sed -e 's/^/# /;' | less # should look like: # 775 NC_045512v2.NC_045512v2 # 22 NC_025217v1.NC_025217v1 # 22 LR757997v1.LR757997v1 # 21 NC_016991v1.NC_016991v1 # 21 GWHABKW00000000.GWHABKW00000000 # 18 NC_012936v1.NC_012936v1 # ... # 1 LC528232v1.LC528232v1 # 1 LC522975.LC522975 # 1 LC522974.LC522974 # 1 LC522973.LC522973 # 1 LC522972.LC522972 # 1 LC521925.LC521925 # 1 GWHABKP00000000.GWHABKP00000000 ssh ku cd /hive/data/genomes/wuhCor1/bed/multiz119way mkdir run maf cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn ls ../mafLinks | sed -e 's/.maf.gz//; s/NC_045512v2.//' > maf.list printf '#LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/wuhCor1/bed/multiz119way/maf/$(root1).maf} #ENDLOOP ' > template printf '#!/bin/csh -ef set db = NC_045512v2 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/wuhCor1/bed/multiz119way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../tree.nh ../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //;" species.list`) set in = $pairs/$db.$s.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp ' > autoMultiz.csh gensub2 maf.list single template jobList para create jobList para try ... check ... push para time # Completed: 118 of 118 jobs # CPU time in finished jobs: 61696s 1028.27m 17.14h 0.71d 0.002 y # IO & Wait Time: 338s 5.63m 0.09h 0.00d 0.000 y # Average job time: 526s 8.76m 0.15h 0.01d # Longest finished job: 555s 9.25m 0.15h 0.01d # Submission to last job: 580s 9.67m 0.16h 0.01d # put the results back together into a single file, and fixup the s line names # the first sed is duplicating the sequence name on the s lines, transforming # from: # s MN996528v1 0 1 + 29891 A # to # s MN996528v1.MN996528v1 0 1 + 29891 A # and the ones that belong to the reference: # s NC_045512v2 0 1 + 29903 A # become via the second sed: # s wuhCor1.NC_045512v2 cd /hive/data/genomes/wuhCor1/bed/multiz119way head -1 maf/NC_018871v1.maf > multiz119way.maf for F in maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} | sed -e 's#^s \([A-Z0-9a-z_]*\)#s \1.\1#;' done | sed -e 's#^s NC_045512v2.NC_045512v2#s wuhCor1.NC_045512v2#;' \ >> multiz119way.maf tail -1 maf/NC_018871v1.maf >> multiz119way.maf # -rw-rw-r-- 1 950715307 Mar 9 22:34 multiz119way.maf # scan names to verify sanity: grep "^s " multiz119way.maf | awk '{print $2}' | sort | uniq -c \ | sort -rn | sed -e 's/^/# /;' | less # 132042 wuhCor1.NC_045512v2 # 132042 MT135043v1.MT135043v1 # 132042 MT135041v1.MT135041v1 # 132042 MT049951v1.MT049951v1 # 132042 MT039890v1.MT039890v1 # 132042 MT007544v1.MT007544v1 # 131570 MT019531v1.MT019531v1 ... # 37288 NC_011547v1.NC_011547v1 # 33630 NC_016992v1.NC_016992v1 # 28202 NC_016993v1.NC_016993v1 # 25724 NC_039208v1.NC_039208v1 # Load into database ssh hgwdev cd /hive/data/genomes/wuhCor1/bed/multiz119way mkdir /gbdb/wuhCor1/multiz119way ln -s `pwd`/multiz119way.maf /gbdb/wuhCor1/multiz119way cd /dev/shm time hgLoadMaf wuhCor1 multiz119way # Loaded 132042 mafs in 1 files from /gbdb/wuhCor1/multiz119way # real 0m6.780s # merge that into a single block: cd /hive/data/genomes/wuhCor1/bed/multiz119way time mafFrag wuhCor1 multiz119way NC_045512v2 0 29903 + mafFrag.multiz119way.maf # real 0m12.119s printf '#!/usr/bin/env perl use strict; use warnings; my $file = shift; open (FH, "<$file") or die "can not read $file"; while (my $line = ) { if ($line =~ m/^s /) { chomp $line; my @a = split("\s+", $line); if (scalar(@a) == 7) { if ($a[1] !~ m/wuhCor1/) { $a[1] = "$a[1].$a[1]"; } $a[6] =~ s/\./-/g; print join(" ", @a), "\n"; } else { die "ERROR: s line found not 7 fields ?"; } } else { printf "%s", $line; } } close (FH); ' > dotToDash.pl chmod +x dotToDash.pl ./dotToDash.pl mafFrag.multiz119way.maf > defraged.multiz119way.maf # and reload: rm /gbdb/wuhCor1/multiz119way/multiz119way.maf ln -s `pwd`/defraged.multiz119way.maf \ /gbdb/wuhCor1/multiz119way/multiz119way.maf cd /dev/shm time hgLoadMaf wuhCor1 multiz119way # Loaded 1 mafs in 1 files from /gbdb/wuhCor1/multiz119way # real 0m0.056s # construct strain-name track 2020-04-30 - Hiram mkdir /hive/data/genomes/wuhCor1/bed/multiz119way/strainName cd /hive/data/genomes/wuhCor1/bed/multiz119way/strainName cut -f1,3 ~/kent/src/hg/makeDb/doc/wuhCor1/nameList119.txt | sed -e 's/ /_/g;' \ | awk -F$'\t' '{printf "s#%s.%s#%s#g;\n", $1, $1, $2}' > accToName.sed cat ../defraged.multiz119way.maf \ | sed -f accToName.sed > strainName119way.maf mkdir /gbdb/wuhCor1/strainName119way rm /gbdb/wuhCor1/strainName119way/strainName119way.maf ln -s `pwd`/strainName.multiz119way.maf \ /gbdb/wuhCor1/strainName119way/strainName119way.maf cd /dev/shm time hgLoadMaf -loadFile=/gbdb/wuhCor1/multiz119way/strainName119way.maf \ wuhCor1 strainName119way # -rw-rw-r-- 1 39 Apr 30 21:38 strainName119way.tab cat strainName119way.tab 585 NC_045512v2 0 29903 14 29 0.500000 rm strainName119way.tab ############################################################################## # frames 2020-03-09 - Hiram first attempt, one gene track used genePredSingleCover ../ncbiGene/wuhCor1.ncbiGene.ucsc.genePred.gz stdout \ | genePredToMafFrames wuhCor1 mafFrag.multiz119way.maf \ frames.tab wuhCor1 /dev/stdin hgLoadMafFrames wuhCor1 multiz119wayFrames frames.tab ############################################################################## # braney: build more inclusive single coverage ORF annotation for MAF codon display and for mafSnp display mkdir /hive/data/genomes/wuhCor1/bed/multiz119way/mafFrames cd /hive/data/genomes/wuhCor1/bed/multiz119way/mafFrames wget "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_protein.faa.gz" blat -noHead -t=dnax -q=prot /hive/data/genomes/wuhCor1/wuhCor1.2bit GCF_009858895.2_ASM985889v3_protein.faa.gz stdout | pslToBed stdin full.bed cp full.bed hack.bed #edit hack.bed to remove overlap of YP_009725296.1 with YP_009724395.1 cd /hive/data/genomes/wuhCor1/bed/multiz119way/mafFrames bedToGenePred hack.bed stdout | genePredSingleCover stdin singleCover.gp hgLoadGenePred wuhCor1 singleCover singleCover.gp genePredToMafFrames wuhCor1 ../mafFrag.multiz119way.maf frames.tab wuhCor1 singleCover.gp hgLoadMafFrames wuhCor1 multiz119wayFrames frames.tab hgLoadMafFrames wuhCor1 strainName119wayFrames frames.tab ############################################################################## # constructing download files (DONE - 2020-03-13 - Hiram) ############################################################################## mkdir /hive/data/genomes/wuhCor1/bed/multiz119way/downloads/multiz119way cd /hive/data/genomes/wuhCor1/bed/multiz119way/downloads/multiz119way cat ../../4d/119way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \ -noInternal -lineOutput \ /dev/stdin > wuhCor1.119way.nh ln -s ../phastCons119way/nameList119.txt ./wuhCor1.119way.nameList.txt cut -f1,3 wuhCor1.119way.nameList.txt > accession.descriptiveName.tsv cat ../../4d/119way.nh \ | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \ | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl -quoteNames \ -nameTranslate=accession.descriptiveName.tsv -noInternal -lineOutput \ -bothNames /dev/stdin > wuhCor1.119way.descriptiveName.nh # These .nh files end up with an extra bit of distance business on # the last line: # "Wigeon CoV HKU20":0.181934):0.1:0.1; # edit that down to just: # "Wigeon CoV HKU20":0.181934); # same for: # NC_016995v1:0.181934); # probably a bug in the binaryTree.pl script # do not need this two column list: rm accession.descriptiveName.tsv cp -p ../../defraged.multiz119way.maf ./wuhCor1.multiz119way.maf gzip wuhCor1.multiz119way.maf # use the README from eboVir3 and edit for circumstances here # generate phylo distance list: /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/all_dists \ wuhCor1.119way.descriptiveName.nh | grep NC_045512v2 \ | sed -e "s#'NC_045512v2/Wuhan-Hu-1'##;" \ | awk -F$'\t' '{printf "%s\t%s\n", $3,$2}' \ > wuhCor1.119way.phyloDistance.txt # construct redmine fileList cd /hive/data/genomes/wuhCor1/bed/multiz119way/downloads find /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1 /gbdb/wuhCor1/multiz119way ! -type d \ | egrep "multiz119way|phastCons119way|phyloP119way" \ > redmine25090.file.list hgsql -e 'show tables;' wuhCor1 | grep 119 > redmine25090.table.list