# for emacs: -*- mode: sh; -*- # This file describes tracks built for the manuscript by Yatish Turakhia, Russ Corbet-Detig # et al. about apparent recurrent mutations (some erroneous, some real) that can cause trouble for # phylogenetic analyses, and comparisons of phylogenetic trees from different groups # (mainly Nextstrain and COG-UK). # Relevant github repos: # https://github.com/yatisht/strain_phylogenetics # https://github.com/lgozasht/COVID-19-Lab-Specific-Bias-Filter ######################################################################### # Lab-associated mutations (DONE - 2020-06-03 - Angie) mkdir /hive/data/genomes/wuhCor1/bed/labAssocMuts cd /hive/data/genomes/wuhCor1/bed/labAssocMuts # Saved file '2020-04-19 - Table_S1_Lab_Associated.tsv' emailed from Landen Gozashti # Convert file to bed & bigBed with labAssocMuts.as: perl -wne 's/[\r\n]//g; @w = split("\t"); next unless (@w and $w[0] =~ m/^[A-Z](\d+)[A-Z]$/); $pos = $1; # Tweak columns to match output of github.com/lgozasht/COVID-19-Lab-Specific-Bias-Filter ($name, $aaChange, $articPrimer, $altCount, $pars, $comment, $maf) = @w; $name =~ s/T/U/; $aaChange =~ s/^AACHANGE=//; $articPrimer =~ s/left/LEFT/g; $articPrimer =~ s/right/RIGHT/g; $articPrimer =~ s/^/nCoV-2019_/ if ($articPrimer ne ""); $articPrimer =~ s/, /,nCoV-2019_/g; print join("\t", "NC_045512v2", ($pos-1), $pos, $name, $pars, $altCount, $maf, $articPrimer, $aaChange, $comment) . "\n";' \ '2020-04-19 - Table_S1_Lab_Associated.tsv' \ | sort -k2n,2n \ > labAssocMutsNs419.bed bedToBigBed -tab -type=bed4+9 -as=$HOME/kent/src/hg/lib/labAssocMuts.as \ labAssocMutsNs419.bed ../../chrom.sizes labAssocMutsNs419.bb # Install in /gbdb mkdir /gbdb/wuhCor1/phylogenetics/ ln -s `pwd`/labAssocMutsNs419.bb /gbdb/wuhCor1/phylogenetics/ ######################################################################### # SARS-CoV-2 PHYLOGENY (DONE - 2020-09-03 - Angie) (sampleColorFile added 2020-09-27) # First done 2020-07-15; updated 2020-07-22 with tree from Yatish Turakhia, collapsed to keep # only nodes with associated mutations. # Updated 2020-09-03 with latest release 28-08-20. # NOTE: updates from 2020-11 onward are driven by scripts in hg/utils/otto/sarscov2phylo/ . releaseLabel=28-08-20 mkdir -p /hive/data/genomes/wuhCor1/bed/sarsCov2Phylo/$releaseLabel cd /hive/data/genomes/wuhCor1/bed/sarsCov2Phylo/$releaseLabel wget https://github.com/roblanf/sarscov2phylo/releases/download/28-08-20/ft_SH.tree # aln_global_unmasked.fa.xz emailed from Rob # Figure out what fasta sequences are not in the tree and therefore need to be excluded from VCF: xzcat aln_global_unmasked.fa.xz \ | grep ^\> \ | sed -re 's/^>//;' \ | sort > fa.ids.sorted sed -re 's/\)[0-9.]+:/\):/g; s/:[0-9e.:-]+//g; s/[\(\);]//g; s/,/\n/g;'" s/'//g;" ft_SH.tree \ | sort > tree.ids.sorted # How many samples? wc -l tree.ids.sorted #59600 tree.ids.sorted # Make sure the number of samples in tree but not fasta is 0 # (if not, probably need to fix sed command): comm -13 fa.ids.sorted tree.ids.sorted | wc -l #0 # Make list of samples in fasta but not tree: comm -23 fa.ids.sorted tree.ids.sorted > rob-$releaseLabel.idsToExclude # Get the name for the reference (Wuhan-Hu-1 or Wuhan/Hu-1): grep EPI_ISL_402125 fa.ids.sorted #hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31 # Run faToVcf. First without -ambiguousToN and -minAc for Yatish Turakhia's tools: xzcat aln_global_unmasked.fa.xz \ | faToVcf stdin gisaid-$releaseLabel.unfiltered.vcf -includeRef \ -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \ -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude #Read 59889 sequences. #Using hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31 as reference. #Excluded 289 sequences named in rob-28-08-20.idsToExclude (59600 sequences remaining including reference) # Remove some redundant parts of the very long concatenated IDs. perl -wpi -e 's@hCoV-19/@@g; s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.unfiltered.vcf ls -l gisaid-$releaseLabel.unfiltered.vcf #-rw-rw-r-- 1 angie genecats 2155993752 Sep 2 17:59 gisaid-28-08-20.unfiltered.vcf wc -l gisaid-$releaseLabel.unfiltered.vcf #18059 gisaid-28-08-20.unfiltered.vcf gzip -f gisaid-$releaseLabel.unfiltered.vcf # Cross-check number of samples in tree with VCF sample IDs: zcat gisaid-$releaseLabel.unfiltered.vcf.gz | head | g ^#CHROM | cut -f 10- | wc -w #59600 # Use -ambiguousToN to avoid complaints about ambiguous alleles from display code # (also to prevent more situations where it looks like CZB maybe saw an alt, but according # to them they didn't see an alt, it was a low-conf base). xzcat aln_global_unmasked.fa.xz \ | faToVcf stdin gisaid-$releaseLabel.ambigToN.vcf -includeRef \ -ambiguousToN \ -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \ -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude perl -wpi -e 's@hCoV-19/@@g; s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.ambigToN.vcf ls -l gisaid-$releaseLabel.ambigToN.vcf #-rw-rw-r-- 1 angie genecats 1705011225 Sep 2 18:05 gisaid-28-08-20.ambigToN.vcf wc -l gisaid-$releaseLabel.ambigToN.vcf #14278 gisaid-28-08-20.ambigToN.vcf bgzip -f gisaid-$releaseLabel.ambigToN.vcf tabix -p vcf gisaid-$releaseLabel.ambigToN.vcf.gz # Then with -ambiguousToN and -minAc (0.1% = 59600 / 1000 = 60) # for not-terribly-slow browser display: xzcat aln_global_unmasked.fa.xz \ | faToVcf stdin gisaid-$releaseLabel.minAf.001.vcf -includeRef \ -ambiguousToN -minAc=60 \ -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \ -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude perl -wpi -e 's@hCoV-19/@@g; s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.minAf.001.vcf ls -l gisaid-$releaseLabel.minAf.001.vcf #-rw-rw-r-- 1 angie genecats 63014097 Sep 2 18:07 gisaid-28-08-20.minAf.001.vcf wc -l gisaid-$releaseLabel.minAf.001.vcf #509 gisaid-28-08-20.minAf.001.vcf bgzip -f gisaid-$releaseLabel.minAf.001.vcf tabix -p vcf gisaid-$releaseLabel.minAf.001.vcf.gz # Make an even more filtered version with -minAc=596 (1%): xzcat aln_global_unmasked.fa.xz \ | faToVcf stdin gisaid-$releaseLabel.minAf.01.vcf -includeRef \ -ambiguousToN -minAc=596 \ -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \ -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude perl -wpi -e 's@hCoV-19/@@g; s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.minAf.01.vcf ls -l gisaid-$releaseLabel.minAf.01.vcf #-rw-rw-r-- 1 angie genecats 10185589 Sep 2 18:12 gisaid-28-08-20.minAf.01.vcf wc -l gisaid-$releaseLabel.minAf.01.vcf #66 gisaid-28-08-20.minAf.01.vcf bgzip -f gisaid-$releaseLabel.minAf.01.vcf tabix -p vcf gisaid-$releaseLabel.minAf.01.vcf.gz # Shorten tree IDs to match VCF: perl -wpe 's@hCoV-19/@@g; s@\|20(\d\d)@|$1@g; '"s@'@@g;" ft_SH.tree \ > ft_SH.shorterNames.tree # Set the root of the tree to the Wuhan/Hu-1 (NC_045512.2) reference not WIV04: ~/github/newick_utils/src/nw_reroot ft_SH.shorterNames.tree \ 'Wuhan/Hu-1/2019|EPI_ISL_402125|19-12-31' > ft_SH.reroot.nh # Use Yatish Turakhia's usher program to collapse nodes of the tree at which there is # no mutation, but not to condense identical samples into new nodes. #*** NOTE FOR NEXT TIME: use ~angie/github/yatish_usher/build/usher; output file names changed #*** and --print_uncondensed-final-tree changed to --write-uncondensed-final-tree ~angie/github/yatish_strain_phylogenetics/build/usher \ --vcf gisaid-$releaseLabel.unfiltered.vcf.gz \ --tree ft_SH.reroot.nh \ --collapse-final-tree \ --print_uncondensed-final-tree \ | tail -1 \ > ft_SH.collapsed.nh # Get sample metadata from GISAID so we can color samples & branches by lineage. # This requires registering with gisaid.org. Then go to https://www.epicov.org/epi3/frontend, # log in, click Downloads, and click "nextmeta" in the dialog that pops up. That downloads # a local metadata_*.tsv.gz file with timestamp name. # Extract the EPI_ ID and lineage columns: zcat metadata_2020-09-01*.tsv.gz \ | tail -n +2 \ | cut -f3,18,19 \ | sort > metadata_2020-09-01.epiToLineageAndGisaid # Map EPI ID to sample name used in tree & vcf awk -F\| '{print $2 "\t" $0;}' tree.ids.sorted \ | perl -wpe 's@hCoV-19/@@g; s@\|20(\d\d)@|$1@g; '"s@'@@g;" \ | sort > epiToSample # Join on EPI ID to associate tree sample names with lineages. join epiToSample metadata_2020-09-01.epiToLineageAndGisaid -o 1.2,2.2,2.3 > sampleToLineage # Add files with lineage colors and GISAID clade colors (trackDb setting sampleColorFile). # Color choices adapted from Figure 1 of # https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.32.2001410 $HOME/kent/src/hg/makeDb/doc/wuhCor1/cladeLineageColors.pl sampleToLineage \ | gzip > lineageColors.gz $HOME/kent/src/hg/makeDb/doc/wuhCor1/gisaidCladeColors.pl sampleToLineage \ | gzip > gisaidColors.gz # Install files in /gbdb. mkdir -p /gbdb/wuhCor1/sarsCov2Phylo for f in gisaid-$releaseLabel.{ambigToN,minAf.001,minAf.01}.vcf.gz; do destName=$(echo $f | sed -re "s/-$releaseLabel//") ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2Phylo/$destName ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2Phylo/$destName.tbi done ln -sf `pwd`/lineageColors.gz /gbdb/wuhCor1/sarsCov2Phylo/sarscov2phylo.lineageColors.gz ln -sf `pwd`/gisaidColors.gz /gbdb/wuhCor1/sarsCov2Phylo/sarscov2phylo.gisaidColors.gz #TODO ln -sf `pwd`/ft_SH.collapsed.nh \ /gbdb/wuhCor1/sarsCov2Phylo/sarscov2phylo.ft.nh ######################################################################### # SARS-CoV-2 PHYLOGENY - PUBLIC SEQUENCE (DONE - 2020-10-15 - Angie) # NOTE: updates from 2020-11 onward are driven by scripts in hg/utils/otto/sarscov2phylo/ . releaseLabel=28-08-20 cd /hive/data/genomes/wuhCor1/bed/sarsCov2Phylo/$releaseLabel # First, regenerate collapsed tree with latest usher so that branch lengths aren't munged. ~angie/github/yatish_usher/build/usher --vcf gisaid-28-08-20.unfiltered.vcf.gz --tree ft_SH.reroot.nh --collapse-final-tree --write-uncondensed-final-tree mv uncondensed-final-tree.nh ft_SH.collapsed.nh # Use Chris's latest mapping of EPI IDs to public sequence IDs to map tree IDs to public IDs. tawk '{if ($4 == "") { print $1, $2 "|" $3;} else { print $1, $2 "|" $3 "|" $4;} }' \ /hive/users/angie/gisaid/epiToPublicAndDate.2020-10-08 \ | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/' \ > epiToPublicName sed -re 's/[\):][^,]+,/\n/g; s/\(//g; s/,/\n/g; s/\)[0-9:]*;//;' ft_SH.collapsed.nh \ | awk -F"|" '{print $2 "\t" $0;}' | sort > epiToTreeName join -t" " epiToTreeName epiToPublicName | cut -f 2,3 > treeToPublic wc -l treeToPublic #30241 treeToPublic # Use new utils to limit tree and VCF to just the public sequences: phyloRenameAndPrune ft_SH.collapsed.nh treeToPublic ft_SH.collapsed.public.nh sed -re 's/,/,\n/g' ft_SH.collapsed.public.nh | wc -l #30241 vcfRenameAndPrune gisaid-$releaseLabel.ambigToN.vcf.gz treeToPublic \ public-$releaseLabel.ambigToN.vcf head public-$releaseLabel.ambigToN.vcf | grep ^#CHROM | wc # 1 30250 1460345 bgzip public-$releaseLabel.ambigToN.vcf tabix -p vcf public-$releaseLabel.ambigToN.vcf.gz # Make allele-frequency-filtered versions zcat public-$releaseLabel.ambigToN.vcf.gz \ | perl -wne 'if (/^#/) { print; } else { die unless m/^(\S+\t){7}AC=([\d,]+);AN=(\d+)/; ($acVal, $an) = ($2, $3); @acs = sort {$b <=> $a} split(",", $acVal); $maxAc = $acs[0]; $freq = $maxAc / $an; if ($freq >= 0.01) { print; } }' \ > public-$releaseLabel.minAf.01.vcf wc -l public-$releaseLabel.minAf.01.vcf #64 public-$releaseLabel.minAf.01.vcf bgzip public-$releaseLabel.minAf.01.vcf tabix -p vcf public-$releaseLabel.minAf.01.vcf.gz zcat public-$releaseLabel.ambigToN.vcf.gz \ | perl -wne 'if (/^#/) { print; } else { die unless m/^(\S+\t){7}AC=([\d,]+);AN=(\d+)/; ($acVal, $an) = ($2, $3); @acs = sort {$b <=> $a} split(",", $acVal); $maxAc = $acs[0]; $freq = $maxAc / $an; if ($freq >= 0.001) { print; } }' \ > public-$releaseLabel.minAf.001.vcf wc -l public-$releaseLabel.minAf.001.vcf #594 public-$releaseLabel.minAf.001.vcf bgzip public-$releaseLabel.minAf.001.vcf tabix -p vcf public-$releaseLabel.minAf.001.vcf.gz sed -re 's/\|/\\|/g;' treeToPublic \ | tawk '{print "s@" $1 "@" $2 "@;";}' \ > treeToPublic.sed # NOTE FOR NEXT TIME: this sed took 20 minutes! Need a faster substitution method. # ALSO we ended up with some garbage at the end of the file! lineFileUdcMayOpen on local file # somehow ignored the garbage which started with a large block of null characters, but # netLineFileMayOpen on remote compressed file did not ignore the garbage. Fixed 10/21/20. zcat lineageColors.gz | sed -rf treeToPublic.sed | grep -v EPI_ISL_ > publicLineageColors wc -l publicLineageColors #30239 publicLineageColors gzip publicLineageColors # Author credits file... strip GenBank version numbers because NCBI metadata doesn't have those cut -f 2 treeToPublic \ | cut -d \| -f 1 \ | sed -re 's/^([A-Z][A-Z][0-9]{6})\.[0-9]/\1/;' \ | sort > publicIdsInTree # Three categories of IDs, different metadata sources: # CNA0013884 -- from https://bigd.big.ac.cn/ncov/release_genome # * Advanced search, database = CNGBdb # * Select Column button, enable originating & submitting labs [no authors option unfortunately] # * Download Table button # * That saves an Excel .xlsx file; export to TSV, save as cncb.metadata..tsv # LC528232.1 -- from GenBank # * https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=SARS-CoV-2,%20taxid:2697049 # * Download button # * Current table view result --> CSV format, Next button # * Download all records, Next button # * Select Accession and Authors [no labs options unfortunately] # * Download button, save as ncbi.authors.date.csv # England/BIRM-5E2A3/2020 -- from COG-UK # * https://www.ebi.ac.uk/ena/browser/view/PRJEB37886 # * select columns center_name, sample_accession, sample_alias # * Download report: TSV # * file saved to filereport_read_run_PRJEB37886_tsv.txt (extra first column, run_accession) tail -n+2 cncb.metadata.20-10-15.tsv \ | cut -f 2,12,14 \ | egrep -v '^[A-Z][A-Z][0-9]{6}' \ | sed -e 's/"//g; s/$/\tn\/a/;' \ > cncb.credits tail -n+2 ncbi.authors.20-10-15.csv \ | csvToTab \ | tawk '{print $1, "n/a", "n/a", $2;}' \ > ncbi.credits tail -n+2 filereport_read_run_PRJEB37886_tsv.txt \ | tawk '{print $4, $3, $3, "COVID-19 Genomics UK Consortium";}' \ | sed -e 's@^COG-UK/@@;' \ | sort -u \ > cogUk.credits.partialIds grep / publicIdsInTree \ | awk -F/ '{print $2 "\t" $0;}' \ | sort \ > cogUk.partialToFull join -a 2 -e "n/a" -t" " -o 2.2,1.2,1.3,1.4 cogUk.credits.partialIds cogUk.partialToFull \ | tawk '$4 == "n/a" { $4 = "COVID-19 Genomics UK Consortium"; }' \ > cogUk.credits /bin/echo -e "accession\toriginating_lab\tsubmitting_lab\tauthors" > acknowledgements.tsv grep -Fwf publicIdsInTree cncb.credits >> acknowledgements.tsv grep -Fwf publicIdsInTree ncbi.credits >> acknowledgements.tsv grep -Fwf publicIdsInTree cogUk.credits >> acknowledgements.tsv gzip acknowledgements.tsv # Install mkdir /gbdb/wuhCor1/sarsCov2PhyloPub for f in public-$releaseLabel.{ambigToN,minAf.001,minAf.01}.vcf.gz; do destName=$(echo $f | sed -re "s/-$releaseLabel//") ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2Phylo/$destName ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2Phylo/$destName.tbi done ln -s `pwd`/ft_SH.collapsed.public.nh \ /gbdb/wuhCor1/sarsCov2PhyloPub/sarscov2phylo.pub.ft.nh ln -s `pwd`/publicLineageColors.gz \ /gbdb/wuhCor1/sarsCov2PhyloPub/sarscov2phylo.pub.lineageColors.gz ln -s `pwd`/acknowledgements.tsv.gz \ /gbdb/wuhCor1/sarsCov2PhyloPub/acknowledgements.tsv.gz ######################################################################### # PROBLEMATIC SITES (DONE - 2021-11-01 - Angie) # Previously updated 2020-08-26, 2020-12-30, 2021-08-16 today=$(date +%y-%m-%d) mkdir -p /hive/data/genomes/wuhCor1/bed/problematicSites/$today cd /hive/data/genomes/wuhCor1/bed/problematicSites/$today # They call the format VCF, but it is really just VCF-like with extra tab-sep columns. wget https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/problematic_sites_sarsCov2.vcf # Make a bigBed4+. First make sure the columns haven't changed since last time. columns=$(grep ^#CHROM problematic_sites_sarsCov2.vcf) expected=$(cat ../columns.expected) if [ "$columns" != "$expected" ]; then echo "STOP! COLUMNS CHANGED! Change the script." fi perl -we \ 'while (<>) { chomp; chomp; # The header defines some keywords and expanded values; store those for later use. if (/^##\s+(\w+) = (.+)/) { ($key, $val) = ($1, $2); if (exists $expand{$key}) { warn "Clash for key {$key}: |$expand{$key}| vs |$val|\n"; } $expand{$key} = $val; } if (/^#/) { next; } (undef, $pos, undef, $ref, $alt, undef, $filter, $info) = split("\t"); %info = map { ($key, $val) = split("="); $key => $val } split(";", $info); # Used to be columns: $contrib, $exc, $country, $lab, $gene, $aaPos, $aaRef, $aaAlt) # Now in info: SUB (submitter), EXC, SRC_COUNTRY, SRC_LAB, GENE, AA_POS, AA_REF, AA_ALT foreach $tag (qw/SRC_COUNTRY SRC_LAB GENE AA_POS AA_REF AA_ALT/) { if (! exists $info{$tag} || $info{$tag} eq ".") { $info{$tag} = ""; } } # Expand keywords used in a couple of the columns. @contribs = map { $expand{$_} || $_ } split(",", $info{SUB}); @labs = map { $expand{$_} || $_ } split(",", $info{SRC_LAB}); if ($info{SRC_LAB} ne "") { @countries = split(",", $info{SRC_COUNTRY}); if (scalar @labs != scalar @countries) { if (scalar @countries == 1) { for ($i = 1; $i < @labs; $i++) { $countries[$i] = $countries[0]; } } else { die "Differing numbers of countries and labs"; } } else { for ($i = 0; $i < @labs; $i++) { $labs[$i] .= " ($countries[$i])" } } } $info{GENE} =~ s/gene-//g; # Print out one joined record for each sequence of "seq_end" single-base annotations. if (defined $seqEndStart && $info{EXC} !~ /seq_end/) { print join("\t", "NC_045512v2", $seqEndStart, $seqEndEnd, "seq_end", "mask", "", "", $expand{"NDM"}, "", "", "", "", "") . "\n"; $seqEndStart = $seqEndEnd = undef; } if ($info{EXC} eq "seq_end") { if (! defined $seqEndStart) { $seqEndStart = $pos-1; } $seqEndEnd = $pos; } else { print join("\t", "NC_045512v2", $pos-1, $pos, $info{EXC}, $filter, $ref, $alt, join(", ", @contribs), join(", ", @labs), $info{GENE}, $info{AA_POS}, $info{AA_REF}, $info{AA_ALT}) . "\n"; } } if (defined $seqEndStart) { print join("\t", "NC_045512v2", $seqEndStart, $seqEndEnd, "seq_end", "mask", "", "", $expand{"NDM"}, "", "", "", "", "") . "\n"; }' problematic_sites_sarsCov2.vcf \ > problematicSites.bed tawk '$5 == "mask"' problematicSites.bed | cut -f 1-4,6- \ | sort -k2n > problematicSitesMask.bed tawk '$5 == "caution"' problematicSites.bed | cut -f 1-4,6- \ | sort -k2n > problematicSitesCaution.bed # Split into two subtracks so we can color them differently: red for mask, orange for caution bedToBigBed -type=bed4+ -as=$HOME/kent/src/hg/lib/problematicSites.as -tab \ problematicSitesMask.bed ../../../chrom.sizes problematicSitesMask.bb bedToBigBed -type=bed4+ -as=$HOME/kent/src/hg/lib/problematicSites.as -tab \ problematicSitesCaution.bed ../../../chrom.sizes problematicSitesCaution.bb echo "$columns" > ../columns.expected # Install files mkdir -p /gbdb/wuhCor1/problematicSites ln -sf `pwd`/problematicSites{Mask,Caution}.bb /gbdb/wuhCor1/problematicSites/ ######################################################################### # # updates to SARS-CoV-2 PHYLOGENY (both GISAID and PUBLIC SEQUENCE) from # 2020-11 onward are driven by scripts in hg/utils/otto/sarscov2phylo/ . # ######################################################################### # B.1.1.7 variants (DONE - 2021-01-05 - angie) mkdir /hive/data/genomes/wuhCor1/bed/lineageB_1_1_7_US cd /hive/data/genomes/wuhCor1/bed/lineageB_1_1_7_US # I downloaded the first eight U.S. lineage B.1.1.7 sequences from GenBank (MW422255, MW422256) # and GISAID (EPI_ISL_755593, EPI_ISL_755594, EPI_ISL_755595, EPI_ISL_755596, EPI_ISL_755940, # EPI_ISL_755941), used hgPhyloPlace to align them to the genome, place in phylo tree and make # custom tracks including a VCF track with the 8 samples. Copy the custom track VCF and Newick: cp /data/apache/userdata/sessions/fb/AngieHinrichs/0f8b7e37/ct/ct_hgwdev_1761a_38f864.vcf \ lineageB_1_1_7_US_first8.vcf # Run minimap2 to get VCF for deletion variants that hgPhyloPlace ignores: minimap2 --cs NC_045512v2.fa MW422255.1.fa | paftools.js call -L 20000 -f NC_045512v2.fa - #NC_045512v2 11287 . GTCTGGTTTT G #NC_045512v2 21764 . ATACATG A #NC_045512v2 21990 . TTTA T #NC_045512v2 28270 . TA T # Edit lineageB_1_1_7_US_first8.vcf to add those deletion variants and remove no-call SNVs at # overlapping positions. # Add a column for B.1.1.7 consensus variants; default to match reference, then edit to set to alt # allele in specific locations. perl -wne 'chomp; @w = split("\t"); if (@w == 1) { print $_ . "\n"; } elsif ($w[0] eq "#CHROM") { print join("\t", @w[0..8], "B.1.1.7", @w[9..$#w]) . "\n"; } else { if ($w[7] =~ /^(AC=\d+;AN=)(\d+)$/) { $newAN = $2 + 1; $w[7] = "$1$newAN"; } else { die "Unexpected INFO column $w[7]"; } print join("\t", @w[0..8], "0", @w[9..$#w]) . "\n"; }' \ lineageB_1_1_7_US_first8.vcf > tmp # Edit tmp to set genotype value for B.1.1.7 to 1 where appropriate. # Exclude variants with no alt allele (included in hgPhyloPlace VCF only to highlight where a # sample has N's at variants in the big tree; not needed here & not handled quite right by our # VCF display...) tawk '$5 != "*"' tmp > lineageB_1_1_7_US_first8.vcf bgzip -f lineageB_1_1_7_US_first8.vcf tabix -p vcf lineageB_1_1_7_US_first8.vcf.gz # Copy the Newick tree file showing the relationship between the 8 samples: cp /data/apache/userdata/sessions/fb/AngieHinrichs/0f8b7e37/ct/uploadedSamples_hgwdev_16d7d_38f4d0.nwk \ lineageB_1_1_7_US_first8.nwk # Edit the Newick file to add B.1.1.7 consensus. # 2020-01-05: More sequences have appeard in GenBank; replace EPI_ IDs with GenBank accessions # so we can share VCF: zcat lineageB_1_1_7_US_first8.vcf.gz \ | sed -re 's/EPI_ISL_755593/MW430974.1/; s/EPI_ISL_755594/MW430966.1/; s/EPI_ISL_755595/MW430967.1/; s/EPI_ISL_755596/MW430968.1/;' \ > lineageB_1_1_7_US_first8.vcf bgzip -f lineageB_1_1_7_US_first8.vcf tabix -p vcf lineageB_1_1_7_US_first8.vcf.gz sed -re 's/EPI_ISL_755593/MW430974.1/; s/EPI_ISL_755594/MW430966.1/; s/EPI_ISL_755595/MW430967.1/; s/EPI_ISL_755596/MW430968.1/;' \ lineageB_1_1_7_US_first8.nwk > tmp mv tmp lineageB_1_1_7_US_first8.nwk # That VCF file will have to suppressed from download, but we can make a public-only VCF # file to offer for download: zcat lineageB_1_1_7_US_first8.vcf.gz \ | cut -f 1-16 \ | vcfFilter -minAc=1 stdin \ > lineageB_1_1_7_US_first6.vcf bgzip lineageB_1_1_7_US_first6.vcf tabix -p vcf lineageB_1_1_7_US_first6.vcf.gz # 2020-01-05: Sequence just became available from NY; add it in starting with hgPhyloPlace VCF... tail -n+399 /data/apache/trash/ct/ct_pp_hgwdev_angie_bcfa_4f0e10.ct | tawk '$5 != "*"' > tmp # Get deletion lines to add in: zcat lineageB_1_1_7_US_first8.vcf.gz | grep del >> tmp # Edit the deletion lines into the correct places so lines are sorted by position. # Check for other edits to make: zcat lineageB_1_1_7_US_first8.vcf.gz | cut -f 1-9 > /data/tmp/angie/1 cut -f 1-9 tmp > /data/tmp/angie/2 diff /data/tmp/angie/[12] #26c26 #< NC_045512v2 17615 A17615G A G . . AC=5;AN=9 GT #--- #> NC_045512v2 17615 A17615G A G . . AC=6;AN=9 GT # -- aha, the new sequence adds 1 to the AC for A17615G. # But the same variants are there in each file, so we can paste columns and then fix up AC and AN # with vcfFilter. cut -f 18 tmp > tmp2 zcat lineageB_1_1_7_US_first8.vcf.gz \ | paste - tmp2 \ | vcfFilter -minAc=1 stdin \ > lineageB_1_1_7_US_first9.vcf # Edit lineageB_1_1_7_US_first9.vcf to fix the pasted ##fileformat and ##reference header lines. bgzip -f lineageB_1_1_7_US_first9.vcf tabix -p vcf lineageB_1_1_7_US_first9.vcf.gz # Now copy the Newick file: cp /data/apache/trash/ct/uploadedSamples_hgwdev_angie_bcfa_4f0e10.nwk lineageB_1_1_7_US_first9.nwk # Edit the Newick file to add B.1.1.7 consensus. # Ha, I just missed the GenBank release! Well, replace EPI_ISL_767594 with MW440433.1. :) zcat lineageB_1_1_7_US_first9.vcf.gz \ | sed -re 's/EPI_ISL_767594/MW440433.1/;' \ > lineageB_1_1_7_US_first9.vcf bgzip -f lineageB_1_1_7_US_first9.vcf tabix -p vcf lineageB_1_1_7_US_first9.vcf.gz sed -re 's/EPI_ISL_767594/MW440433.1/;' lineageB_1_1_7_US_first9.nwk > tmp mv tmp lineageB_1_1_7_US_first9.nwk # Update the GenBank subset VCF that we can offer for download: zcat lineageB_1_1_7_US_first9.vcf.gz \ | cut -f 1-16,19 \ | vcfFilter -minAc=1 stdin \ > lineageB_1_1_7_US_first7.vcf bgzip lineageB_1_1_7_US_first7.vcf tabix -p vcf lineageB_1_1_7_US_first7.vcf.gz # Install in /gbdb mkdir /gbdb/wuhCor1/lineageB_1_1_7_US ln -s `pwd`/lineageB_1_1_7_US_first*.vcf.gz* /gbdb/wuhCor1/lineageB_1_1_7_US/ ln -s `pwd`/lineageB_1_1_7_US_first*.nwk /gbdb/wuhCor1/lineageB_1_1_7_US/ ######################################################################### # Normalize VCF+tree sample names in 13-11-20 public-all set; it will be basis of # incrementally updated public tree going forward. (DONE - 2021-01-07 - Angie) cd /hive/data/outside/otto/sarscov2phylo/13-11-20 # Names currently used in VCF+tree: zcat public-13-11-20.all.minAf.01.vcf.gz | head | grep ^#CHROM \ | sed -re 's/\t/\n/g;' | tail -n+10 \ > public-13-11-20.all.names wc -l public-13-11-20.all.names #157816 public-13-11-20.all.names # There are some inconsistent name and date formats that we need to clean up e.g. #NorthernIreland/NIRE-102049/2020|Northern_Ireland/NIRE-102049/2020|20-03-23 #Northern_Ireland/NIRE-106F84/2020|2020-05-26 # i.e. gisaidStrain|cogUkName|shortenedDate vs. cogUkName|date # --> Use only cogUkName since that's all we'll get going forward # --> Shorten dates that are not shortened # While we're at it, remove the rev-strand (doh!) CNCB version of GZMU0016 # (Guangzhou/GZMU0016/2020|CNA0013710|20-02-25) because mafft made a garbage alignment # for it, while the GenBank version is fine. awk -F\| '{ if ($2 ~ /^(England|Northern|Scotland|Wales)/) { print $0 "\t" $2 "|" $3; } else { print $0 "\t" $0; } }' \ public-13-11-20.all.names \ | grep -v CNA0013710 \ | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/; s/, /_/g; s/[, ]/_/g;' \ > cleanup.renaming wc -l cleanup.renaming #157815 cleanup.renaming # Make sure there aren't any dups cut -f 1 cleanup.renaming | sort -u | wc -l #157815 cut -f 2 cleanup.renaming | sort -u | wc -l #157815 # Make new VCF+tree with cleaned-up names phyloRenameAndPrune public-13-11-20.all.nh cleanup.renaming public-13-11-20.all.renamed.nh vcfRenameAndPrune public-13-11-20.all.vcf.gz cleanup.renaming stdout \ | bgzip -c > public-13-11-20.all.renamed.vcf.gz # Make a protobuf for UShER/hgPhyloPlace -- mask out problematic sites first.. problematicSitesVcf=/hive/data/genomes/wuhCor1/bed/problematicSites/20-12-30/problematic_sites_sarsCov2.vcf tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf time vcfFilter -excludeVcf=mask.vcf public-13-11-20.all.renamed.vcf.gz \ | gzip -c \ > public-13-11-20.all.renamed.masked.vcf.gz usherDir=~angie/github/usher usher=$usherDir/build/usher time $usher -c -u \ -v public-13-11-20.all.renamed.masked.vcf.gz \ -t public-13-11-20.all.renamed.nh \ -o public-13-11-20.all.renamed.masked.pb #real 21m48.333s # Likewise for the notMasked.pb time $usher -c -u \ -v <(zcat public-13-11-20.all.renamed.vcf.gz) \ -t public-13-11-20.all.renamed.nh \ -o public-13-11-20.all.renamed.notMasked.pb # Now a script in hg/utils/otto/sarscov2phylo/ can use those files as a base # for incrementally adding in new public sequences. ######################################################################### # Remove rev-strand version of GZMU0016 from cncb fasta (DONE - 2021-01-07 - Angie) cd /hive/data/outside/otto/sarscov2phylo/cncb.2020-10-15 grep ^\> cncb.nonGenBank.fasta | sed -re 's/^>//;' | grep CNA0013710 | awk '{print $1;}' \ > exclude.CNA0013710 faSomeRecords -exclude cncb.nonGenBank.fasta exclude.CNA0013710 tmp faSize cncb.nonGenBank.fasta #7754193 bases (35730 N's 7718463 real 7718459 upper 4 lower) in 260 sequences in 1 files #Total size: mean 29823.8 sd 235.7 min 26973 (BetaCoV/Wuhan/WH19002/2019) max 29941 (Guangzhou/GZ8H0005/2020) median 29861 faSize tmp #7724289 bases (35730 N's 7688559 real 7688555 upper 4 lower) in 259 sequences in 1 files #Total size: mean 29823.5 sd 236.1 min 26973 (BetaCoV/Wuhan/WH19002/2019) max 29941 (Guangzhou/GZ8H0005/2020) median 29860 mv tmp cncb.nonGenBank.fasta #########################################################################