######################################################################### # fetch sequences from Entrez (DONE - 2017-06-07 - Hiram) mkdir /hive/data/genomes/hbv1/bed/multipleAlignment cd /hive/data/genomes/hbv1/bed/multipleAlignment # obtain list of accessions with the search in Nucleotide: # Hepatitis B complete # set limit to 200 (the maximum), view 'accessions' # screen scrape the list into the file: # accession.list head accession.list AF384372.1 AF384371.1 ... etc ... # running this script to fetch this set: #!/bin/sh mkdir fasta gbk cat accession.list | while read acc do wget -O fasta/${acc}.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$acc" wget -O gbk/${acc}.gbk \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=gb&sendto=on&id=$acc" done # create a fasta file of them all with simple names: cd fasta sed -e 's/ .*//;' *.fa > ../hbv120.fa cd /hive/data/genomes/hbv1/bed/multipleAlignment # check for duplicates faToTwoBit hbv120.fa 200.2bit twoBitDup 200.2bit AB775201.1 and AB775200.1 are identical ######################################################################### # lastz the 200 sequences (DONE - 2017-06-07 - Hiram) mkdir /hive/data/genomes/hbv1/bed/multipleAlignment/lastz cd /hive/data/genomes/hbv1/bed/multipleAlignment/lastz faToTwoBit ../hbv120.fa hbv120.2bit target="/hive/data/genomes/hbv1/hbv1.2bit" query="/hive/data/genomes/hbv1/bed/multipleAlignment/lastz/hbv120.2bit" axtResult="hbv1.hbv120.axt" pslResult="hbv1.hbv120.psl" twoBitInfo $target stdout | sort -k2nr > target.chrom.sizes twoBitInfo $query stdout | sort -k2nr > query.chrom.sizes /cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz \ $target[multiple] $query H=2000 --format=axt+ > "${axtResult}" axtToPsl "${axtResult}" target.chrom.sizes query.chrom.sizes \ "${pslResult}" # only 185 sequences pass the pslScore test pslScore "${pslResult}" | awk '$5 > 2000' > filtered.2000 cut -f4 filtered.2000 | sed -e 's/:.*//;' \ > filtered.query.in.order.txt # this listToTree.pl takes the ordered list of sequence names, # ordered by pslScore, highest first, and converts that to a binary # .nh tree format for multiz ./listToTree.pl > 185way.nh where listToTree.pl is: #!/usr/bin/env perl use strict; use warnings; my $treeDepth = 0; my $specCount = 0; my $nameCount = `cat filtered.query.in.order.txt | wc -l`; chomp $nameCount; my $treeOut = "NC_003977v2:0.1"; ++$treeDepth; open (FH, ") { chomp $line; $line =~ s/\./v/; if ($treeDepth < 2) { $treeOut = sprintf("%s,%s:0.1):0.1,\n", $treeOut, $line); } else { if ($treeDepth < $nameCount) { $treeOut = sprintf("%s%s:0.1):0.1,\n", $treeOut, $line); } else { for (my $i = 0; $i < $treeDepth; ++$i) { printf "("; } printf "\n%s%s:0.1);\n", $treeOut, $line; } } ++$treeDepth; } close (FH); ######################################################################### # multiple alignment (DONE - 2017-06-07 - Hiram) mkdir /hive/data/genomes/hbv1/bed/multipleAlignment/multiz cd /hive/data/genomes/hbv1/bed/multipleAlignment/multiz echo NC_003977v2 > species.list cp -p ../lastz/185way.nh . rm -f species.list /cluster/home/hiram/kent/src/hg/utils/phyloTrees/speciesList.sh \ 185way.nh | sed -e 's/\./v/;' >> species.list sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 185way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh # rearrange the axt results from the lastz run into maf files # with appropriate target and query sequence names to work with 'multiz' export srcAxt="../lastz/hbv1.hbv120.axt" export targetSizes="../lastz/target.chrom.sizes" export querySizes="../lastz/query.chrom.sizes" axtSwap $srcAxt $targetSizes $querySizes swapped.hbv120.axt mkdir queryAxt axtSplitByTarget swapped.hbv120.axt queryAxt mkdir targetAxt for F in queryAxt/*.axt do B=`basename $F` axtSwap ${F} $querySizes $targetSizes targetAxt/${B} done mkdir queryMaf for F in targetMaf/*.axt do B=`basename $F | sed -e 's/.axt/.maf/;'` axtToMaf -qPrefix=qry -tPrefix=targ ${F} ${targetSizes} \ ${querySizes} queryMaf/${B} done mkdir mafLinks for F in queryMaf/*.maf do B=`basename $F | sed -e 's/.maf//;'` noDots=`echo $B | sed -e 's/\./v/;'` printf "%s\n" "${B}" sed -e "s/targNC_003977v2/NC_003977v2.NC_003977v2/; s/qry${B}/${noDots}.${noDots}/;" ${F} > mafLinks/NC_003977v2.${noDots}.maf done mkdir /hive/data/genomes/hbv1/bed/multipleAlignment/multiz/run mkdir /hive/data/genomes/hbv1/bed/multipleAlignment/multiz/maf cd /hive/data/genomes/hbv1/bed/multipleAlignment/multiz/run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn join <(sort ../species.list) <(ls ../mafLinks | sed -e 's/.maf//; s/NC_003977v2.//' | sort) > maf.list printf '#LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/hbv1/bed/multipleAlignment/multiz/maf/$(root1).maf} #ENDLOOP ' > template printf '#!/bin/csh -ef set db = NC_003977v2 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/hbv1/bed/multipleAlignment/multiz/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../tree.nh ../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/grep -v "$db" species.list`) set in = $pairs/$db.$s.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp ' > autoMultiz.csh chmod +x autoMultiz.csh ssh ku cd /hive/data/genomes/hbv1/bed/multipleAlignment/multiz/run gensub2 maf.list single template jobList para create jobList para try ... check ... push para time # Completed: 184 of 184 jobs # CPU time in finished jobs: 56969s 949.49m 15.82h 0.66d 0.002 y # IO & Wait Time: 940s 15.66m 0.26h 0.01d 0.000 y # Average job time: 315s 5.25m 0.09h 0.00d # Longest finished job: 320s 5.33m 0.09h 0.00d # Submission to last job: 643s 10.72m 0.18h 0.01d # back on hgwdev cd /hive/data/genomes/hbv1/bed/multipleAlignment/multiz # put the results back together into a single file: head -1 maf/AB775198v1.maf > multiz185way.maf for F in maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} | sed -e 's#^s \([A-Z0-9a-z_]*\)#s \1.\1#;' done | sed -e 's#^s NC_003977v2.NC_003977v2#s hbv1.NC_003977v2#;' \ >> multiz185way.maf tail -1 maf/AB775198v1.maf >> multiz185way.maf printf '#!/usr/bin/env perl use strict; use warnings; my $file = shift; open (FH, "<$file") or die "can not read $file"; while (my $line = ) { if ($line =~ m/^s /) { chomp $line; my @a = split("\s+", $line); if (scalar(@a) == 7) { if ($a[1] !~ m/hbv1/) { $a[1] = "$a[1].$a[1]"; } $a[6] =~ s/\./-/g; print join(" ", @a), "\\n"; } else { die "ERROR: s line found not 7 fields ?"; } } else { printf "%%s", $line; } } close (FH); ' > dotToDash.pl chmod +x dotToDash.pl cd /hive/data/genomes/hbv1/bed/multipleAlignment/multiz mkdir /gbdb/hbv1/multiz185way ln -s `pwd`/multiz185way.maf /gbdb/hbv1/multiz185way/multiz185way.maf cd /dev/shm hgLoadMaf hbv1 multiz185way cd /hive/data/genomes/hbv1/bed/multipleAlignment/multiz mafFrag hbv1 multiz185way NC_003977v2 0 3182 + mafFrag.multiz185way.maf ./dotToDash.pl mafFrag.multiz185way.maf > defraged.multiz185way.maf rm -f /gbdb/hbv1/multiz185way/multiz185way.maf ln -s `pwd`/defraged.multiz185way.maf /gbdb/hbv1/multiz185way/multiz185way.maf cd /dev/shm hgLoadMaf hbv1 multiz185way #########################################################################