# for emacs: -*- mode: sh; -*- mkdir -p /hive/data/genomes/eboVir3/bed/multiz160way cd /hive/data/genomes/eboVir3/bed/multiz160way # construct a species list ordered by number of words in the psl pair # alignment results: echo KM034562v1 > species.list for D in ../../lastz160way/KM034562v1/pairs/* do echo -n "${D}: "; zcat ${D}/pslParts/part000.lst.psl.gz \ | grep -v "^#" | wc done | sort -k4n | sed -e 's#../../lastz160way/KM034562v1/pairs/##;' \ | sed -e 's/:.*//; s/marVir1.//;' | grep -v KM034562v1 >> species.list # verify 160 count sort -u species.list | wc -l # 160 # construct 160way.nh tree from that species.list, beginning as so, with the first two as the first pair: (((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((KM034562v1:0.1,KJ660346v2:0.1):0.1, KJ660347v2:0.1):0.1, KJ660348v2:0.1):0.1, KM034554v1:0.1):0.1, ... KC545391v1:0.1):0.1, KC545392v1:0.1):0.1, JN638998v1:0.1):0.1, NC_024781v1:0.1):0.1, NC_001608v3:0.1); # construct tree with just the names for multiz: sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 160way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh # create mafLinks: mkdir -p /hive/data/genomes/eboVir3/bed/multiz160way/mafLinks cd /hive/data/genomes/eboVir3/bed/multiz160way/mafLinks # one with eboVir3 db sneaked in after this was all moved over from # eboVir2, the sed catches both ls ../../lastz160way/KM034562v1/pairs | while read D do onePair="../../lastz160way/KM034562v1/pairs/${D}" if [ -d "${onePair}" ]; then mafNet=`ls ${onePair}/mafNet/*.maf.gz 2> /dev/null` if [ ! -f "${mafNet}" ]; then echo "ERROR: can not find mafNet file ${mafNet}" 1>&2 exit 255 fi D=`echo $D | sed -e 's/marVir1.//'` echo ${D} ${mafNet} 1>&2 zcat ${mafNet} \ | sed -e "s#^s eboVir2.${D}#s ${D}.${D}#; s#s eboVir2.KM034562v1#s KM034562v1.KM034562v1#; s#^s eboVir3.${D}#s ${D}.${D}#; s#s eboVir3.KM034562v1#s KM034562v1.KM034562v1#;" \ | gzip -c > KM034562v1.${D}.maf.gz fi done # remove the self alignment: rm KM034562v1.KM034562v1.maf.gz # leaves 159 files: ls *.maf.gz | wc -l # 159 # scan the names to verify sanity: zcat *.maf.gz | grep "^s " | awk '{print $2}' | sort \ | uniq -c | sort -rn | less # should look like: # 262 KM034562v1.KM034562v1 # 11 NC_024781v1.NC_024781v1 # 11 NC_001608v3.NC_001608v3 # 5 NC_014372v1.NC_014372v1 # 5 NC_004161v1.NC_004161v1 # 5 KC589025v1.KC589025v1 # 5 JN638998v1.JN638998v1 # ... # 1 HQ613402v1.HQ613402v1 # 1 EU224440v2.EU224440v2 # 1 AY354458v1.AY354458v1 # 1 AY142960v1.AY142960v1 # 1 AF499101v1.AF499101v1 # 1 AF272001v1.AF272001v1 # 1 AF086833v2.AF086833v2 cd /hive/data/genomes/eboVir3/bed/multiz160way mkdir run maf cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn ls ../mafLinks | sed -e 's/.maf.gz//; s/KM034562v1.//' > maf.list cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/eboVir3/bed/multiz160way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > autoMultiz.csh #!/bin/csh -ef set db = KM034562v1 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/eboVir3/bed/multiz160way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../tree.nh ../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/grep -v "$db" species.list`) set in = $pairs/$db.$s.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result.maf /bin/cp -p $tmp/$c $result.maf /bin/rm -fr $tmp '_EOF_' # << happy emacs gensub2 maf.list single template jobList para create jobList para try ... check ... push para time # Completed: 159 of 159 jobs # CPU time in finished jobs: 217592s 3626.54m 60.44h 2.52d 0.007 y # IO & Wait Time: 961s 16.01m 0.27h 0.01d 0.000 y # Average job time: 1375s 22.91m 0.38h 0.02d # Longest finished job: 1502s 25.03m 0.42h 0.02d # Submission to last job: 1514s 25.23m 0.42h 0.02d # put the results back together into a single file: head -1 maf/AB050936v1.maf > multiz160way.maf for F in maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} | sed -e 's#^s \([A-Z0-9a-z_]*\)#s \1.\1#;' done | sed -e 's#^s KM034562v1.KM034562v1#s eboVir3.KM034562v1#;' \ >> multiz160way.maf tail -1 maf/AB050936v1.maf >> multiz160way.maf # scan names to verify sanity: grep "^s " multiz160way.maf | awk '{print $2}' | sort | uniq -c \ | sort -rn | less # 27030 eboVir3.KM034562v1 # 27030 NC_002549v1.NC_002549v1 # 27030 KM233113v1.KM233113v1 # 27030 KM233109v1.KM233109v1 # 27030 KM233070v1.KM233070v1 # ... # 18126 KM233060v1.KM233060v1 # 17490 KM034563v1.KM034563v1 # 17172 KM233117v1.KM233117v1 # 16854 KM233118v1.KM233118v1 # 15105 FJ621585v1.FJ621585v1 # 6201 NC_024781v1.NC_024781v1 # 6201 NC_001608v3.NC_001608v3 mkdir /gbdb/eboVir3/multiz160way rm -f /gbdb/eboVir3/multiz160way/multiz160way.maf ln -s `pwd`/multiz160way.maf \ /gbdb/eboVir3/multiz160way/multiz160way.maf cd /dev/shm time hgLoadMaf eboVir3 multiz160way # Loaded 27030 mafs in 1 files from /gbdb/eboVir3/multiz160way # real 0m8.057s # merge that into a single block: cd /hive/data/genomes/eboVir3/bed/multiz160way mafFrag eboVir3 multiz160way KM034562v1 0 18957 + mafFrag.multiz160way.maf # change dots to dash cat << '_EOF_' > dotToDash.pl #!/usr/bin/env perl use strict; use warnings; my $file = shift; open (FH, "<$file") or die "can not read $file"; while (my $line = ) { if ($line =~ m/^s /) { chomp $line; my @a = split('\s+', $line); if (scalar(@a) == 7) { if ($a[1] !~ m/eboVir3/) { $a[1] = "$a[1].$a[1]"; } $a[6] =~ s/\./-/g; print join(' ', @a), "\n"; } else { die "ERROR: s line found not 7 fields ?"; } } else { printf "%s", $line; } } close (FH); '_EOF_' # << happy emacs ./dotToDash.pl mafFrag.multiz160way.maf > defraged.multiz160way.maf # and reload: rm /gbdb/eboVir3/multiz160way/multiz160way.maf ln -s `pwd`/defraged.multiz160way.maf \ /gbdb/eboVir3/multiz160way/multiz160way.maf cd /dev/shm time hgLoadMaf eboVir3 multiz160way # Loaded 1 mafs in 1 files from /gbdb/eboVir3/multiz160way # real 0m0.054s # construct strain-name track cat defraged.multiz160way.maf \ | sed -f ~/kent/src/hg/makeDb/doc/eboVir3/ucscName.strainName.sed \ | sed -e 's/eboVir3.G3686v1_2014/eboVir3.KM034562v1/' \ > strainName.multiz160way.maf mkdir /gbdb/eboVir3/strainName160way rm /gbdb/eboVir3/strainName160way/strainName160way.maf ln -s `pwd`/strainName.multiz160way.maf \ /gbdb/eboVir3/strainName160way/strainName160way.maf cd /dev/shm time hgLoadMaf eboVir3 strainName160way