# ***** BEGIN LICENSE BLOCK ***** # Version: MPL 2.0 # # The contents of this file are subject to the Mozilla Public License Version # 2.0 (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # http://www.mozilla.org/MPL/2.0/ # # Software distributed under the License is distributed on an "AS IS" basis, # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License # for the specific language governing rights and limitations under the # License. # # The Original Code is OMA standalone. # # The Initial Developer of the Original Code is CBRG Research Group; # ETH Zurich; Switzerland. # Portions created by the Initial Developer are Copyright (C) 2005-2013 # the Initial Developer. All Rights Reserved. # # Contributor(s): # Christophe Dessimoz # Adrian Altenhoff # Stefan Zoller # Adrian Schneider # Alexander Roth # Gaston Gonnet # # ***** END LICENSE BLOCK ***** ############################################################################## # Scaffold verification # # - go over all genomes marked as being scaffolds # - look for best matches in all other genomes # - if # of genomes with many:1 matches > threshold = proportion * nb of genomes # (keep track of the number of matches # - now make sure that these matches do not overlap with more than MaxOverlap # - if that's the case, we have a candidate split gene!! Esprit := proc(params) global DB, DefContigHits, PossContigHits, ContigHits; # parse input params MinProbContig := params[1]: MinBestScore := params[2]: DistConfLevel := params[3]: AllowMultipleHits := params[4]: MaxContigOverlap := params[5]: # start here thresh1 := MinProbContig*length(SearchAllArray(false,isContig)): thresh2 := (1-MinProbContig)*length(SearchAllArray(false,isContig)): ContigHits := []: for i to NG do if isContig[i] then DB := DBs[i]; for s1 to ns[i] do # here, we keep track of potential corresponding fragments t := table(0): tpairs := table([],[]): c := 0; for j to NG do if isContig[j] then next fi; lists2 := decompress(BestMatch[i,j,s1]): if (AllowMultipleHits and length(lists2) >= 1) or length(lists2) = 1 then tmpt := table(false); for s2Iter in lists2 do r1 := r3 := NULL: if s2Iter['Score100'] < (MinBestScore*100) then next fi; s2 := s2Iter['Entry']: s1Pair := s2Iter; r1 := s1Pair['r2']; # back match for s3Iter in decompress(BestMatch[j,i,s2]) do if s3Iter['Score100'] < (MinBestScore*100) then next fi; s3 := s3Iter['Entry']; if s3<=s1 then next fi; s3Pair := s3Iter: r3 := s3Pair['r1']: # some restrictions # at this point, we need to see if the two matches # (s1,s2) and (s3,s2) overlap as much as # ensure that r1,r3 does not overlap more than # MaxContigOverlap additiveLength := r1[2]-r1[1]+r3[2]-r3[1]+2; effectiveLength := max(r1[1],r1[2],r3[1],r3[2]) - min(r1[1], r1[2], r3[1], r3[2])+1; if (additiveLength - effectiveLength) <= MaxContigOverlap then # ensure that distances are not too distant :) s3Dist := s3Pair['PamDist10000']/10000: s1Dist := s1Pair['PamDist10000']/10000: s3Var := s3Pair['PamVar100000']/100000: s1Var := s1Pair['PamVar100000']/100000: deltaDist := abs(s3Dist - s1Dist): deltaDev := sqrt(s3Var + s1Var): if (deltaDist < deltaDev*DistConfLevel) then tmpt[s3] := true; gap := -(additiveLength - effectiveLength); if (r1[1] - r3[1]) > 0 then corder := [2,1]; else corder := [1,2]; fi: tpairs[s3] := append(tpairs[s3], [j, s2, deltaDist/deltaDev, gap, corder]): fi; fi; od; od: for k in Indices(tmpt) do t[k] := t[k]+1: od; else c := c+1; fi; # in this case, there is no way the condition will ever be # satisfied for s1 if c > thresh2 then break fi; od; if c <= thresh2 then # okay, let's look at our table of potential candidates: # first, count how many s3 we have c3 := 0; for x in Indices(t) do if t[x] >= thresh1 then c3 := c3+1; fi od; # second, store them for x in Indices(t) do if t[x] >= thresh1 then ContigHits := append(ContigHits, [[i,s1],[i,x],tpairs[x],c3]); fi; od; fi; if mod(s1,5000)=0 then printf('Completed %d/%d=%.2f%%\n',s1,ns[i],100*s1/ns[i]); fi; od; fi; od; print('Postprocessing...'); # put hits with same s1, s3 in one single hit tmpTable := table([],[]); for i in ContigHits do tmpTable[i[1..2]] := append(tmpTable[i[1..2]], i[3..-1]); od: tmp := []; for z in Indices(tmpTable) do newhit := [op(z),[],0]; el := tmpTable[z]: for j to length(el) do newhit[3] := append(newhit[3], op(el[j,1])); newhit[4] := newhit[4]+el[j,2]; od: newhit[4] := newhit[4]/length(el); tmp := append(tmp, newhit); od: ContigHits := tmp; # search for hits where a sequence also occurs in other hits and flag them # in ContigHits; create PossContigHits and DefContigHits tmpTable := table([],[]); for i to length(ContigHits) do tmpTable[ContigHits[i,1]] := append(tmpTable[ContigHits[i,1]],i); tmpTable[ContigHits[i,2]] := append(tmpTable[ContigHits[i,2]],i); od; DefContigHits := []: PossContigHits := []: for i in ContigHits do if length(tmpTable[i[1]]) > 1 or length(tmpTable[i[2]]) > 1 then PossContigHits := append(PossContigHits, i): else DefContigHits := append(DefContigHits, i): fi: od: return(DefContigHits,PossContigHits); end; module external WriteEspritResults; ##################################################################### # # Methods for writing the esprit results # # Parses array of contig hits and produces # new array with, foreach element: # [seqs, labels, stdfacs, species, gaps, corder, s3set] ParseContigHits := proc(arr) global DB; hits := []: # first, get labels and seq-data of found hits for i to length(arr) do seqs := []: labels := []: stdfacs := []: gaps := []; species := []: corder := []: DB := DBs[arr[i,1,1]]: if arr[i,3,1,5] = [1,2] then first := 1; second := 2; else first := 2; second := 1; fi: seqs := append(seqs, string(SearchTag('SEQ', Entry(arr[i,first,2])))): labels := append(labels, string(SearchTag('ID', Entry(arr[i,first,2])))): seqs := append(seqs, string(SearchTag('SEQ', Entry(arr[i,second,2])))): labels := append(labels, string(SearchTag('ID', Entry(arr[i,second,2])))): for j to length(arr[i,3]) do DB := DBs[arr[i,3,j,1]]; seqs := append(seqs, string(SearchTag('SEQ', Entry(arr[i,3,j,2])))): labels := append(labels, string(SearchTag('ID', Entry(arr[i,3,j,2])))): stdfacs := append(stdfacs, string(arr[i,3,j,3])); gaps := append(gaps, string(arr[i,3,j,4])); species := append(species, genomes[arr[i,3,j,1]]); corder := append(corder, arr[i,3,j,5]); od: s3set := []: for e in arr[i,4] do DB := DBs[arr[i,1,1]]; s3set := append(s3set, SearchDelim(';', SearchTag('ID', Entry(e)))[1]); od: hits := append(hits, [seqs, labels, stdfacs, species, gaps, corder, s3set ]): od: # do some polishing on the labels len := length(hits): for i to len do for j to length(hits[i,2]) do t := SearchDelim(';', hits[i,2,j]): hits[i,2,j] := t[1]: od: od: return(hits); end: #################################################### # # Writes parameters and metainformation to files in # folder 'EspritOutput'; additionally, also writes all # seqs in fasta format in a tarball. # WriteEspritResults := proc() global DB, PossContigHits, DefContigHits: outFolder := 'EspritOutput/'; if length(FileStat(outFolder)) = 0 then TimedCallSystem('mkdir -p '.outFolder); fi: # Find contig genome in Datasets contigdata := '': for i to length(genomes) do if isContig[i] then contigdata := genomes[i]: break: fi: od: printf('Write results: params.txt\n'); OpenWriting(outFolder.'params.txt'): printf('MinSeqLenContig := %a;\nMinProbContig := %a;\nMaxContigOverlap := %a;\n' .'MinBestScore := %a;\nDistConfLevel := %a;\nAllowMultipleHits := %a;\n' .'Datasets := %a;\nStablePairTol := %a;\n\n', MinSeqLenContig, MinProbContig, MaxContigOverlap, MinBestScore, DistConfLevel, AllowMultipleHits, G, StablePairTol); # total amount of hits, number of inconclusive hits totalHits := length(ContigHits): inconclusive := length(PossContigHits): if totalHits = 0 then if inconclusive > 0 then inconPerc := 100 else inconPerc := 0: fi: else inconPerc := 100*inconclusive/totalHits: fi: printf('%a hits in total; %a (%2.2f%%) are not conclusive ("dubious").\n\n', totalHits, inconclusive, inconPerc); OpenWriting(previous): # method to write proper file format WriteResultFile := proc(filename, hits, fastafolder) OpenWriting(filename); printf('# contigs are ordered according to their position relative to the putative ortholog\n'); printf('# see for fasta files in folder "fasta/%a"\n', fastafolder): printf('## fragment-pair\tfirst fragment\tsecond fragment\tcorresponding full gene\tspecies\tdistance difference between fragments\t# positions between fragments\ts3 IDs\n'); for k to length(hits) do i := hits[k]; for j to (length(i[2])-2) do printf('%a\t%a\t%a\t%a\t%a\t%a\t%a\t%a', k, i[2,1], i[2,2], i[2,2+j], i[4,j], i[3,j], i[5,j], i[7]); printf('\n'); od: od: OpenWriting(previous); end: # method to write a simple results file of hits WriteSimpleResultFile := proc(filename) global DB: OpenWriting(filename); printf('# contigs are ordered according to their position relative to the putative ortholog\n'); printf('#first fragment\tsecond fragment\tlist of reference genes separated with semi-colons\n'); for z in DefContigHits do DB := DBs[z[1,1]]; id1arr := SearchDelim('|', SearchTag('ID', Entry(z[1,2]))): id1 := id1arr[2] . '|' . id1arr[3]: id1 := SearchDelim(';', id1)[1]: id2arr := SearchDelim('|', SearchTag('ID', Entry(z[2,2]))): id2 := id2arr[2] . '|' . id2arr[3]: id2 := SearchDelim(';', id2)[1]: if z[3,1,-1,1] = 2 then t := id1; id1 := id2; id2 := t; fi; printf('%a\t', id1): printf('%a\t', id2): if length(z[3]) > 1 then for i from 1 to length(z[3])-1 do DB := DBs[z[3,i,1]]: id := SearchDelim('|', SearchTag('ID', Entry(z[3,i,2])))[-1]: id := SearchDelim(';', id)[1]: printf('%a:%a; ', genomes[z[3,i,1]], id); od: fi: DB := DBs[z[3,length(z[3]),1]]: id := SearchDelim('|', SearchTag('ID', Entry(z[3,length(z[3]),2])))[-1]: id := SearchDelim(';', id)[1]: printf('%a:%a\n', genomes[z[3,length(z[3]),1]], id): od: OpenWriting(previous); end: printf('Write results: hits.txt\n'); defHits := ParseContigHits(DefContigHits): WriteResultFile(outFolder.'hits.txt', defHits, 'defhits'); printf('Write results: dubious.txt\n'); possHits := ParseContigHits(PossContigHits): WriteResultFile(outFolder.'dubious.txt', possHits, 'posshits'); actuallyWriteFasta := proc(arr, fastaset) CallSystem('mkdir -p '.outFolder.'fasta/'.fastaset): limit := min(50, length(arr)): for j to limit do i := arr[j]: seqs := i[1]: labels := i[2]: # try to create MSA m := traperror(MafftMSA(seqs, labels)): if not type(m,string) then seqs := m[2]: fi: # some postprocessing of strings for s in seqs do s := ReplaceString('_', '-', s); od: WriteFasta(seqs, labels, outFolder.'fasta/'.fastaset.'/'.j.'.fa'); od: end: # write fasta files and/or MSAs and make tarball printf('Write results: fasta files\n'); actuallyWriteFasta(defHits, 'defhits'): actuallyWriteFasta(possHits, 'posshits'): TimedCallSystem('cd '.outFolder.' && tar cvzf fasta.tgz fasta'); TimedCallSystem('rm -rf '.outFolder.'fasta'); return(); end: end: # module