ó
ªSIc           @   sŸ  d  Z  d d l m Z d d l m Z d d l m Z m Z m Z d d l m	 Z	 d e f d „  ƒ  YZ
 e d k r›d	 GHd
 Z d d l m Z e e
 e e ƒ ƒ ƒ Z e e ƒ d k sË t e e ƒ ƒ ‚ e e d j ƒ  ƒ d k sí t ‚ x_ e D]W Z d e e j ƒ  ƒ e j ƒ  f GHx- e D]% Z d e j e j e j d f GHq"Wqô Wd GHd d l Z d Z g  e j e ƒ D]( Z e j j e ƒ d d k rže ^ qvZ e j ƒ  xç e D]ß Z  e j j e  ƒ d d k r”He  GHd e e  ƒ GHx¢ e! e
 e" e j j# e e  ƒ ƒ ƒ ƒ D]y \ Z$ Z d e$ d e f GHxW e D]O Z d e j k rme j j% j& d k sjt ‚ n e' e j j% d ƒ s‰t ‚ q:WqWn  qµWn  d S(   s!  
Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools.

You are expected to use this module via the Bio.AlignIO functions (or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).

This module contains a parser for the pairwise alignments produced by Bill
Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is
refered to as the "fasta-m10" file format (as we only support the machine
readable output format selected with the -m 10 command line option).

This module does NOT cover the generic "fasta" file format originally
developed as an input format to the FASTA tools.  The Bio.AlignIO and
Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files,
which can also be used to store a multiple sequence alignments.
iÿÿÿÿ(   t	   Alignment(   t   AlignmentIterator(   t   single_letter_alphabett   generic_dnat   generic_protein(   t   Gappedt   FastaM10Iteratorc           B   s;   e  Z d  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   sq  Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This class is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10") :
            assert len(a.get_all_seqs()) == 2, "Should be pairwise!"
            print "Alignment length %i" % a.get_alignment_length()
            for record in a :
                print record.seq, record.name, record.id

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    Alignment objects returned.
    c         C   sí  |  j  } y |  j } |  ` Wn t k
 r; | j ƒ  } n X| sF d S| j d ƒ rg |  j | ƒ } n  xA d | k rª | j d ƒ rª d |  _ i  |  _ |  j	 | ƒ } qj W| sµ d Sd | k rÅ d S| d d !d k ré | d d k sõ t
 | ƒ ‚ g  g  } } i  i  } } d } i  } | d d !d k sB| d d	 !d k rQt d
 ƒ ‚ n  | d j ƒ  } | j ƒ  } |  j | | ƒ } | d d !d k s™t
 ‚ | d d k o»| j ƒ  j d ƒ sÍt d ƒ ‚ n  |  j j | d j d d ƒ d ƒ sùt
 ‚ | j ƒ  } |  j | | ƒ } | d d !d k s1t
 ‚ x3 | d d k sf| j | j ƒ  ƒ | j ƒ  } q4W| d d k o‰| j ƒ  j d ƒ s¥t d t | ƒ ƒ ‚ n  | j | d j d d ƒ d ƒ sÎt
 ‚ | j ƒ  } |  j | | ƒ } | d d !d k st
 ‚ xR | d d !d k p5| d d k p5d | k sZ| j | j ƒ  ƒ | j ƒ  } q	W| d d !d k r| j ƒ  d k s†t
 ‚ g  }	 | j ƒ  } xR | d d !d k pÇ| d d k pÇd | k sì|	 j | j ƒ  ƒ | j ƒ  } q›Wd j |	 ƒ }
 ~	 | d d !d k s"t
 ‚ n d }
 | d d k sDd | k sDt
 ‚ | |  _ d j | ƒ } d j | ƒ } ~ ~ |  j | | ƒ } |  j | | ƒ } t | ƒ t | ƒ k rØt d | t | ƒ | t | ƒ f ƒ ‚ n  d | k r&t | d ƒ t | ƒ k r&t d | d t | ƒ f ƒ ‚ q&n  |  j } t | ƒ } i  | _ x* |  j j ƒ  D] \ } } | | j | <qTWx' | j ƒ  D] \ } } | | j | <q~W| j |  j | ƒ | j ƒ  d } | j |  j k sè| j |  j k sèt
 ‚ |  j j d d ƒ d j d ƒ | _ d | _ t | d ƒ | j d <| t  k r†d | k r†| d d k rdt! | j" _ q†| d d k r†t# | j" _ q†n  d | k rÈt$ | j" j d ƒ sÈt% | j" j d ƒ | j" _ qÈn  | j | | ƒ | j ƒ  d } | j | k s| j | k st
 ‚ | j d d ƒ d j d ƒ | _ d | _ t | d ƒ | j d <| t  k r§d | k r§| d d k r…t! | j" _ q§| d d k r§t# | j" _ q§n  d | k rét$ | j" j d ƒ sét% | j" j d ƒ | j" _ qén  | S(    sÇ   Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows.t   #s   >>>t    s   >>><<<i    i   s   >>t   >i   s"   Expected target line starting '>>'s   ; s   ..s*   Expected line starting '>' and ending '..'i   s4   Expected line starting '>' and ending '..', got '%s's
   ; al_cons:s|   Problem parsing the alignment sequence coordinates, following should be the same length but are not:
%s - len %i
%s - len %it
   sw_overlaps:   Specified sw_overlap = %s does not match expected value %iiÿÿÿÿt   ,t   queryt   sq_lent   original_lengtht   sq_typet   Dt   pt   -t   gap_chart   matchN(&   t   handlet   _headert   AttributeErrort   readlinet   Nonet
   startswitht   _skip_file_headert   _query_descrt   _query_header_annotationt   _parse_query_headert   AssertionErrort
   ValueErrort   stript   _parse_tag_sectiont   endswitht   splitt   appendt   reprt   joint   _extract_alignment_regiont   lent   intt   alphabetR    t   _annotationst	   iteritemst   add_sequencet   get_all_seqst   idt   descriptiont   namet   annotationsR   R   t   seqR   t   hasattrR   (   t   selfR   t   linet   query_seq_partst   match_seq_partst   query_annotationt   match_annotationt   match_descrt   alignment_annotationt   align_consensus_partst   align_consensust	   query_seqt	   match_seqt   query_align_seqt   match_align_seqR+   t	   alignmentt   keyt   valuet   record(    (    s†   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/AlignIO/FastaIO.pyt   next=   sà    		
		0'%,%)	22"				*%	!$"	!c         C   s&   x d | k r! |  j  j ƒ  } q W| S(   s_   Helper function for the main parsing code.

        Skips over the file header region.
        s   >>>(   R   R   (   R6   R7   (    (    s†   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/AlignIO/FastaIO.pyR   !  s    c         C   s:  i  |  _  d |  _ d | k r2 | d d !d k s8 t ‚ |  j j ƒ  } xK | d d !d k s” |  j j ƒ  } | s t d ƒ ‚ n  d | k rJ | SqJ W| d d !d k s´ t | ƒ ‚ | d j ƒ  |  _ |  j j ƒ  } |  j | |  j  ƒ } | d d !d k st | ƒ ‚ | d d !d	 k s6d | k s6t | ƒ ‚ | S(
   sË   Helper function for the main parsing code.

        Skips over the free format query header, extracting the tagged parameters.

        If there are no hits for the current query, it is skipped entirely.R   s   >>>i    i   s   Premature end of file!s   >>><<<i   s   ; s   >>(   R   R   R   R   R   R    R!   R"   (   R6   R7   (    (    s†   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/AlignIO/FastaIO.pyR   7  s"    '		& +c         C   s  | j  d ƒ } t | d ƒ } t | d ƒ t | d ƒ k r{ t | d ƒ | } t | d ƒ | | j d ƒ d } n9 | t | d ƒ } | t | d ƒ | j d ƒ d } d | k rÞ | | k  rÞ | t | ƒ k sú t d | | | | f ƒ ‚ | | | !S(   sY  Helper function for the main parsing code.

        To get the actual pairwise alignment sequences, we must first
        translate the un-gapped sequence based coordinates into positions
        in the gapped sequence (which may have a flanking region shown
        using leading - characters).  To date, I have never seen any
        trailing flanking region shown in the m10 file, but the
        following code should also cope with that.

        Note that this code seems to work fine even when the "sq_offset"
        entries are prsent as a result of using the -X command line option.
        R   t   al_display_startt   al_startt   al_stopi   i    s.   Problem with sequence start/stop,
%s[%i:%i]
%s(   R!   R*   t   countR)   R   (   R6   t   alignment_seq_with_flankingt
   annotationt   align_strippedt   display_startt   startt   end(    (    s†   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/AlignIO/FastaIO.pyR(     s     (%-c         C   s{   | d d !d k s" t  d ƒ ‚ n  xR | d d !d k rv | d j ƒ  j d d ƒ \ } } | | | <|  j j ƒ  } q% W| S(   s  Helper function for the main parsing code.

        line - supply line just read from the handle containing the start of
               the tagged section.
        dictionary - where to record the tagged values

        Returns a string, the first line following the tagged section.i    i   s   ; s   Expected line starting '; 's   : i   (   R    R!   R$   R   R   (   R6   R7   t
   dictionaryt   tagRF   (    (    s†   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/AlignIO/FastaIO.pyR"   ±  s    "
(   t   __name__t
   __module__t   __doc__RH   R   R   R(   R"   (    (    (    s†   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/AlignIO/FastaIO.pyR      s   	ä		X	"t   __main__s   Running a quick self-tests¯  # /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
FASTA searches a protein or DNA sequence data bank
 version 34.26 January 12, 2007
Please cite:
 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

Query library NC_002127.faa vs NC_009649.faa library
searching NC_009649.faa library

  1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa
 vs  NC_009649.faa library

  45119 residues in   180 sequences
  Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273
 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1  B-trim: 9 in 1/25
 Lambda= 0.175043

FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
 join: 36, opt: 24, open/ext: -10/-2, width:  16
 Scan time:  0.000
The best scores are:                                      opt bits E(180)
gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931)   71 24.9    0.58
gi|152973588|ref|YP_001338639.1| F pilus assembly  ( 459)   63 23.1    0.99

>>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library
; pg_name: /opt/fasta/fasta34
; pg_ver: 34.26
; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
; pg_name: FASTA
; pg_ver: 3.5 Sept 2006
; pg_matrix: BL50 (15:-5)
; pg_open-ext: -10 -2
; pg_ktup: 2
; pg_optcut: 24
; pg_cgap: 36
; mp_extrap: 60000 180
; mp_stats:  Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273  mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1  B-trim: 9 in 1/25  Lambda= 0.175043
; mp_KS: -0.0000 (N=0) at 8159228
>>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
; fa_frame: f
; fa_initn:  65
; fa_init1:  43
; fa_opt:  71
; fa_z-score: 90.3
; fa_bits: 24.9
; fa_expect:   0.58
; sw_score: 71
; sw_ident: 0.250
; sw_sim: 0.574
; sw_overlap: 108
>gi|10955263| ..
; sq_len: 107
; sq_offset: 1
; sq_type: p
; al_start: 5
; al_stop: 103
; al_display_start: 1
--------------------------MTKRSGSNT-RRRAISRPVRLTAE
ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM-----
RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD
>gi|152973457|ref|YP_001338508.1| ..
; sq_len: 931
; sq_type: p
; al_start: 96
; al_stop: 195
; al_display_start: 66
SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ
SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI
KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY
VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ
>>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
; fa_frame: f
; fa_initn:  33
; fa_init1:  33
; fa_opt:  63
; fa_z-score: 86.1
; fa_bits: 23.1
; fa_expect:   0.99
; sw_score: 63
; sw_ident: 0.266
; sw_sim: 0.656
; sw_overlap: 64
>gi|10955263| ..
; sq_len: 107
; sq_offset: 1
; sq_type: p
; al_start: 32
; al_stop: 94
; al_display_start: 2
TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK
VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL
LSRLMAD
>gi|152973588|ref|YP_001338639.1| ..
; sq_len: 459
; sq_type: p
; al_start: 191
; al_stop: 248
; al_display_start: 161
VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK
ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG
ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT
SNKALKSQISALLSSIQNKAVADEKLTDQE
  2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
 vs  NC_009649.faa library

  45119 residues in   180 sequences
  Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313
 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1  B-trim: 8 in 1/25
 Lambda= 0.179384

FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
 join: 36, opt: 24, open/ext: -10/-2, width:  16
 Scan time:  0.000
The best scores are:                                      opt bits E(180)
gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101)   58 22.9    0.29

>>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
; pg_name: /opt/fasta/fasta34
; pg_ver: 34.26
; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
; pg_name: FASTA
; pg_ver: 3.5 Sept 2006
; pg_matrix: BL50 (15:-5)
; pg_open-ext: -10 -2
; pg_ktup: 2
; pg_optcut: 24
; pg_cgap: 36
; mp_extrap: 60000 180
; mp_stats:  Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313  mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1  B-trim: 8 in 1/25  Lambda= 0.179384
; mp_KS: -0.0000 (N=0) at 8159228
>>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
; fa_frame: f
; fa_initn:  50
; fa_init1:  50
; fa_opt:  58
; fa_z-score: 95.8
; fa_bits: 22.9
; fa_expect:   0.29
; sw_score: 58
; sw_ident: 0.289
; sw_sim: 0.632
; sw_overlap: 38
>gi|10955264| ..
; sq_len: 126
; sq_offset: 1
; sq_type: p
; al_start: 1
; al_stop: 38
; al_display_start: 1
------------------------------MKKDKKYQIEAIKNKDKTLF
IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF
NGEKFSSYTLNKVTKTDEYN
>gi|152973462|ref|YP_001338513.1| ..
; sq_len: 101
; sq_type: p
; al_start: 44
; al_stop: 81
; al_display_start: 14
DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI
VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA
  3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa
 vs  NC_009649.faa library

  45119 residues in   180 sequences
  Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461
 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1  B-trim: 14 in 1/25
 Lambda= 0.210386

FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
 join: 37, opt: 25, open/ext: -10/-2, width:  16
 Scan time:  0.020
The best scores are:                                      opt bits E(180)
gi|152973545|ref|YP_001338596.1| putative plasmid  ( 242)   70 27.5   0.082

>>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library
; pg_name: /opt/fasta/fasta34
; pg_ver: 34.26
; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
; pg_name: FASTA
; pg_ver: 3.5 Sept 2006
; pg_matrix: BL50 (15:-5)
; pg_open-ext: -10 -2
; pg_ktup: 2
; pg_optcut: 25
; pg_cgap: 37
; mp_extrap: 60000 180
; mp_stats:  Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461  mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1  B-trim: 14 in 1/25  Lambda= 0.210386
; mp_KS: -0.0000 (N=0) at 8159228
>>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
; fa_frame: f
; fa_initn:  52
; fa_init1:  52
; fa_opt:  70
; fa_z-score: 105.5
; fa_bits: 27.5
; fa_expect:  0.082
; sw_score: 70
; sw_ident: 0.279
; sw_sim: 0.651
; sw_overlap: 43
>gi|10955265| ..
; sq_len: 346
; sq_offset: 1
; sq_type: p
; al_start: 197
; al_stop: 238
; al_display_start: 167
DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
GEYFTENKPKYIIREIHQET
>gi|152973545|ref|YP_001338596.1| ..
; sq_len: 242
; sq_type: p
; al_start: 52
; al_stop: 94
; al_display_start: 22
IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
QDFAFTRKMRREARQVEQSW
>>><<<


579 residues in 3 query   sequences
45119 residues in 180 library sequences
 Scomplib [34.26]
 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
 Total Scan time:  0.020 Total Display time:  0.010

Function used was FASTA [version 34.26 January 12, 2007]

(   t   StringIOi   i    i   s#   Alignment %i sequences of length %is   %s %s %iR   t   DoneNs   ../../Tests/Fasta/s   .m10t   =s   #%i, %si   R   R   ((   RW   t   Bio.Align.GenericR    t
   InterfacesR   t   Bio.AlphabetR   R   R   R   R   RU   t   simple_exampleRY   t   listt
   alignmentsR)   R   R/   t   at   get_alignment_lengtht   rR4   R0   R3   t   ost   patht   listdirt   ft   splitextt   filest   sortt   filenamet	   enumeratet   openR'   t   iR+   R   R5   (    (    (    s†   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/AlignIO/FastaIO.pyt   <module>   sD   ÿ «ê$"'>
4