ó
ù`]c           @   sH   d  Z  d d l Z d d l m Z d e f d „  ƒ  YZ e ƒ  j Z d S(   sê  
S-Expression Tokenizer

``SExprTokenizer`` is used to find parenthesized expressions in a
string.  In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.

    >>> from nltk.tokenize import SExprTokenizer
    >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']

By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:

    >>> SExprTokenizer().tokenize('c) d) e (f (g')
    Traceback (most recent call last):
      ...
    ValueError: Un-matched close paren at char 1

The ``strict`` argument can be set to False to allow for
non-matching parentheses.  Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:

    >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
    ['c', ')', 'd', ')', 'e', '(f (g']

The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:

    >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
    ['{a b {c d}}', 'e', 'f', '{g}']

The s-expression tokenizer is also available as a function:

    >>> from nltk.tokenize import sexpr_tokenize
    >>> sexpr_tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']

iÿÿÿÿN(   t
   TokenizerIt   SExprTokenizerc           B   s&   e  Z d  Z d e d „ Z d „  Z RS(   s\  
    A tokenizer that divides strings into s-expressions.
    An s-expresion can be either:

      - a parenthesized expression, including any nested parenthesized
        expressions, or
      - a sequence of non-whitespace non-parenthesis characters.

    For example, the string ``(a (b c)) d e (f)`` consists of four
    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

    By default, the characters ``(`` and ``)`` are treated as open and
    close parentheses, but alternative strings may be specified.

    :param parens: A two-element sequence specifying the open and close parentheses
        that should be used to find sexprs.  This will typically be either a
        two-character string, or a list of two strings.
    :type parens: str or list
    :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
    s   ()c         C   s~   t  | ƒ d k r! t d ƒ ‚ n  | |  _ | d |  _ | d |  _ t j d t j | d ƒ t j | d ƒ f ƒ |  _ d  S(   Ni   s'   parens must contain exactly two stringsi    i   s   %s|%s(	   t   lent
   ValueErrort   _strictt   _open_parent   _close_parent   ret   compilet   escapet   _paren_regexp(   t   selft   parenst   strict(    (    s2   lib/python2.7/site-packages/nltk/tokenize/sexpr.pyt   __init__O   s    	c         C   si  g  } d } d } x|  j  j | ƒ D]î } | j ƒ  } | d k ro | | | | j ƒ  !j ƒ  7} | j ƒ  } n  | |  j k r‹ | d 7} n  | |  j k r% |  j rÈ | d k rÈ t d | j ƒ  ƒ ‚ n  t	 d | d ƒ } | d k r| j
 | | | j ƒ  !ƒ | j ƒ  } qq% q% W|  j r?| d k r?t d | ƒ ‚ n  | t | ƒ k  re| j
 | | ƒ n  | S(   sQ  
        Return a list of s-expressions extracted from *text*.
        For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        All parentheses are assumed to mark s-expressions.
        (No special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.)

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the ``strict``
        parameter to the constructor.  If ``strict`` is ``True``, then
        raise a ``ValueError``.  If ``strict`` is ``False``, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial s-expression with unmatched open
        parentheses will be listed as its own s-expression:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        :param text: the string to be tokenized
        :type text: str or iter(str)
        :rtype: iter(str)
        i    i   s!   Un-matched close paren at char %ds    Un-matched open paren at char %d(   R
   t   finditert   groupt   startt   splitR   R   R   R   t   maxt   appendt   endR   (   R   t   textt   resultt   post   deptht   mt   paren(    (    s2   lib/python2.7/site-packages/nltk/tokenize/sexpr.pyt   tokenizeY   s,    (   t   __name__t
   __module__t   __doc__t   TrueR   R   (    (    (    s2   lib/python2.7/site-packages/nltk/tokenize/sexpr.pyR   9   s   
(   R   R   t   nltk.tokenize.apiR    R   R   t   sexpr_tokenize(    (    (    s2   lib/python2.7/site-packages/nltk/tokenize/sexpr.pyt   <module>2   s   S