from __future__ import unicode_literals, print_function

import tempfile
import os
import json
import warnings
from subprocess import PIPE

from six import text_type

from nltk import compat
from nltk.internals import (find_jar, find_file, find_dir,
                            config_java, java, _java_options)
from nltk.tokenize.api import TokenizerI

_stanford_url = 'https://nlp.stanford.edu/software'


class StanfordSegmenter(TokenizerI):
    """Interface to the Stanford Segmenter

    If the stanford-segmenter version is older than 2016-10-31, then
    path_to_slf4j should be provided, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    >>> seg = StanfordSegmenter()
    >>> seg.default_config('zh')
    >>> sent = u'这是斯坦福中文分词器测试'
    >>> print(seg.segment(sent))
    这 是 斯坦福 中文 分词器 测试
    <BLANKLINE>
    >>> seg.default_config('ar')
    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
    >>> print(seg.segment(sent.split()))
    هذا هو تصنيف ستانفورد العربي ل الكلمات
    <BLANKLINE>
    """

    _JAR = 'stanford-segmenter.jar'

    def __init__(self, path_to_jar=None, path_to_slf4j=None, java_class=None,
                 path_to_model=None, path_to_dict=None,
                 path_to_sihan_corpora_dict=None, sihan_post_processing='false',
                 keep_whitespaces='false', encoding='UTF-8', options=None,
                 verbose=False, java_options='-mx2g'):
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(str("\nThe StanfordTokenizer will be deprecated in version 3.2.5.\n"
                          "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."),
                      DeprecationWarning, stacklevel=2)
        warnings.simplefilter('ignore', DeprecationWarning)

        stanford_segmenter = find_jar(
            self._JAR, path_to_jar, env_vars=('STANFORD_SEGMENTER',),
            searchpath=(), url=_stanford_url, verbose=verbose)
        if path_to_slf4j is not None:
            slf4j = find_jar(
                'slf4j-api.jar', path_to_slf4j, env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
                searchpath=(), url=_stanford_url, verbose=verbose)
        else:
            slf4j = None

        # Both jars go on the java classpath; releases older than
        # stanford-segmenter-2016-10-31 additionally need slf4j.
        self._stanford_jar = os.pathsep.join(
            _ for _ in [stanford_segmenter, slf4j] if _ is not None)

        self._java_class = java_class
        self._model = path_to_model
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces
        self._dict = path_to_dict
        self._encoding = encoding
        self.java_options = java_options
        options = {} if options is None else options
        self._options_cmd = ','.join(
            '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())

    def default_config(self, lang):
        """
        Attempt to initialize the Stanford Word Segmenter for the specified
        language, using the STANFORD_SEGMENTER and STANFORD_MODELS
        environment variables.
        """
        search_path = ()
        if os.environ.get('STANFORD_SEGMENTER'):
            search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}

        # Reset the Chinese-specific settings before configuring.
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = 'false'

        if lang == 'ar':
            self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
            model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
        elif lang == 'zh':
            self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
            model = 'pku.gz'
            self._sihan_post_processing = 'true'
            path_to_dict = 'dict-chris6.ser.gz'
            try:
                self._dict = find_file(path_to_dict, searchpath=search_path,
                                       url=_stanford_url, verbose=False,
                                       env_vars=('STANFORD_MODELS',))
            except LookupError:
                raise LookupError("Could not find '%s' (tried using env. variables "
                                  "STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)
            sihan_dir = './data/'
            try:
                path_to_sihan_dir = find_dir(sihan_dir, url=_stanford_url, verbose=False,
                                             env_vars=('STANFORD_SEGMENTER',))
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError:
                raise LookupError("Could not find '%s' (tried using the "
                                  "STANFORD_SEGMENTER environment variable)" % sihan_dir)
        else:
            raise LookupError("Unsupported language {}".format(lang))

        try:
            self._model = find_file(model, searchpath=search_path,
                                    url=_stanford_url, verbose=False,
                                    env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER'))
        except LookupError:
            raise LookupError("Could not find '%s' (tried using env. variables "
                              "STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)

    def tokenize(self, s):
        super(StanfordSegmenter, self).tokenize(s)

    def segment_file(self, input_file_path):
        """Segment the contents of a file on disk."""
        cmd = [self._java_class,
               '-loadClassifier', self._model,
               '-keepAllWhitespaces', self._keep_whitespaces,
               '-textFile', input_file_path]
        if self._sihan_corpora_dict is not None:
            cmd.extend(['-serDictionary', self._dict,
                        '-sighanCorporaDict', self._sihan_corpora_dict,
                        '-sighanPostProcessing', self._sihan_post_processing])
        return self._execute(cmd)

    def segment(self, tokens):
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """Segment a list of sentences, each given as a list of tokens."""
        encoding = self._encoding
        # Write the sentences to a temporary input file, one sentence per line.
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)
        _input_fh = os.fdopen(_input_fh, 'wb')
        _input = '\n'.join(' '.join(x) for x in sentences)
        if isinstance(_input, text_type) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        cmd = [self._java_class,
               '-loadClassifier', self._model,
               '-keepAllWhitespaces', self._keep_whitespaces,
               '-textFile', self._input_file_path]
        if self._sihan_corpora_dict is not None:
            cmd.extend(['-serDictionary', self._dict,
                        '-sighanCorporaDict', self._sihan_corpora_dict,
                        '-sighanPostProcessing', self._sihan_post_processing])
        stdout = self._execute(cmd)

        # Delete the temporary input file.
        os.unlink(self._input_file_path)
        return stdout

    def _execute(self, cmd, verbose=False):
        encoding = self._encoding
        cmd.extend(['-inputEncoding', encoding])
        if self._options_cmd:
            cmd.extend(['-options', self._options_cmd])

        default_options = ' '.join(_java_options)

        # Configure java, run the segmenter, then restore the default options.
        config_java(options=self.java_options, verbose=verbose)
        stdout, _stderr = java(cmd, classpath=self._stanford_jar,
                               stdout=PIPE, stderr=PIPE)
        stdout = stdout.decode(encoding)
        config_java(options=default_options, verbose=False)
        return stdout


def setup_module(module):
    from nose import SkipTest

    try:
        seg = StanfordSegmenter()
        seg.default_config('ar')
        seg.default_config('zh')
    except LookupError as e:
        raise SkipTest('Tests for nltk.tokenize.stanford_segmenter skipped: %s' % e)
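

# Usage sketch (not part of the original module): the doctests above assume a
# local Stanford Segmenter installation, with STANFORD_SEGMENTER and
# STANFORD_MODELS pointing at the unpacked distribution and its data/ folder.
# The paths below are hypothetical placeholders; adjust them to your setup.
if __name__ == '__main__':
    os.environ.setdefault('STANFORD_SEGMENTER', '/opt/stanford-segmenter')
    os.environ.setdefault('STANFORD_MODELS', '/opt/stanford-segmenter/data')

    seg = StanfordSegmenter()
    seg.default_config('zh')
    # Prints the space-delimited segmentation produced by the Java process.
    print(seg.segment(u'这是斯坦福中文分词器测试'))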