ó
ů`]c           @  sb   d  Z  d d l m Z d d l Z d d l m Z d d l m Z e d e f d     Y Z d S(   u   
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
i˙˙˙˙(   t   unicode_literalsN(   t   StemmerI(   t   python_2_unicode_compatiblet   LancasterStemmerc        s   B  sn   e  Z d  Z d} Z d~ e dt  Z d~ du  Z dv   Z dw   Z	 dx   Z
 dy   Z dz   Z d{   Z d|   Z RS(   u/  
    Lancaster Stemmer

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum')     # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear')         # ditto
        'ear'
        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying')      #     letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string')      # ditto
        'string'
        >>> st.stem('meant')       # ditto
        'meant'
        >>> st.stem('cement')      # ditto
        'cem'
        >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
        >>> st_pre.stem('kilometer') # Test Prefix
        'met'
        >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
        >>> st_custom.stem("ness") # Change s to t
        'nest'
    u   ai*2.u   a*1.u   bb1.u   city3s.u   ci2>u   cn1t>u   dd1.u   dei3y>u   deec2ss.u   dee1.u   de2>u   dooh4>u   e1>u   feil1v.u   fi2>u   gni3>u   gai3y.u   ga2>u   gg1.u   ht*2.u	   hsiug5ct.u   hsi3>u   i*1.u   i1y>u   ji1d.u   juf1s.u   ju1d.u   jo1d.u   jeh1r.u   jrev1t.u   jsim2t.u   jn1d.u   j1s.u   lbaifi6.u   lbai4y.u   lba3>u   lbi3.u   lib2l>u   lc1.u   lufi4y.u   luf3>u   lu2.u   lai3>u   lau3>u   la2>u   ll1.u   mui3.u   mu*2.u   msi3>u   mm1.u   nois4j>u   noix4ct.u   noi3>u   nai3>u   na2>u   nee0.u   ne2>u   nn1.u   pihs4>u   pp1.u   re2>u   rae0.u   ra2.u   ro2>u   ru2>u   rr1.u   rt1>u   rei3y>u   sei3y>u   sis2.u   si2>u   ssen4>u   ss0.u   suo3>u   su*2.u   s*1>u   s0.u	   tacilp4y.u   ta2>u   tnem4>u   tne3>u   tna3>u   tpir2b.u   tpro2b.u   tcud1.u   tpmus2.u   tpec2iv.u   tulo2v.u   tsis0.u   tsi3>u   tt1.u   uqi3.u   ugo1.u   vis3j>u   vie0.u   vi2>u   ylb1>u   yli3y>u   ylp0.u   yl2>u   ygo1.u   yhp1.u   ymo1.u   ypo1.u   yti3>u   yte3>u   ytl2.u   yrtsi5.u   yra3>u   yro3>u   yfi3.u   ycn2t>u   yca3>u   zi2>u   zy1s.c         C  s.   i  |  _  | |  _ | r | n |  j |  _ d S(   u5   Create an instance of the Lancaster stemmer.
        N(   t   rule_dictionaryt   _strip_prefixt   default_rule_tuplet   _rule_tuple(   t   selft
   rule_tuplet   strip_prefix_flag(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   __init__Ż   s    		c         C  sŹ   | r | n |  j  } t j d  } i  |  _ xx | D]p } | j |  sa t d j |    n  | d d !} | |  j k r |  j | j |  q4 | g |  j | <q4 Wd S(   u(  Validate the set of rules used in this stemmer.

        If this function is called as an individual method, without using stem
        method, rule_tuple argument will be compiled into self.rule_dictionary.
        If this function is called within stem, self._rule_tuple will be used.

        u   ^[a-z]+\*?\d[a-z]*[>\.]?$u   The rule {0} is invalidi    i   N(   R   t   ret   compileR   t   matcht
   ValueErrort   formatt   append(   R   R	   t
   valid_rulet   rulet   first_letter(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt
   parseRulesš   s    		c         C  sV   | j    } |  j r$ |  j |  n | } | } |  j sF |  j   n  |  j | |  S(   u1   Stem a word using the Lancaster stemmer.
        (   t   lowerR   t   _LancasterStemmer__stripPrefixR   R   t   _LancasterStemmer__doStemming(   R   t   wordt   intact_word(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   stemĐ   s    	c         C  s  t  j d  } t } xw| r|  j |  } | d k  sL | | |  j k rU t } q t } x|  j | | D]} | j |  } | rm | j   \ }	 }
 } } } t |  } | j	 |	 d d d   rr|
 r&| | k rl|  j
 | |  rl|  j | | |  } t } | d k rt } n  Pqlqo|  j
 | |  ro|  j | | |  } t } | d k rht } n  Pqoqrqm qm W| t k r t } q q W| S(   u)   Perform the actual word stemming
        u#   ^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$i    Ni˙˙˙˙u   .(   R   R   t   Truet    _LancasterStemmer__getLastLetterR   t   FalseR   t   groupst   intt   endswitht   _LancasterStemmer__isAcceptablet   _LancasterStemmer__applyRule(   R   R   R   R   t   proceedt   last_letter_positiont   rule_was_appliedR   t
   rule_matcht   ending_stringt   intact_flagt   remove_totalt   append_stringt	   cont_flag(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   __doStemmingŕ   sB    				c         C  sA   d } x4 t  t |   D]  } | | j   r8 | } q Pq W| S(   uQ   Get the zero-based index of the last alphabetic character in this string
        i˙˙˙˙(   t   ranget   lent   isalpha(   R   R   t   last_lettert   position(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   __getLastLetter!  s    	c         C  s   t  } | d d k r8 t |  | d k r t } q nK t |  | d k r | d d k rg t } q | d d k r t } q n  | S(   u:   Determine if the word is acceptable for stemming.
        i    u   aeiouyi   i   i   (   R   R/   R   (   R   R   R*   t   word_is_acceptable(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   __isAcceptable,  s    	c         C  s4   t  |  | } | d | !} | r0 | | 7} n  | S(   u,   Apply the stemming rule to the word
        i    (   R/   (   R   R   R*   R+   t   new_word_length(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   __applyRule>  s
    c      	   C  s2   x+ d
 D]# } | j  |  r | t |  Sq W| S(   uY   Remove prefix from a word.

        This function originally taken from Whoosh.

        u   kilou   microu   milliu   intrau   ultrau   megau   nanou   picou   pseudo(	   u   kilou   microu   milliu   intrau   ultrau   megau   nanou   picou   pseudo(   t
   startswithR/   (   R   R   t   prefix(    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   __stripPrefixJ  s            
c         C  s   d S(   Nu   <LancasterStemmer>(    (   R   (    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   __repr___  s    (s   u   ai*2.u   a*1.u   bb1.u   city3s.u   ci2>u   cn1t>u   dd1.u   dei3y>u   deec2ss.u   dee1.u   de2>u   dooh4>u   e1>u   feil1v.u   fi2>u   gni3>u   gai3y.u   ga2>u   gg1.u   ht*2.u	   hsiug5ct.u   hsi3>u   i*1.u   i1y>u   ji1d.u   juf1s.u   ju1d.u   jo1d.u   jeh1r.u   jrev1t.u   jsim2t.u   jn1d.u   j1s.u   lbaifi6.u   lbai4y.u   lba3>u   lbi3.u   lib2l>u   lc1.u   lufi4y.u   luf3>u   lu2.u   lai3>u   lau3>u   la2>u   ll1.u   mui3.u   mu*2.u   msi3>u   mm1.u   nois4j>u   noix4ct.u   noi3>u   nai3>u   na2>u   nee0.u   ne2>u   nn1.u   pihs4>u   pp1.u   re2>u   rae0.u   ra2.u   ro2>u   ru2>u   rr1.u   rt1>u   rei3y>u   sei3y>u   sis2.u   si2>u   ssen4>u   ss0.u   suo3>u   su*2.u   s*1>u   s0.u	   tacilp4y.u   ta2>u   tnem4>u   tne3>u   tna3>u   tpir2b.u   tpro2b.u   tcud1.u   tpmus2.u   tpec2iv.u   tulo2v.u   tsis0.u   tsi3>u   tt1.u   uqi3.u   ugo1.u   vis3j>u   vie0.u   vi2>u   ylb1>u   yli3y>u   ylp0.u   yl2>u   ygo1.u   yhp1.u   ymo1.u   ypo1.u   yti3>u   yte3>u   ytl2.u   yrtsi5.u   yra3>u   yro3>u   yfi3.u   ycn2t>u   yca3>u   zi2>u   zy1s.N(   t   __name__t
   __module__t   __doc__R   t   NoneR   R   R   R   R   R   R"   R#   R   R;   (    (    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyR      sú   #                                                                                                                  
		A				(	   R>   t
   __future__R    R   t   nltk.stem.apiR   t   nltk.compatR   R   (    (    (    s2   lib/python2.7/site-packages/nltk/stem/lancaster.pyt   <module>   s   