ó
­V]c           @   s  d  Z  d Z d g Z d d l m Z y d d l m Z Wn) e k
 rd Z d e f d „  ƒ  YZ n Xd d l Z d d l	 Z	 e j
 d	  \ Z Z Z e d	 k o´ e d
 k o´ e d	 k Z e d	 k oÌ e d	 k Z e d	 k oä e d k Z d d l m Z m Z m Z m Z m Z d d l m Z m Z d d l m Z m Z m Z d Z d e f d „  ƒ  YZ d e f d „  ƒ  YZ  e d	 k re d
 k re rd d l! Z! e! j" d ƒ Z# e# e  _# e! j" d e! j$ ƒ Z% e% e _% d d l& m' Z' m( Z( d „  Z) d „  Z* e) e _) e* e _* e+ Z n  d S(   sC   Use the HTMLParser library to parse HTML files that aren't too bad.t   MITt   HTMLParserTreeBuilderiÿÿÿÿ(   t
   HTMLParser(   t   HTMLParseErrorR   c           B   s   e  Z RS(    (   t   __name__t
   __module__(    (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR      s   Ni   i   i   (   t   CDatat   Commentt   Declarationt   Doctypet   ProcessingInstruction(   t   EntitySubstitutiont   UnicodeDammit(   t   HTMLt   HTMLTreeBuildert   STRICTs   html.parsert   BeautifulSoupHTMLParserc           B   sz   e  Z d  „  Z d „  Z d „  Z e d „ Z e d „ Z d „  Z d „  Z	 d „  Z
 d „  Z d	 „  Z d
 „  Z d „  Z RS(   c         O   s    t  j |  | | Ž g  |  _ d  S(   N(   R   t   __init__t   already_closed_empty_element(   t   selft   argst   kwargs(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR   9   s    	c         C   s   t  j | ƒ d S(   si  In Python 3, HTMLParser subclasses must implement error(), although this
        requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() as raising an exception.

        In any event, this method is called only on very strange markup and our best strategy
        is to pretend it didn't happen and keep going.
        N(   t   warningst   warn(   R   t   msg(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   errorE   s    	c         C   s)   |  j  | | d t ƒ} |  j | ƒ d  S(   Nt   handle_empty_element(   t   handle_starttagt   Falset   handle_endtag(   R   t   namet   attrst   tag(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   handle_startendtagP   s    c   	      C   sœ   i  } x9 | D]1 \ } } | d  k r. d } n  | | | <d } q W|  j j | d  d  | ƒ } | r˜ | j r˜ | r˜ |  j | d t ƒ|  j j | ƒ n  d  S(   Nt    s   ""t   check_already_closed(   t   Nonet   soupR   t   is_empty_elementR   R   R   t   append(	   R   R   R   R   t	   attr_dictt   keyt   valuet	   attrvalueR    (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR   [   s    	


c         C   s<   | r( | |  j  k r( |  j  j | ƒ n |  j j | ƒ d  S(   N(   R   t   removeR%   R   (   R   R   R#   (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR   w   s    c         C   s   |  j  j | ƒ d  S(   N(   R%   t   handle_data(   R   t   data(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR-   ‚   s    c         C   s"  | j  d ƒ r* t | j d ƒ d ƒ } n6 | j  d ƒ rT t | j d ƒ d ƒ } n t | ƒ } d  } | d k  rÐ x[ |  j j d f D]D } | s— q… n  y t | g ƒ j | ƒ } Wq… t k
 rÈ } q… Xq… Wn  | sy t	 | ƒ } Wqt
 t f k
 r} qXn  | pd } |  j | ƒ d  S(   Nt   xi   t   Xi   s   windows-1252u   ï¿½(   t
   startswitht   intt   lstripR$   R%   t   original_encodingt	   bytearrayt   decodet   UnicodeDecodeErrort   unichrt
   ValueErrort   OverflowErrorR-   (   R   R   t	   real_nameR.   t   encodingt   e(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   handle_charref…   s*    c         C   sB   t  j j | ƒ } | d  k	 r' | } n
 d | } |  j | ƒ d  S(   Ns   &%s(   R   t   HTML_ENTITY_TO_CHARACTERt   getR$   R-   (   R   R   t	   characterR.   (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   handle_entityref¦   s
    	
c         C   s1   |  j  j ƒ  |  j  j | ƒ |  j  j t ƒ d  S(   N(   R%   t   endDataR-   R   (   R   R.   (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   handle_comment³   s    c         C   sh   |  j  j ƒ  | j d ƒ r/ | t d ƒ } n | d k rD d } n  |  j  j | ƒ |  j  j t ƒ d  S(   Ns   DOCTYPE t   DOCTYPER"   (   R%   RC   R1   t   lenR-   R	   (   R   R.   (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   handle_decl¸   s    	c         C   se   | j  ƒ  j d ƒ r. t } | t d ƒ } n t } |  j j ƒ  |  j j | ƒ |  j j | ƒ d  S(   Ns   CDATA[(   t   upperR1   R   RF   R   R%   RC   R-   (   R   R.   t   cls(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   unknown_declÂ   s    c         C   s1   |  j  j ƒ  |  j  j | ƒ |  j  j t ƒ d  S(   N(   R%   RC   R-   R
   (   R   R.   (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt	   handle_piÌ   s    (   R   R   R   R   R!   t   TrueR   R   R-   R>   RB   RD   RG   RJ   RK   (    (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR   7   s   					!			
	
c           B   sS   e  Z e Z e Z e Z e e e	 g Z
 d d d  „ Z d d d d „ Z d „  Z RS(   c         K   sn   t  t |  ƒ j |   | p g  } | p+ i  } t rH t rH t | d <n  t r[ t | d <n  | | f |  _ d  S(   Nt   strictt   convert_charrefs(   t   superR   R   t   CONSTRUCTOR_TAKES_STRICTt    CONSTRUCTOR_STRICT_IS_DEPRECATEDR   t"   CONSTRUCTOR_TAKES_CONVERT_CHARREFSt   parser_args(   R   RS   t   parser_kwargsR   (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR   Ù   s    c         c   sl   t  | t ƒ r$ | d d t f Vd S| | g } t | | d t d | ƒ} | j | j | j | j	 f Vd S(   s¸   
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        Nt   is_htmlt   exclude_encodings(
   t
   isinstancet   unicodeR$   R   R   RL   t   markupR4   t   declared_html_encodingt   contains_replacement_characters(   R   RY   t   user_specified_encodingt   document_declared_encodingRV   t   try_encodingst   dammit(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   prepare_markupã   s    	c         C   s   |  j  \ } } t | | Ž  } |  j | _ y | j | ƒ | j ƒ  Wn, t k
 rs } t j t d ƒ ƒ | ‚ n Xg  | _	 d  S(   Ns*  Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.(
   RS   R   R%   t   feedt   closeR   R   R   t   RuntimeWarningR   (   R   RY   R   R   t   parserR=   (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyRa   õ   s    	

N(   R   R   R   t   is_xmlRL   t	   picklablet
   HTMLPARSERt   NAMER   R   t   featuresR$   R   R`   Ra   (    (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR   Ò   s   
sQ   \s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?sê  
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
(   t   tagfindt   attrfindc         C   sË  d  |  _ |  j | ƒ } | d k  r( | S|  j } | | | !|  _ g  } t j | | d ƒ } | so t d ƒ ‚ | j ƒ  } | | d | !j ƒ  |  _	 } x| | k  r§|  j
 rÆ t j | | ƒ } n t j | | ƒ } | sâ Pn  | j d d d ƒ \ }	 }
 } |
 sd  } nX | d  d k o.| d k n sW| d  d k oR| d k n rg| d d !} n  | r|  j | ƒ } n  | j |	 j ƒ  | f ƒ | j ƒ  } qœ W| | | !j ƒ  } | d k rv|  j ƒ  \ } } d |  j k r | |  j j d ƒ } t |  j ƒ |  j j d ƒ } n | t |  j ƒ } |  j
 r^|  j d | | | !d  f ƒ n  |  j | | | !ƒ | S| j d
 ƒ r˜|  j | | ƒ n/ |  j | | ƒ | |  j k rÇ|  j | ƒ n  | S(   Ni    i   s#   unexpected call to parse_starttag()i   i   s   'iÿÿÿÿt   "t   >s   />s   
s    junk characters in start tag: %ri   (   Rm   s   />(   R$   t   __starttag_textt   check_for_whole_start_tagt   rawdataRj   t   matcht   AssertionErrort   endt   lowert   lasttagRM   Rk   t   attrfind_tolerantt   groupt   unescapeR'   t   stript   getpost   countRF   t   rfindR   R-   t   endswithR!   R   t   CDATA_CONTENT_ELEMENTSt   set_cdata_mode(   R   t   it   endposRp   R   Rq   t   kR    t   mt   attrnamet   restR+   Rs   t   linenot   offset(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   parse_starttag   s\    				$$		c         C   s2   | j  ƒ  |  _ t j d |  j t j ƒ |  _ d  S(   Ns   </\s*%s\s*>(   Rt   t
   cdata_elemt   ret   compilet   It   interesting(   R   t   elem(    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyR   W  s    (,   t   __doc__t   __license__t   __all__R   R   t   ImportErrorR=   t	   Exceptiont   sysR   t   version_infot   majort   minort   releaseRP   RQ   RR   t   bs4.elementR   R   R   R	   R
   t
   bs4.dammitR   R   t   bs4.builderR   R   R   Rg   R   R   RŠ   R‹   Rv   t   VERBOSEt   locatestarttagendt   html.parserRj   Rk   Rˆ   R   RL   (    (    (    s6   lib/python2.7/site-packages/bs4/builder/_htmlparser.pyt   <module>   sB   		$(›5				7			