B
    j9C\u3  ã            
   @   s˜  d Z dZdgZddlmZ yddlmZ W n2 ek
r\ Z zG dd„ deƒZW ddZ[X Y nX ddl	Z	ddl
Z
e	jdd	… \ZZZed	ko˜ed
ko˜ed	kZed	ko¨ed	kZed	ko¸edkZddlmZmZmZmZmZ ddlmZmZ ddlmZmZmZ dZG dd„ deƒZ G dd„ deƒZ!ed	kr”ed
kr”es”ddl"Z"e" #d¡Z$e$e!_$e" #de"j%¡Z&e&e _&ddlm'Z'm(Z( dd„ Z)dd„ Z*e)e _)e*e _*dZdS )zCUse the HTMLParser library to parse HTML files that aren't too bad.ZMITÚHTMLParserTreeBuilderé    )Ú
HTMLParser)ÚHTMLParseErrorc               @   s   e Zd ZdS )r   N)Ú__name__Ú
__module__Ú__qualname__© r   r   ú6lib/python3.7/site-packages/bs4/builder/_htmlparser.pyr      s   r   Né   é   é   )ÚCDataÚCommentÚDeclarationÚDoctypeÚProcessingInstruction)ÚEntitySubstitutionÚUnicodeDammit)ÚHTMLÚHTMLTreeBuilderÚSTRICTzhtml.parserc               @   sp   e Zd Zdd„ Zdd„ Zdd„ Zddd	„Zdd
d„Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ ZdS )ÚBeautifulSoupHTMLParserc             O   s   t j| f|ž|Ž g | _d S )N)r   Ú__init__Úalready_closed_empty_element)ÚselfÚargsÚkwargsr   r   r	   r   9   s    	z BeautifulSoupHTMLParser.__init__c             C   s   t  |¡ dS )ai  In Python 3, HTMLParser subclasses must implement error(), although this
        requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() as raising an exception.

        In any event, this method is called only on very strange markup and our best strategy
        is to pretend it didn't happen and keep going.
        N)ÚwarningsÚwarn)r   Úmsgr   r   r	   ÚerrorE   s    	zBeautifulSoupHTMLParser.errorc             C   s   | j ||dd}|  |¡ d S )NF)Úhandle_empty_element)Úhandle_starttagÚhandle_endtag)r   ÚnameÚattrsÚtagr   r   r	   Úhandle_startendtagP   s    z*BeautifulSoupHTMLParser.handle_startendtagTc       	      C   sl   i }x(|D ] \}}|d krd}|||< d}q
W | j  |d d |¡}|rh|jrh|rh| j|dd | j |¡ d S )NÚ z""F)Úcheck_already_closed)Úsoupr"   Zis_empty_elementr#   r   Úappend)	r   r$   r%   r!   Z	attr_dictÚkeyÚvalueÚ	attrvaluer&   r   r   r	   r"   [   s    
z'BeautifulSoupHTMLParser.handle_starttagc             C   s,   |r|| j kr| j  |¡ n| j |¡ d S )N)r   Úremover*   r#   )r   r$   r)   r   r   r	   r#   w   s    z%BeautifulSoupHTMLParser.handle_endtagc             C   s   | j  |¡ d S )N)r*   Úhandle_data)r   Údatar   r   r	   r0   ‚   s    z#BeautifulSoupHTMLParser.handle_datac             C   sî   |  d¡rt| d¡dƒ}n$|  d¡r8t| d¡dƒ}nt|ƒ}d }|dk r xR| jjdfD ]B}|sdqZyt|gƒ |¡}W qZ tk
rš } zW d d }~X Y qZX qZW |sØyt|ƒ}W n& t	t
fk
rÖ } zW d d }~X Y nX |pÞd}|  |¡ d S )NÚxé   ÚXé   zwindows-1252u   ï¿½)Ú
startswithÚintÚlstripr*   Úoriginal_encodingÚ	bytearrayÚdecodeÚUnicodeDecodeErrorÚchrÚ
ValueErrorÚOverflowErrorr0   )r   r$   Z	real_namer1   ÚencodingÚer   r   r	   Úhandle_charref…   s*    

z&BeautifulSoupHTMLParser.handle_charrefc             C   s0   t j |¡}|d k	r|}nd| }|  |¡ d S )Nz&%s)r   ZHTML_ENTITY_TO_CHARACTERÚgetr0   )r   r$   Ú	characterr1   r   r   r	   Úhandle_entityref¦   s
    z(BeautifulSoupHTMLParser.handle_entityrefc             C   s&   | j  ¡  | j  |¡ | j  t¡ d S )N)r*   ÚendDatar0   r   )r   r1   r   r   r	   Úhandle_comment³   s    
z&BeautifulSoupHTMLParser.handle_commentc             C   sN   | j  ¡  | d¡r&|tdƒd … }n|dkr2d}| j  |¡ | j  t¡ d S )NzDOCTYPE ZDOCTYPEr(   )r*   rF   r6   Úlenr0   r   )r   r1   r   r   r	   Úhandle_decl¸   s    

z#BeautifulSoupHTMLParser.handle_declc             C   sN   |  ¡  d¡r$t}|tdƒd … }nt}| j ¡  | j |¡ | j |¡ d S )NzCDATA[)Úupperr6   r   rH   r   r*   rF   r0   )r   r1   Úclsr   r   r	   Úunknown_declÂ   s    
z$BeautifulSoupHTMLParser.unknown_declc             C   s&   | j  ¡  | j  |¡ | j  t¡ d S )N)r*   rF   r0   r   )r   r1   r   r   r	   Ú	handle_piÌ   s    
z!BeautifulSoupHTMLParser.handle_piN)T)T)r   r   r   r   r    r'   r"   r#   r0   rB   rE   rG   rI   rL   rM   r   r   r   r	   r   7   s   

!

r   c               @   s<   e Zd ZdZdZeZeeegZ	dd„ Z
d
dd„Zdd	„ ZdS )r   FTc             O   s*   t rtsd|d< trd|d< ||f| _d S )NFÚstrictZconvert_charrefs)ÚCONSTRUCTOR_TAKES_STRICTÚ CONSTRUCTOR_STRICT_IS_DEPRECATEDÚ"CONSTRUCTOR_TAKES_CONVERT_CHARREFSÚparser_args)r   r   r   r   r   r	   r   Ù   s
    zHTMLParserTreeBuilder.__init__Nc             c   sN   t |tƒr|dddfV  dS ||g}t||d|d}|j|j|j|jfV  dS )z¸
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        NFT)Zis_htmlÚexclude_encodings)Ú
isinstanceÚstrr   Úmarkupr9   Zdeclared_html_encodingZcontains_replacement_characters)r   rV   Zuser_specified_encodingZdocument_declared_encodingrS   Ztry_encodingsZdammitr   r   r	   Úprepare_markupà   s    
z$HTMLParserTreeBuilder.prepare_markupc          
   C   sr   | j \}}t||Ž}| j|_y| |¡ | ¡  W n4 tk
rf } zt tdƒ¡ |‚W d d }~X Y nX g |_	d S )Na*  Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.)
rR   r   r*   ÚfeedÚcloser   r   r   ÚRuntimeWarningr   )r   rV   r   r   ÚparserrA   r   r   r	   rX   ò   s    


zHTMLParserTreeBuilder.feed)NNN)r   r   r   Zis_xmlZ	picklableÚ
HTMLPARSERÚNAMEr   r   Zfeaturesr   rW   rX   r   r   r   r	   r   Ò   s   
 
zQ\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?aê  
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
)ÚtagfindÚattrfindc             C   s6  d | _ |  |¡}|dk r|S | j}|||… | _ g }t ||d ¡}|sPtdƒ‚| ¡ }||d |…  ¡  | _}xÜ||k rN| j	r’t
 ||¡}nt ||¡}|s¤P | ddd¡\}	}
}|
sÂd }n`|d d… d  krè|dd … ksn |d d… d  kr|dd … kr"n n|dd… }|r2|  |¡}| |	 ¡ |f¡ | ¡ }qtW |||…  ¡ }|d	krö|  ¡ \}}d
| j krª|| j  d
¡ }t| j ƒ| j  d
¡ }n|t| j ƒ }| j	rà|  d|||… d d… f ¡ |  |||… ¡ |S | d¡r|  ||¡ n"|  ||¡ || jkr2|  |¡ |S )Nr   é   z#unexpected call to parse_starttag()r   r
   ú'éÿÿÿÿú")ú>z/>Ú
z junk characters in start tag: %ré   z/>)Z__starttag_textZcheck_for_whole_start_tagÚrawdatar^   ÚmatchÚAssertionErrorÚendÚlowerZlasttagrN   r_   Úattrfind_tolerantÚgroupZunescaper+   ÚstripZgetposÚcountrH   Úrfindr    r0   Úendswithr'   r"   ZCDATA_CONTENT_ELEMENTSÚset_cdata_mode)r   ÚiÚendposrg   r%   rh   Úkr&   ÚmZattrnameÚrestr.   rj   ÚlinenoÚoffsetr   r   r	   Úparse_starttag  s\    
(,


rz   c             C   s$   |  ¡ | _t d| j tj¡| _d S )Nz</\s*%s\s*>)rk   Z
cdata_elemÚreÚcompileÚIZinteresting)r   Úelemr   r   r	   rr   T  s    
rr   T)+Ú__doc__Z__license__Ú__all__Zhtml.parserr   r   ÚImportErrorrA   Ú	ExceptionÚsysr   Úversion_infoÚmajorÚminorÚreleaserO   rP   rQ   Zbs4.elementr   r   r   r   r   Z
bs4.dammitr   r   Zbs4.builderr   r   r   r\   r   r   r{   r|   rl   ÚVERBOSEZlocatestarttagendr^   r_   rz   rr   r   r   r   r	   Ú<module>   sD   "	 2
7