B
    ߜwa9E                 @   s   d Z ddlZddlZddlZddlmZ dgZedZedZ	edZ
edZed	Zed
ZedZedZedZedejZed
ZedZG dd dejZdS )zA parser for HTML and XHTML.    N)unescape
HTMLParserz[&<]z
&[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z	<[a-zA-Z]>z--\s*>z+([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF  
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c               @   s   e Zd ZdZdZddddZdd Zd	d
 Zdd ZdZ	dd Z
dd Zdd Zdd Zdd Zd9ddZdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Zd7d8 ZdS ):r   aE  Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    )ZscriptZstyleT)convert_charrefsc            C   s   || _ |   dS )zInitialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        N)r   reset)selfr    r   lib/python3.7/html/parser.py__init__W   s    zHTMLParser.__init__c             C   s(   d| _ d| _t| _d| _tj|  dS )z1Reset this instance.  Loses all unprocessed data. z???N)rawdatalasttaginteresting_normalinteresting
cdata_elem_markupbase
ParserBaser   )r   r   r   r	   r   `   s
    zHTMLParser.resetc             C   s   | j | | _ | d dS )zFeed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        r   N)r   goahead)r   datar   r   r	   feedh   s    zHTMLParser.feedc             C   s   |  d dS )zHandle any buffered data.   N)r   )r   r   r   r	   closeq   s    zHTMLParser.closeNc             C   s   | j S )z)Return full source of start tag: '<...>'.)_HTMLParser__starttag_text)r   r   r   r	   get_starttag_textw   s    zHTMLParser.get_starttag_textc             C   s$   |  | _td| j tj| _d S )Nz</\s*%s\s*>)lowerr   recompileIr   )r   elemr   r   r	   set_cdata_mode{   s    
zHTMLParser.set_cdata_modec             C   s   t | _d | _d S )N)r   r   r   )r   r   r   r	   clear_cdata_mode   s    zHTMLParser.clear_cdata_modec             C   sN  | j }d}t|}x||k r| jrx| jsx|d|}|dk r|dt||d }|dkrrtd	||srP |}n(| j
	||}|r| }n| jrP |}||k r| jr| js| t|||  n| |||  | ||}||krP |j}|d|rDt||r | |}	n|d|r8| |}	nl|d|rP| |}	nT|d|rh| |}	n<|d	|r| |}	n$|d
 |k r| d |d
 }	nP |	dk r6|sP |d|d
 }	|	dk r|d|d
 }	|	dk r|d
 }	n|	d
7 }	| jr$| js$| t|||	  n| |||	  | ||	}q|d|rt||}|r| dd }
| |
 | }	|d|	d
 s|	d
 }	| ||	}qn:d||d  kr| |||d   | ||d }P q|d|rt||}|rH|d
}
| |
 | }	|d|	d
 s:|	d
 }	| ||	}qt||}|r|r| ||d  kr| }	|	|kr|}	| ||d
 }P n,|d
 |k r| d | ||d
 }nP qdstdqW |r<||k r<| js<| jr| js| t|||  n| |||  | ||}||d  | _ d S )Nr   <&"   z[\s;]z</z<!--z<?z<!r   r   z&#   ;zinteresting.search() lied)r   lenr   r   findrfindmaxr   r   searchr   starthandle_datar   Z	updatepos
startswithstarttagopenmatchparse_starttagparse_endtagparse_commentparse_piparse_html_declarationcharrefgrouphandle_charrefend	entityrefhandle_entityref
incompleteAssertionError)r   r9   r   injZampposr0   r.   knamer   r   r	   r      s    
 










zHTMLParser.goaheadc             C   s   | j }|||d  dks"td|||d  dkr@| |S |||d  dkr^| |S |||d   d	kr|d
|d }|dkrdS | ||d |  |d S | |S d S )Nr$   z<!z+unexpected call to parse_html_declaration()   z<!--   z<![	   z	<!doctyper   r%   r   )r   r=   r3   Zparse_marked_sectionr   r(   handle_declparse_bogus_comment)r   r>   r   gtposr   r   r	   r5      s    

z!HTMLParser.parse_html_declarationr   c             C   s`   | j }|||d  dks"td|d|d }|dkr>dS |rX| ||d |  |d S )Nr$   )z<!z</z"unexpected call to parse_comment()r   r%   r   )r   r=   r(   handle_comment)r   r>   Zreportr   posr   r   r	   rG     s    zHTMLParser.parse_bogus_commentc             C   sd   | j }|||d  dks"tdt||d }|s:dS | }| ||d |  | }|S )Nr$   z<?zunexpected call to parse_pi()r%   )r   r=   picloser+   r,   	handle_pir9   )r   r>   r   r0   r@   r   r   r	   r4   !  s    zHTMLParser.parse_pic             C   s  d | _ | |}|dk r|S | j}||| | _ g }t||d }|sPtd| }|d  | _	}x||k r.t
||}|sP |ddd\}	}
}|
sd }n\|d d d  kr|dd  ksn |d d d  kr|dd  krn n|dd }|rt|}||	 |f | }qnW |||  }|d	kr|  \}}d
| j kr|| j d
 }t| j | j d
 }n|t| j  }| |||  |S |dr| || n"| || || jkr| | |S )Nr   r   z#unexpected call to parse_starttag()r$   rD   'r%   ")r   z/>
z/>)r   check_for_whole_start_tagr   tagfind_tolerantr0   r=   r9   r7   r   r   attrfind_tolerantr   appendstripZgetposcountr'   r)   r-   endswithhandle_startendtaghandle_starttagCDATA_CONTENT_ELEMENTSr   )r   r>   endposr   attrsr0   rA   tagmZattrnamerestZ	attrvaluer9   linenooffsetr   r   r	   r1   -  sR    
&*

zHTMLParser.parse_starttagc             C   s   | j }t||}|r| }|||d  }|dkr>|d S |dkr~|d|rZ|d S |d|rjdS ||krv|S |d S |dkrdS |dkrdS ||kr|S |d S td	d S )
Nr   r   /z/>r$   r%   r   z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!)r   locatestarttagend_tolerantr0   r9   r.   r=   )r   r>   r   r]   r@   nextr   r   r	   rP   `  s.    z$HTMLParser.check_for_whole_start_tagc             C   s.  | j }|||d  dks"tdt||d }|s:dS | }t||}|s| jd k	rr| |||  |S t	||d }|s|||d  dkr|d S | 
|S |d }|d| }| | |d S |d }| jd k	r|| jkr| |||  |S | | |   |S )	Nr$   z</zunexpected call to parse_endtagr   r%   rD   z</>r   )r   r=   	endendtagr+   r9   
endtagfindr0   r   r-   rQ   rG   r7   r   r(   handle_endtagr    )r   r>   r   r0   rH   Z	namematchZtagnamer   r   r   r	   r2     s8    



zHTMLParser.parse_endtagc             C   s   |  || | | d S )N)rX   rf   )r   r\   r[   r   r   r	   rW     s    zHTMLParser.handle_startendtagc             C   s   d S )Nr   )r   r\   r[   r   r   r	   rX     s    zHTMLParser.handle_starttagc             C   s   d S )Nr   )r   r\   r   r   r	   rf     s    zHTMLParser.handle_endtagc             C   s   d S )Nr   )r   rB   r   r   r	   r8     s    zHTMLParser.handle_charrefc             C   s   d S )Nr   )r   rB   r   r   r	   r;     s    zHTMLParser.handle_entityrefc             C   s   d S )Nr   )r   r   r   r   r	   r-     s    zHTMLParser.handle_datac             C   s   d S )Nr   )r   r   r   r   r	   rI     s    zHTMLParser.handle_commentc             C   s   d S )Nr   )r   Zdeclr   r   r	   rF     s    zHTMLParser.handle_declc             C   s   d S )Nr   )r   r   r   r   r	   rL     s    zHTMLParser.handle_pic             C   s   d S )Nr   )r   r   r   r   r	   unknown_decl  s    zHTMLParser.unknown_declc             C   s   t jdtdd t|S )NzZThe unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.r$   )
stacklevel)warningswarnDeprecationWarningr   )r   sr   r   r	   r     s    
zHTMLParser.unescape)r   )__name__
__module____qualname____doc__rY   r
   r   r   r   r   r   r   r    r   r5   rG   r4   r1   rP   r2   rW   rX   rf   r8   r;   r-   rI   rF   rL   rg   r   r   r   r   r	   r   ?   s8   		z
3"()rp   r   ri   r   Zhtmlr   __all__r   r   r<   r:   r6   r/   rK   ZcommentcloserQ   rR   VERBOSErb   rd   re   r   r   r   r   r   r	   <module>   s(   











