B
    n&=[+                @   s   d dl mZmZmZ d dlmZ d dlmZ ddl	m
Z
 ddl	mZ ddl	mZmZ ddl	mZmZmZ dd	l	mZmZ dd
l	mZ ddlmZ ddlmZ eeZG dd deZdS )    )absolute_importdivisionunicode_literals)unichr)deque   )spaceCharacters)entities)asciiLettersasciiUpper2Lower)digits	hexDigitsEOF)
tokenTypestagTokenTypes)replacementCharacters)HTMLInputStream)Triec                   sd  e Zd ZdZd fdd	Zdd Zdd Zdd
dZdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Zd<d= Z d>d? Z!d@dA Z"dBdC Z#dDdE Z$dFdG Z%dHdI Z&dJdK Z'dLdM Z(dNdO Z)dPdQ Z*dRdS Z+dTdU Z,dVdW Z-dXdY Z.dZd[ Z/d\d] Z0d^d_ Z1d`da Z2dbdc Z3ddde Z4dfdg Z5dhdi Z6djdk Z7dldm Z8dndo Z9dpdq Z:drds Z;dtdu Z<dvdw Z=dxdy Z>dzd{ Z?d|d} Z@d~d ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd ZGdd ZHdd ZIdd ZJdd ZKdd ZL  ZMS )HTMLTokenizera	   This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.stream
      Points to HTMLInputStream object.
    Nc                sF   t |f|| _|| _d| _g | _| j| _d| _d | _t	t
|   d S )NF)r   streamparserZ
escapeFlagZlastFourChars	dataStatestateescapecurrentTokensuperr   __init__)selfr   r   kwargs)	__class__ 2lib/python3.7/site-packages/html5lib/_tokenizer.pyr   "   s    zHTMLTokenizer.__init__c             c   s\   t g | _xL|  rVx&| jjr:td | jjddV  qW x| jrR| j V  q>W qW dS )z This is where the magic happens.

        We do our usually processing through the states and when we have a token
        to return we yield the token which pauses processing until the next token
        is requested.
        
ParseErrorr   )typedataN)r   
tokenQueuer   r   errorsr   poppopleft)r   r    r    r!   __iter__1   s    


zHTMLTokenizer.__iter__c       	   %   C   s  t }d}|rt}d}g }| j }x(||krJ|tk	rJ|| | j }q$W td||}|tkrt| }| j	t
d dd|id nbd|  krd	ksn |d
krd}| j	t
d dd|id n d|  krdksn d|  krdksn d|  krdksn d|  kr0dksn |tddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d
g#kr| j	t
d dd|id yt|}W n> tk
r   |d6 }td|d? B td7|d8@ B  }Y nX |d9kr| j	t
d d:d; | j| |S )<zThis function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
        
       r"   z$illegal-codepoint-for-numeric-entity	charAsInt)r#   r$   datavarsi   i  i u   �r                  i  i     i  i  i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i i   i   i  ;z numeric-entity-without-semicolon)r#   r$   )r   r   r   charr   appendintjoinr   r%   r   	frozensetchr
ValueErrorunget)	r   ZisHexZallowedradix	charStackcr-   r6   vr    r    r!   consumeNumberEntityA   s`    

&

z!HTMLTokenizer.consumeNumberEntityFc       	      C   s  d}| j  g}|d tksB|d tddfksB|d k	rV||d krV| j |d  n |d dkrd}|| j   |d dkrd}|| j   |r|d tks|s|d tkr| j |d  | |}n4| j	t
d	 d
d | j |  dd| }njx8|d tk	rDtd|s0P || j   qW y$td|d d }t|}W n tk
r   d }Y nX |d k	rB|d dkr| j	t
d	 dd |d dkr|r|| tks|| tks|| dkr| j |  dd| }n.t| }| j |  |d||d  7 }n4| j	t
d	 dd | j |  dd| }|r| jd d d  |7  < n*|tkrd}nd}| j	t
| |d d S )N&r   <#F)xXTr"   zexpected-numeric-entity)r#   r$   r,   r5   znamed-entity-without-semicolon=zexpected-named-entityr$   r   SpaceCharacters
Characters)r   r6   r   r   r=   r7   r   r   rB   r%   r   r'   r9   entitiesTrieZhas_keys_with_prefixZlongest_prefixlenKeyErrorr
   r	   r   )	r   allowedCharfromAttributeoutputr?   hexZ
entityNameZentityLengthZ	tokenTyper    r    r!   consumeEntity   sf    





zHTMLTokenizer.consumeEntityc             C   s   | j |dd dS )zIThis method replaces the need for "entityInAttributeValueState".
        T)rO   rP   N)rS   )r   rO   r    r    r!   processEntityInAttribute   s    z&HTMLTokenizer.processEntityInAttributec             C   s   | j }|d tkrp|d t|d< |d td krp|d rR| jtd dd |d rp| jtd d	d | j| | j| _d
S )zThis method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        r#   nameEndTagr$   r"   zattributes-in-end-tag)r#   r$   selfClosingzself-closing-flag-on-end-tagN)	r   r   	translater   r   r%   r7   r   r   )r   tokenr    r    r!   emitCurrentToken   s    

zHTMLTokenizer.emitCurrentTokenc             C   s   | j  }|dkr| j| _n|dkr.| j| _n|dkrd| jtd dd | jtd dd n`|tkrpdS |t	kr| jtd	 || j 
t	d
 d n&| j 
d}| jtd || d d
S )NrC   rD    r"   zinvalid-codepoint)r#   r$   rK   FrJ   T)rC   rD   r[   )r   r6   entityDataStater   tagOpenStater%   r7   r   r   r   
charsUntil)r   r$   charsr    r    r!   r      s&    



zHTMLTokenizer.dataStatec             C   s   |    | j| _dS )NT)rS   r   r   )r   r    r    r!   r\     s    zHTMLTokenizer.entityDataStatec             C   s   | j  }|dkr| j| _n|dkr.| j| _n|tkr:dS |dkrp| jtd dd | jtd d	d nT|t	kr| jtd
 || j 
t	d d n&| j 
d}| jtd || d dS )NrC   rD   Fr[   r"   zinvalid-codepoint)r#   r$   rK   u   �rJ   T)rC   rD   r[   )r   r6   characterReferenceInRcdatar   rcdataLessThanSignStater   r%   r7   r   r   r^   )r   r$   r_   r    r    r!   rcdataState  s&    



zHTMLTokenizer.rcdataStatec             C   s   |    | j| _dS )NT)rS   rb   r   )r   r    r    r!   r`   1  s    z(HTMLTokenizer.characterReferenceInRcdatac             C   s   | j  }|dkr| j| _nh|dkrR| jtd dd | jtd dd n2|tkr^dS | j d	}| jtd || d d
S )NrD   r[   r"   zinvalid-codepoint)r#   r$   rK   u   �F)rD   r[   T)	r   r6   rawtextLessThanSignStater   r%   r7   r   r   r^   )r   r$   r_   r    r    r!   rawtextState6  s    


zHTMLTokenizer.rawtextStatec             C   s   | j  }|dkr| j| _nh|dkrR| jtd dd | jtd dd n2|tkr^dS | j d	}| jtd || d d
S )NrD   r[   r"   zinvalid-codepoint)r#   r$   rK   u   �F)rD   r[   T)	r   r6   scriptDataLessThanSignStater   r%   r7   r   r   r^   )r   r$   r_   r    r    r!   scriptDataStateH  s    


zHTMLTokenizer.scriptDataStatec             C   sr   | j  }|tkrdS |dkrL| jtd dd | jtd dd n"| jtd || j d d dS )	NFr[   r"   zinvalid-codepoint)r#   r$   rK   u   �T)r   r6   r   r%   r7   r   r^   )r   r$   r    r    r!   plaintextStateZ  s    

zHTMLTokenizer.plaintextStatec             C   s  | j  }|dkr| j| _n|dkr.| j| _n|tkrVtd |g ddd| _| j| _n|dkr| j	
td dd	 | j	
td
 dd	 | j| _nt|dkr| j	
td dd	 | j | | j| _n@| j	
td dd	 | j	
td
 dd	 | j | | j| _dS )N!/ZStartTagF)r#   rU   r$   rW   ZselfClosingAcknowledged>r"   z'expected-tag-name-but-got-right-bracket)r#   r$   rK   z<>?z'expected-tag-name-but-got-question-markzexpected-tag-namerD   T)r   r6   markupDeclarationOpenStater   closeTagOpenStater
   r   r   tagNameStater%   r7   r   r=   bogusCommentState)r   r$   r    r    r!   r]   i  s6    









zHTMLTokenizer.tagOpenStatec             C   s   | j  }|tkr0td |g dd| _| j| _n|dkrX| jtd dd | j	| _nn|t
kr| jtd dd | jtd	 d
d | j	| _n0| jtd dd|id | j | | j| _dS )NrV   F)r#   rU   r$   rW   rj   r"   z*expected-closing-tag-but-got-right-bracket)r#   r$   z expected-closing-tag-but-got-eofrK   z</z!expected-closing-tag-but-got-charr$   )r#   r$   r.   T)r   r6   r
   r   r   rn   r   r%   r7   r   r   r=   ro   )r   r$   r    r    r!   rm     s(    





zHTMLTokenizer.closeTagOpenStatec             C   s   | j  }|tkr| j| _n|dkr.|   n~|tkrV| jt	d dd | j
| _nV|dkrh| j| _nD|dkr| jt	d dd | jd  d	7  < n| jd  |7  < d
S )Nrj   r"   zeof-in-tag-name)r#   r$   ri   r[   zinvalid-codepointrU   u   �T)r   r6   r   beforeAttributeNameStater   rZ   r   r%   r7   r   r   selfClosingStartTagStater   )r   r$   r    r    r!   rn     s"    






zHTMLTokenizer.tagNameStatec             C   sP   | j  }|dkr"d| _| j| _n*| jtd dd | j | | j	| _dS )Nri   r,   rK   rD   )r#   r$   T)
r   r6   temporaryBufferrcdataEndTagOpenStater   r%   r7   r   r=   rb   )r   r$   r    r    r!   ra     s    

z%HTMLTokenizer.rcdataLessThanSignStatec             C   sX   | j  }|tkr*|  j|7  _| j| _n*| jtd dd | j 	| | j
| _dS )NrK   z</)r#   r$   T)r   r6   r
   rr   rcdataEndTagNameStater   r%   r7   r   r=   rb   )r   r$   r    r    r!   rs     s    

z#HTMLTokenizer.rcdataEndTagOpenStatec             C   s   | j o| j d  | j k}| j }|tkrT|rTtd | jg dd| _ | j| _n|dkr|rtd | jg dd| _ | j	| _n||dkr|rtd | jg dd| _ | 
  | j| _nH|tkr|  j|7  _n0| jtd d| j d	 | j| | j| _d
S )NrU   rV   F)r#   rU   r$   rW   ri   rj   rK   z</)r#   r$   T)r   lowerrr   r   r6   r   r   rp   r   rq   rZ   r   r
   r%   r7   r=   rb   )r   appropriater$   r    r    r!   rt     s2    



z#HTMLTokenizer.rcdataEndTagNameStatec             C   sP   | j  }|dkr"d| _| j| _n*| jtd dd | j | | j	| _dS )Nri   r,   rK   rD   )r#   r$   T)
r   r6   rr   rawtextEndTagOpenStater   r%   r7   r   r=   rd   )r   r$   r    r    r!   rc     s    

z&HTMLTokenizer.rawtextLessThanSignStatec             C   sX   | j  }|tkr*|  j|7  _| j| _n*| jtd dd | j 	| | j
| _dS )NrK   z</)r#   r$   T)r   r6   r
   rr   rawtextEndTagNameStater   r%   r7   r   r=   rd   )r   r$   r    r    r!   rw     s    

z$HTMLTokenizer.rawtextEndTagOpenStatec             C   s   | j o| j d  | j k}| j }|tkrT|rTtd | jg dd| _ | j| _n|dkr|rtd | jg dd| _ | j	| _n||dkr|rtd | jg dd| _ | 
  | j| _nH|tkr|  j|7  _n0| jtd d| j d	 | j| | j| _d
S )NrU   rV   F)r#   rU   r$   rW   ri   rj   rK   z</)r#   r$   T)r   ru   rr   r   r6   r   r   rp   r   rq   rZ   r   r
   r%   r7   r=   rd   )r   rv   r$   r    r    r!   rx     s2    



z$HTMLTokenizer.rawtextEndTagNameStatec             C   sx   | j  }|dkr"d| _| j| _nR|dkrJ| jtd dd | j| _n*| jtd dd | j 	| | j
| _dS )	Nri   r,   rh   rK   z<!)r#   r$   rD   T)r   r6   rr   scriptDataEndTagOpenStater   r%   r7   r   scriptDataEscapeStartStater=   rf   )r   r$   r    r    r!   re     s    


z)HTMLTokenizer.scriptDataLessThanSignStatec             C   sX   | j  }|tkr*|  j|7  _| j| _n*| jtd dd | j 	| | j
| _dS )NrK   z</)r#   r$   T)r   r6   r
   rr   scriptDataEndTagNameStater   r%   r7   r   r=   rf   )r   r$   r    r    r!   ry   ,  s    

z'HTMLTokenizer.scriptDataEndTagOpenStatec             C   s   | j o| j d  | j k}| j }|tkrT|rTtd | jg dd| _ | j| _n|dkr|rtd | jg dd| _ | j	| _n||dkr|rtd | jg dd| _ | 
  | j| _nH|tkr|  j|7  _n0| jtd d| j d	 | j| | j| _d
S )NrU   rV   F)r#   rU   r$   rW   ri   rj   rK   z</)r#   r$   T)r   ru   rr   r   r6   r   r   rp   r   rq   rZ   r   r
   r%   r7   r=   rf   )r   rv   r$   r    r    r!   r{   7  s2    



z'HTMLTokenizer.scriptDataEndTagNameStatec             C   sJ   | j  }|dkr2| jtd dd | j| _n| j | | j| _dS )N-rK   )r#   r$   T)	r   r6   r%   r7   r   scriptDataEscapeStartDashStater   r=   rf   )r   r$   r    r    r!   rz   S  s    

z(HTMLTokenizer.scriptDataEscapeStartStatec             C   sJ   | j  }|dkr2| jtd dd | j| _n| j | | j| _dS )Nr|   rK   )r#   r$   T)	r   r6   r%   r7   r   scriptDataEscapedDashDashStater   r=   rf   )r   r$   r    r    r!   r}   ]  s    

z,HTMLTokenizer.scriptDataEscapeStartDashStatec             C   s   | j  }|dkr2| jtd dd | j| _n|dkrD| j| _nn|dkrz| jtd dd | jtd dd n8|tkr| j	| _n&| j 
d	}| jtd || d d
S )Nr|   rK   )r#   r$   rD   r[   r"   zinvalid-codepointu   �)rD   r|   r[   T)r   r6   r%   r7   r   scriptDataEscapedDashStater   "scriptDataEscapedLessThanSignStater   r   r^   )r   r$   r_   r    r    r!   scriptDataEscapedStateg  s"    




z$HTMLTokenizer.scriptDataEscapedStatec             C   s   | j  }|dkr2| jtd dd | j| _n|dkrD| j| _nn|dkr| jtd dd | jtd dd | j| _n0|t	kr| j
| _n| jtd |d | j| _d	S )
Nr|   rK   )r#   r$   rD   r[   r"   zinvalid-codepointu   �T)r   r6   r%   r7   r   r~   r   r   r   r   r   )r   r$   r    r    r!   r   {  s"    






z(HTMLTokenizer.scriptDataEscapedDashStatec             C   s   | j  }|dkr*| jtd dd n|dkr<| j| _n|dkrd| jtd dd | j| _nn|dkr| jtd dd | jtd d	d | j| _n0|t	kr| j
| _n| jtd |d | j| _d
S )Nr|   rK   )r#   r$   rD   rj   r[   r"   zinvalid-codepointu   �T)r   r6   r%   r7   r   r   r   rf   r   r   r   )r   r$   r    r    r!   r~     s&    






z,HTMLTokenizer.scriptDataEscapedDashDashStatec             C   s   | j  }|dkr"d| _| j| _n\|tkrT| jtd d| d || _| j	| _n*| jtd dd | j 
| | j| _dS )Nri   r,   rK   rD   )r#   r$   T)r   r6   rr    scriptDataEscapedEndTagOpenStater   r
   r%   r7   r    scriptDataDoubleEscapeStartStater=   r   )r   r$   r    r    r!   r     s    


z0HTMLTokenizer.scriptDataEscapedLessThanSignStatec             C   sP   | j  }|tkr"|| _| j| _n*| jtd dd | j 	| | j
| _dS )NrK   z</)r#   r$   T)r   r6   r
   rr    scriptDataEscapedEndTagNameStater   r%   r7   r   r=   r   )r   r$   r    r    r!   r     s    

z.HTMLTokenizer.scriptDataEscapedEndTagOpenStatec             C   s   | j o| j d  | j k}| j }|tkrT|rTtd | jg dd| _ | j| _n|dkr|rtd | jg dd| _ | j	| _n||dkr|rtd | jg dd| _ | 
  | j| _nH|tkr|  j|7  _n0| jtd d| j d	 | j| | j| _d
S )NrU   rV   F)r#   rU   r$   rW   ri   rj   rK   z</)r#   r$   T)r   ru   rr   r   r6   r   r   rp   r   rq   rZ   r   r
   r%   r7   r=   r   )r   rv   r$   r    r    r!   r     s2    



z.HTMLTokenizer.scriptDataEscapedEndTagNameStatec             C   s   | j  }|ttdB krR| jtd |d | j dkrH| j	| _
q| j| _
nB|tkr| jtd |d |  j|7  _n| j | | j| _
dS )N)ri   rj   rK   )r#   r$   scriptT)r   r6   r   r:   r%   r7   r   rr   ru   scriptDataDoubleEscapedStater   r   r
   r=   )r   r$   r    r    r!   r     s    


z.HTMLTokenizer.scriptDataDoubleEscapeStartStatec             C   s   | j  }|dkr2| jtd dd | j| _n|dkrZ| jtd dd | j| _nt|dkr| jtd dd | jtd dd n>|tkr| jtd d	d | j	| _n| jtd |d d
S )Nr|   rK   )r#   r$   rD   r[   r"   zinvalid-codepointu   �zeof-in-script-in-scriptT)
r   r6   r%   r7   r    scriptDataDoubleEscapedDashStater   (scriptDataDoubleEscapedLessThanSignStater   r   )r   r$   r    r    r!   r     s$    





z*HTMLTokenizer.scriptDataDoubleEscapedStatec             C   s   | j  }|dkr2| jtd dd | j| _n|dkrZ| jtd dd | j| _n|dkr| jtd dd | jtd dd | j| _nF|t	kr| jtd d	d | j
| _n| jtd |d | j| _d
S )Nr|   rK   )r#   r$   rD   r[   r"   zinvalid-codepointu   �zeof-in-script-in-scriptT)r   r6   r%   r7   r   $scriptDataDoubleEscapedDashDashStater   r   r   r   r   )r   r$   r    r    r!   r      s(    







z.HTMLTokenizer.scriptDataDoubleEscapedDashStatec             C   s  | j  }|dkr*| jtd dd n|dkrR| jtd dd | j| _n|dkrz| jtd dd | j| _n|dkr| jtd dd | jtd d	d | j| _nF|t	kr| jtd d
d | j
| _n| jtd |d | j| _dS )Nr|   rK   )r#   r$   rD   rj   r[   r"   zinvalid-codepointu   �zeof-in-script-in-scriptT)r   r6   r%   r7   r   r   r   rf   r   r   r   )r   r$   r    r    r!   r     s,    







z2HTMLTokenizer.scriptDataDoubleEscapedDashDashStatec             C   sP   | j  }|dkr8| jtd dd d| _| j| _n| j | | j	| _dS )Nri   rK   )r#   r$   r,   T)
r   r6   r%   r7   r   rr   scriptDataDoubleEscapeEndStater   r=   r   )r   r$   r    r    r!   r   0  s    

z6HTMLTokenizer.scriptDataDoubleEscapedLessThanSignStatec             C   s   | j  }|ttdB krR| jtd |d | j dkrH| j	| _
q| j| _
nB|tkr| jtd |d |  j|7  _n| j | | j| _
dS )N)ri   rj   rK   )r#   r$   r   T)r   r6   r   r:   r%   r7   r   rr   ru   r   r   r   r
   r=   )r   r$   r    r    r!   r   ;  s    


z,HTMLTokenizer.scriptDataDoubleEscapeEndStatec             C   s0  | j  }|tkr$| j td n|tkrJ| jd |dg | j| _n|dkr\| 	  n|dkrn| j
| _n|dkr| jtd dd	 | jd |dg | j| _n|d
kr| jtd dd	 | jd ddg | j| _nF|tkr| jtd dd	 | j| _n| jd |dg | j| _dS )NTr$   r,   rj   ri   )'"rI   rD   r"   z#invalid-character-in-attribute-name)r#   r$   r[   zinvalid-codepointu   �z#expected-attribute-name-but-got-eof)r   r6   r   r^   r
   r   r7   attributeNameStater   rZ   rq   r%   r   r   r   )r   r$   r    r    r!   rp   K  s6    










z&HTMLTokenizer.beforeAttributeNameStatec             C   s  | j  }d}d}|dkr&| j| _n.|tkr\| jd d d  || j td 7  < d}n|dkrjd}n|tkr|| j| _n|dkr| j	| _n|d	kr| j
td
 dd | jd d d  d7  < d}n|dkr| j
td
 dd | jd d d  |7  < d}nH|tkr6| j
td
 dd | j| _n| jd d d  |7  < d}|r| jd d d t| jd d d< xP| jd d d D ]:\}}| jd d d |kr| j
td
 dd P qW |r|   dS )NTFrI   r$   rF   r   rj   ri   r[   r"   zinvalid-codepoint)r#   r$   u   �)r   r   rD   z#invalid-character-in-attribute-namezeof-in-attribute-namezduplicate-attribute)r   r6   beforeAttributeValueStater   r
   r   r^   r   afterAttributeNameStaterq   r%   r7   r   r   r   rX   r   rZ   )r   r$   ZleavingThisStateZ	emitTokenrU   _r    r    r!   r   i  sR    








&
z HTMLTokenizer.attributeNameStatec             C   sD  | j  }|tkr$| j td n|dkr8| j| _n|dkrJ|   n|tkrp| jd 	|dg | j
| _n|dkr| j| _n|dkr| j	td d	d
 | jd 	ddg | j
| _n|dkr| j	td dd
 | jd 	|dg | j
| _nF|tkr$| j	td dd
 | j| _n| jd 	|dg | j
| _dS )NTrI   rj   r$   r,   ri   r[   r"   zinvalid-codepoint)r#   r$   u   �)r   r   rD   z&invalid-character-after-attribute-namezexpected-end-of-tag-but-got-eof)r   r6   r   r^   r   r   rZ   r
   r   r7   r   rq   r%   r   r   r   )r   r$   r    r    r!   r     s:    










z%HTMLTokenizer.afterAttributeNameStatec             C   sh  | j  }|tkr$| j td n@|dkr8| j| _n,|dkrX| j| _| j | n|dkrj| j| _n|dkr| j	
td dd |   n|d	kr| j	
td d
d | jd d d  d7  < | j| _n|dkr| j	
td dd | jd d d  |7  < | j| _nL|tkrB| j	
td dd | j| _n"| jd d d  |7  < | j| _dS )NTr   rC   r   rj   r"   z.expected-attribute-value-but-got-right-bracket)r#   r$   r[   zinvalid-codepointr$   rF   r   u   �)rI   rD   `z"equals-in-unquoted-attribute-valuez$expected-attribute-value-but-got-eof)r   r6   r   r^   attributeValueDoubleQuotedStater   attributeValueUnQuotedStater=   attributeValueSingleQuotedStater%   r7   r   rZ   r   r   r   )r   r$   r    r    r!   r     s>    











z'HTMLTokenizer.beforeAttributeValueStatec             C   s   | j  }|dkr| j| _n|dkr0| d n|dkrj| jtd dd | jd d d	  d
7  < nN|t	kr| jtd dd | j
| _n&| jd d d	  || j d 7  < dS )Nr   rC   r[   r"   zinvalid-codepoint)r#   r$   r$   rF   r   u   �z#eof-in-attribute-value-double-quote)r   rC   r[   T)r   r6   afterAttributeValueStater   rT   r%   r7   r   r   r   r   r^   )r   r$   r    r    r!   r     s     




z-HTMLTokenizer.attributeValueDoubleQuotedStatec             C   s   | j  }|dkr| j| _n|dkr0| d n|dkrj| jtd dd | jd d d	  d
7  < nN|t	kr| jtd dd | j
| _n&| jd d d	  || j d 7  < dS )Nr   rC   r[   r"   zinvalid-codepoint)r#   r$   r$   rF   r   u   �z#eof-in-attribute-value-single-quote)r   rC   r[   T)r   r6   r   r   rT   r%   r7   r   r   r   r   r^   )r   r$   r    r    r!   r     s     




z-HTMLTokenizer.attributeValueSingleQuotedStatec             C   s  | j  }|tkr| j| _n|dkr0| d n|dkrB|   n|dkr|| jt	d dd | j
d d d	  |7  < n|d
kr| jt	d dd | j
d d d	  d7  < nV|tkr| jt	d dd | j| _n.| j
d d d	  || j tdtB  7  < dS )NrC   rj   )r   r   rI   rD   r   r"   z0unexpected-character-in-unquoted-attribute-value)r#   r$   r$   rF   r   r[   zinvalid-codepointu   �z eof-in-attribute-value-no-quotes)rC   rj   r   r   rI   rD   r   r[   T)r   r6   r   rp   r   rT   rZ   r%   r7   r   r   r   r   r^   r:   )r   r$   r    r    r!   r     s,    






z)HTMLTokenizer.attributeValueUnQuotedStatec             C   s   | j  }|tkr| j| _n|dkr.|   np|dkr@| j| _n^|tkrt| j	t
d dd | j | | j| _n*| j	t
d dd | j | | j| _dS )Nrj   ri   r"   z$unexpected-EOF-after-attribute-value)r#   r$   z*unexpected-character-after-attribute-valueT)r   r6   r   rp   r   rZ   rq   r   r%   r7   r   r=   r   )r   r$   r    r    r!   r      s"    






z&HTMLTokenizer.afterAttributeValueStatec             C   s   | j  }|dkr&d| jd< |   n^|tkrZ| jtd dd | j | | j	| _
n*| jtd dd | j | | j| _
dS )Nrj   TrW   r"   z#unexpected-EOF-after-solidus-in-tag)r#   r$   z)unexpected-character-after-solidus-in-tag)r   r6   r   rZ   r   r%   r7   r   r=   r   r   rp   )r   r$   r    r    r!   rq   4  s    





z&HTMLTokenizer.selfClosingStartTagStatec             C   sD   | j d}|dd}| jtd |d | j   | j| _dS )Nrj   r[   u   �Comment)r#   r$   T)	r   r^   replacer%   r7   r   r6   r   r   )r   r$   r    r    r!   ro   F  s    
zHTMLTokenizer.bogusCommentStatec             C   s  | j  g}|d dkrR|| j   |d dkrPtd dd| _| j| _dS n|d dkrd}x.dD ]&}|| j   |d |krhd	}P qhW |rtd
 dd d dd| _| j| _dS n|d dkrF| jd k	rF| jj	j
rF| jj	j
d j| jj	jkrFd}x2dD ]*}|| j   |d |krd	}P qW |rF| j| _dS | jtd dd x|rx| j |  q^W | j| _dS )NrF   r|   r   r,   )r#   r$   T)dD))oO)r@   C)tT)yY)pP)eEFZDoctype)r#   rU   publicIdsystemIdcorrect[)r   r   Ar   r   r   r"   zexpected-dashes-or-doctype)r   r6   r7   r   r   commentStartStater   doctypeStater   ZtreeZopenElements	namespaceZdefaultNamespacecdataSectionStater%   r=   r'   ro   )r   r?   matchedexpectedr    r    r!   rl   U  sP    



z(HTMLTokenizer.markupDeclarationOpenStatec             C   s   | j  }|dkr| j| _n|dkrN| jtd dd | jd  d7  < n|dkr| jtd d	d | j| j | j| _nP|t	kr| jtd d
d | j| j | j| _n| jd  |7  < | j
| _dS )Nr|   r[   r"   zinvalid-codepoint)r#   r$   r$   u   �rj   zincorrect-commentzeof-in-commentT)r   r6   commentStartDashStater   r%   r7   r   r   r   r   commentState)r   r$   r    r    r!   r     s(    






zHTMLTokenizer.commentStartStatec             C   s   | j  }|dkr| j| _n|dkrN| jtd dd | jd  d7  < n|dkr| jtd d	d | j| j | j| _nT|t	kr| jtd d
d | j| j | j| _n| jd  d| 7  < | j
| _dS )Nr|   r[   r"   zinvalid-codepoint)r#   r$   r$   u   -�rj   zincorrect-commentzeof-in-commentT)r   r6   commentEndStater   r%   r7   r   r   r   r   r   )r   r$   r    r    r!   r     s(    






z#HTMLTokenizer.commentStartDashStatec             C   s   | j  }|dkr| j| _n|dkrN| jtd dd | jd  d7  < nT|tkr| jtd dd | j| j | j	| _n| jd  || j 
d	 7  < d
S )Nr|   r[   r"   zinvalid-codepoint)r#   r$   r$   u   �zeof-in-comment)r|   r[   T)r   r6   commentEndDashStater   r%   r7   r   r   r   r   r^   )r   r$   r    r    r!   r     s    




zHTMLTokenizer.commentStatec             C   s   | j  }|dkr| j| _n|dkrV| jtd dd | jd  d7  < | j| _nT|t	kr| jtd dd | j| j | j
| _n| jd  d| 7  < | j| _d	S )
Nr|   r[   r"   zinvalid-codepoint)r#   r$   r$   u   -�zeof-in-comment-end-dashT)r   r6   r   r   r%   r7   r   r   r   r   r   )r   r$   r    r    r!   r     s     





z!HTMLTokenizer.commentEndDashStatec             C   s,  | j  }|dkr*| j| j | j| _n|dkrd| jtd dd | jd  d7  < | j| _n|dkr| jtd d	d | j	| _n|d
kr| jtd dd | jd  |7  < nj|t
kr| jtd dd | j| j | j| _n4| jtd dd | jd  d| 7  < | j| _dS )Nrj   r[   r"   zinvalid-codepoint)r#   r$   r$   u   --�rh   z,unexpected-bang-after-double-dash-in-commentr|   z,unexpected-dash-after-double-dash-in-commentzeof-in-comment-double-dashzunexpected-char-in-commentz--T)r   r6   r%   r7   r   r   r   r   r   commentEndBangStater   )r   r$   r    r    r!   r     s6    









zHTMLTokenizer.commentEndStatec             C   s   | j  }|dkr*| j| j | j| _n|dkrN| jd  d7  < | j| _n|dkr| jtd dd | jd  d	7  < | j	| _nT|t
kr| jtd d
d | j| j | j| _n| jd  d| 7  < | j	| _dS )Nrj   r|   r$   z--!r[   r"   zinvalid-codepoint)r#   r$   u   --!�zeof-in-comment-end-bang-stateT)r   r6   r%   r7   r   r   r   r   r   r   r   )r   r$   r    r    r!   r     s(    






z!HTMLTokenizer.commentEndBangStatec             C   s   | j  }|tkr| j| _nj|tkr\| jtd dd d| j	d< | j| j	 | j
| _n*| jtd dd | j | | j| _dS )Nr"   z!expected-doctype-name-but-got-eof)r#   r$   Fr   zneed-space-after-doctypeT)r   r6   r   beforeDoctypeNameStater   r   r%   r7   r   r   r   r=   )r   r$   r    r    r!   r     s    





zHTMLTokenizer.doctypeStatec             C   s   | j  }|tkrn|dkrT| jtd dd d| jd< | j| j | j| _n|dkr| jtd dd d	| jd
< | j	| _nR|t
kr| jtd dd d| jd< | j| j | j| _n|| jd
< | j	| _dS )Nrj   r"   z+expected-doctype-name-but-got-right-bracket)r#   r$   Fr   r[   zinvalid-codepointu   �rU   z!expected-doctype-name-but-got-eofT)r   r6   r   r%   r7   r   r   r   r   doctypeNameStater   )r   r$   r    r    r!   r     s.    










z$HTMLTokenizer.beforeDoctypeNameStatec             C   s  | j  }|tkr2| jd t| jd< | j| _n|dkrh| jd t| jd< | j	| j | j
| _n|dkr| j	td dd | jd  d7  < | j| _nh|tkr| j	td dd d	| jd
< | jd t| jd< | j	| j | j
| _n| jd  |7  < dS )NrU   rj   r[   r"   zinvalid-codepoint)r#   r$   u   �zeof-in-doctype-nameFr   T)r   r6   r   r   rX   r   afterDoctypeNameStater   r%   r7   r   r   r   r   )r   r$   r    r    r!   r   6  s,    







zHTMLTokenizer.doctypeNameStatec             C   sL  | j  }|tkrn2|dkr8| j| j | j| _n|tkrd| jd< | j 	| | jt
d dd | j| j | j| _n|dkrd}x$d	D ]}| j  }||krd}P qW |r| j| _dS nF|d
krd}x$dD ]}| j  }||krd}P qW |r| j| _dS | j 	| | jt
d dd|id d| jd< | j| _dS )Nrj   Fr   r"   zeof-in-doctype)r#   r$   )r   r   T))uU)bB)lL)iI)r@   r   )sS))r   r   )r   r   )r   r   )r   r   )mMz*expected-space-or-right-bracket-in-doctyper$   )r#   r$   r.   )r   r6   r   r%   r7   r   r   r   r   r=   r   afterDoctypePublicKeywordStateafterDoctypeSystemKeywordStatebogusDoctypeState)r   r$   r   r   r    r    r!   r   O  sP    









z#HTMLTokenizer.afterDoctypeNameStatec             C   s   | j  }|tkr| j| _n|dkrP| jtd dd | j | | j| _nT|t	kr| jtd dd d| j
d< | j| j
 | j| _n| j | | j| _dS )	N)r   r   r"   zunexpected-char-in-doctype)r#   r$   zeof-in-doctypeFr   T)r   r6   r   "beforeDoctypePublicIdentifierStater   r%   r7   r   r=   r   r   r   )r   r$   r    r    r!   r     s"    






z,HTMLTokenizer.afterDoctypePublicKeywordStatec             C   s   | j  }|tkrn|dkr0d| jd< | j| _n|dkrLd| jd< | j| _n|dkr| jt	d dd d	| jd
< | j| j | j
| _nh|tkr| jt	d dd d	| jd
< | j| j | j
| _n(| jt	d dd d	| jd
< | j| _dS )Nr   r,   r   r   rj   r"   zunexpected-end-of-doctype)r#   r$   Fr   zeof-in-doctypezunexpected-char-in-doctypeT)r   r6   r   r   (doctypePublicIdentifierDoubleQuotedStater   (doctypePublicIdentifierSingleQuotedStater%   r7   r   r   r   r   )r   r$   r    r    r!   r     s4    












z0HTMLTokenizer.beforeDoctypePublicIdentifierStatec             C   s   | j  }|dkr| j| _n|dkrN| jtd dd | jd  d7  < n|dkr| jtd d	d d
| jd< | j| j | j| _nR|t	kr| jtd dd d
| jd< | j| j | j| _n| jd  |7  < dS )Nr   r[   r"   zinvalid-codepoint)r#   r$   r   u   �rj   zunexpected-end-of-doctypeFr   zeof-in-doctypeT)
r   r6   !afterDoctypePublicIdentifierStater   r%   r7   r   r   r   r   )r   r$   r    r    r!   r     s*    








z6HTMLTokenizer.doctypePublicIdentifierDoubleQuotedStatec             C   s   | j  }|dkr| j| _n|dkrN| jtd dd | jd  d7  < n|dkr| jtd d	d d
| jd< | j| j | j| _nR|t	kr| jtd dd d
| jd< | j| j | j| _n| jd  |7  < dS )Nr   r[   r"   zinvalid-codepoint)r#   r$   r   u   �rj   zunexpected-end-of-doctypeFr   zeof-in-doctypeT)
r   r6   r   r   r%   r7   r   r   r   r   )r   r$   r    r    r!   r     s*    








z6HTMLTokenizer.doctypePublicIdentifierSingleQuotedStatec             C   s  | j  }|tkr| j| _n|dkr<| j| j | j| _n|dkrn| jt	d dd d| jd< | j
| _n|dkr| jt	d dd d| jd< | j| _nh|tkr| jt	d d	d d
| jd< | j| j | j| _n(| jt	d dd d
| jd< | j| _dS )Nrj   r   r"   zunexpected-char-in-doctype)r#   r$   r,   r   r   zeof-in-doctypeFr   T)r   r6   r   -betweenDoctypePublicAndSystemIdentifiersStater   r%   r7   r   r   r   (doctypeSystemIdentifierDoubleQuotedState(doctypeSystemIdentifierSingleQuotedStater   r   )r   r$   r    r    r!   r     s6    













z/HTMLTokenizer.afterDoctypePublicIdentifierStatec             C   s   | j  }|tkrn|dkr4| j| j | j| _n|dkrPd| jd< | j| _n|dkrld| jd< | j	| _nh|t
kr| jtd dd d	| jd
< | j| j | j| _n(| jtd dd d	| jd
< | j| _dS )Nrj   r   r,   r   r   r"   zeof-in-doctype)r#   r$   Fr   zunexpected-char-in-doctypeT)r   r6   r   r%   r7   r   r   r   r   r   r   r   r   )r   r$   r    r    r!   r     s.    










z;HTMLTokenizer.betweenDoctypePublicAndSystemIdentifiersStatec             C   s   | j  }|tkr| j| _n|dkrP| jtd dd | j | | j| _nT|t	kr| jtd dd d| j
d< | j| j
 | j| _n| j | | j| _dS )	N)r   r   r"   zunexpected-char-in-doctype)r#   r$   zeof-in-doctypeFr   T)r   r6   r   "beforeDoctypeSystemIdentifierStater   r%   r7   r   r=   r   r   r   )r   r$   r    r    r!   r     s"    






z,HTMLTokenizer.afterDoctypeSystemKeywordStatec             C   s   | j  }|tkrn|dkr0d| jd< | j| _n|dkrLd| jd< | j| _n|dkr| jt	d dd d	| jd
< | j| j | j
| _nh|tkr| jt	d dd d	| jd
< | j| j | j
| _n(| jt	d dd d	| jd
< | j| _dS )Nr   r,   r   r   rj   r"   zunexpected-char-in-doctype)r#   r$   Fr   zeof-in-doctypeT)r   r6   r   r   r   r   r   r%   r7   r   r   r   r   )r   r$   r    r    r!   r   /  s4    












z0HTMLTokenizer.beforeDoctypeSystemIdentifierStatec             C   s   | j  }|dkr| j| _n|dkrN| jtd dd | jd  d7  < n|dkr| jtd d	d d
| jd< | j| j | j| _nR|t	kr| jtd dd d
| jd< | j| j | j| _n| jd  |7  < dS )Nr   r[   r"   zinvalid-codepoint)r#   r$   r   u   �rj   zunexpected-end-of-doctypeFr   zeof-in-doctypeT)
r   r6   !afterDoctypeSystemIdentifierStater   r%   r7   r   r   r   r   )r   r$   r    r    r!   r   L  s*    








z6HTMLTokenizer.doctypeSystemIdentifierDoubleQuotedStatec             C   s   | j  }|dkr| j| _n|dkrN| jtd dd | jd  d7  < n|dkr| jtd d	d d
| jd< | j| j | j| _nR|t	kr| jtd dd d
| jd< | j| j | j| _n| jd  |7  < dS )Nr   r[   r"   zinvalid-codepoint)r#   r$   r   u   �rj   zunexpected-end-of-doctypeFr   zeof-in-doctypeT)
r   r6   r   r   r%   r7   r   r   r   r   )r   r$   r    r    r!   r   d  s*    








z6HTMLTokenizer.doctypeSystemIdentifierSingleQuotedStatec             C   s   | j  }|tkrn~|dkr4| j| j | j| _n^|tkrt| jt	d dd d| jd< | j| j | j| _n| jt	d dd | j
| _dS )	Nrj   r"   zeof-in-doctype)r#   r$   Fr   zunexpected-char-in-doctypeT)r   r6   r   r%   r7   r   r   r   r   r   r   )r   r$   r    r    r!   r   |  s     





z/HTMLTokenizer.afterDoctypeSystemIdentifierStatec             C   sZ   | j  }|dkr*| j| j | j| _n,|tkrV| j | | j| j | j| _n dS )Nrj   T)	r   r6   r%   r7   r   r   r   r   r=   )r   r$   r    r    r!   r     s    


zHTMLTokenizer.bogusDoctypeStatec             C   s   g }x| | jd | | jd | j }|tkr@P q|dksLt|d dd  dkrx|d d d |d< P q| | qW d|}|d}|dkrx&t|D ]}| j	 t
d	 d
d qW |dd}|r| j	 t
d |d | j| _dS )N]rj   rF   z]]r,   r[   r   r"   zinvalid-codepoint)r#   r$   u   �rK   T)r7   r   r^   r6   r   AssertionErrorr9   countranger%   r   r   r   r   )r   r$   r6   Z	nullCountr   r    r    r!   r     s0    



zHTMLTokenizer.cdataSectionState)N)NF)N__name__
__module____qualname____doc__r   r)   rB   rS   rT   rZ   r   r\   rb   r`   rd   rf   rg   r]   rm   rn   ra   rs   rt   rc   rw   rx   re   ry   r{   rz   r}   r   r   r~   r   r   r   r   r   r   r   r   r   rp   r   r   r   r   r   r   r   rq   ro   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__r    r    )r   r!   r      s   H
P#

6 "-3r   N)Z
__future__r   r   r   Zsixr   r;   collectionsr   Z	constantsr   r	   r
   r   r   r   r   r   r   r   Z_inputstreamr   Z_trier   rL   objectr   r    r    r    r!   <module>   s   