B
    n&=[~              !   @   s  d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dlZd dlZddlmZmZmZmZ ddlmZ ddlmZ d d	lmZ yd d
lmZ W n ek
r   eZY nX edd eD Zedd eD Zedd eD ZeeddgB ZdZejrFed dkr"e ddks&t!e"edd e#d d Z$n
e"eZ$e%dddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4g Z&e"d5Z'i Z(G d6d7 d7e)Z*d8d9 Z+G d:d; d;e)Z,G d<d= d=e,Z-G d>d? d?e.Z/G d@dA dAe)Z0G dBdC dCe)Z1dDdE Z2dS )F    )absolute_importdivisionunicode_literals)	text_typebinary_type)http_clienturllibN   )EOFspaceCharactersasciiLettersasciiUppercase)_ReparseException)_utils)StringIO)BytesIOc             C   s   g | ]}| d qS )ascii)encode).0item r   4lib/python3.7/site-packages/html5lib/_inputstream.py
<listcomp>   s    r   c             C   s   g | ]}| d qS )r   )r   )r   r   r   r   r   r      s    c             C   s   g | ]}| d qS )r   )r   )r   r   r   r   r   r      s       >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿]]z"\uD800-\uDFFF"i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i i z[	- -/:-@\[-`{-~]c               @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )BufferedStreamzBuffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    c             C   s   || _ g | _ddg| _d S )Nr   r   )streambufferposition)selfr   r   r   r   __init__@   s    zBufferedStream.__init__c             C   s@   d}x(| j d | jd  D ]}|t|7 }qW || jd 7 }|S )Nr   r	   )r   r    len)r!   poschunkr   r   r   tellE   s
    zBufferedStream.tellc             C   sX   ||   kst|}d}x0t| j| |k rH|t| j| 8 }|d7 }qW ||g| _d S )Nr   r	   )_bufferedBytesAssertionErrorr#   r   r    )r!   r$   offsetir   r   r   seekL   s    zBufferedStream.seekc             C   sT   | j s| |S | jd t| j krF| jd t| j d krF| |S | |S d S )Nr   r	   r   )r   _readStreamr    r#   _readFromBuffer)r!   bytesr   r   r   readU   s    

zBufferedStream.readc             C   s   t dd | jD S )Nc             S   s   g | ]}t |qS r   )r#   )r   r   r   r   r   r   _   s    z1BufferedStream._bufferedBytes.<locals>.<listcomp>)sumr   )r!   r   r   r   r'   ^   s    zBufferedStream._bufferedBytesc             C   s<   | j |}| j| | jd  d7  < t|| jd< |S )Nr   r	   )r   r/   r   appendr    r#   )r!   r.   datar   r   r   r,   a   s
    zBufferedStream._readStreamc             C   s   |}g }| j d }| j d }x|t| jk r|dkr|dks@t| j| }|t|| krn|}||| g| _ n"t|| }|t|g| _ |d7 }|||||   ||8 }d}qW |r|| | d|S )Nr   r	       )r    r#   r   r(   r1   r,   join)r!   r.   ZremainingBytesrvZbufferIndexZbufferOffsetZbufferedDataZbytesToReadr   r   r   r-   h   s&    


zBufferedStream._readFromBufferN)__name__
__module____qualname____doc__r"   r&   r+   r/   r'   r,   r-   r   r   r   r   r   9   s   		r   c             K   s   t | tjs(t | tjjr.t | jtjr.d}n&t| drJt | dt	}n
t | t	}|rdd |D }|rvt
d| t| f|S t| f|S d S )NFr/   r   c             S   s   g | ]}| d r|qS )Z	_encoding)endswith)r   xr   r   r   r      s    z#HTMLInputStream.<locals>.<listcomp>z3Cannot set an encoding with a unicode input, set %r)
isinstancer   ZHTTPResponser   ZresponseZaddbasefphasattrr/   r   	TypeErrorHTMLUnicodeInputStreamHTMLBinaryInputStream)sourcekwargsZ	isUnicodeZ	encodingsr   r   r   HTMLInputStream   s    

rD   c               @   sp   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dddZdd Zdd ZdddZdd ZdS )r@   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    i (  c             C   sZ   t jsd| _ntddkr$| j| _n| j| _dg| _tddf| _| 	|| _
|   dS )a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        Nu   􏿿r	   r   zutf-8certain)r   supports_lone_surrogatesreportCharacterErrorsr#   characterErrorsUCS4characterErrorsUCS2ZnewLineslookupEncodingcharEncoding
openStream
dataStreamreset)r!   rB   r   r   r   r"      s    
zHTMLUnicodeInputStream.__init__c             C   s.   d| _ d| _d| _g | _d| _d| _d | _d S )N r   )r%   	chunkSizechunkOffseterrorsprevNumLinesprevNumCols_bufferedCharacter)r!   r   r   r   rN      s    zHTMLUnicodeInputStream.resetc             C   s   t |dr|}nt|}|S )zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r/   )r>   r   )r!   rB   r   r   r   r   rL      s    
z!HTMLUnicodeInputStream.openStreamc             C   sT   | j }|dd|}| j| }|dd|}|dkr@| j| }n||d  }||fS )N
r   r   r	   )r%   countrS   rfindrT   )r!   r)   r%   ZnLinesZpositionLineZlastLinePosZpositionColumnr   r   r   	_position   s    
z HTMLUnicodeInputStream._positionc             C   s   |  | j\}}|d |fS )z:Returns (line, col) of the current position in the stream.r	   )rY   rQ   )r!   linecolr   r   r   r       s    zHTMLUnicodeInputStream.positionc             C   s6   | j | jkr|  stS | j }| j| }|d | _ |S )zo Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        r	   )rQ   rP   	readChunkr
   r%   )r!   rQ   charr   r   r   r]      s    

zHTMLUnicodeInputStream.charNc             C   s   |d kr| j }| | j\| _| _d| _d| _d| _| j|}| j	rX| j	| }d | _	n|s`dS t
|dkrt|d }|dksd|  krdkrn n|d | _	|d d }| jr| | |d	d
}|dd
}|| _t
|| _dS )NrO   r   Fr	   r      i   i  z
rV   T)_defaultChunkSizerY   rP   rS   rT   r%   rQ   rM   r/   rU   r#   ordrG   replace)r!   rP   r2   Zlastvr   r   r   r\      s0    
 


z HTMLUnicodeInputStream.readChunkc             C   s,   x&t tt|D ]}| jd qW d S )Nzinvalid-codepoint)ranger#   invalid_unicode_refindallrR   r1   )r!   r2   _r   r   r   rH   %  s    z*HTMLUnicodeInputStream.characterErrorsUCS4c             C   s   d}xt |D ]}|rqt| }| }t|||d  rtt|||d  }|tkrn| j	
d d}q|dkr|dkr|t|d kr| j	
d qd}| j	
d qW d S )NF   zinvalid-codepointTi   i  r	   )rd   finditerra   groupstartr   ZisSurrogatePairZsurrogatePairToCodepointnon_bmp_invalid_codepointsrR   r1   r#   )r!   r2   skipmatchZ	codepointr$   Zchar_valr   r   r   rI   )  s     z*HTMLUnicodeInputStream.characterErrorsUCS2Fc       
      C   s  yt ||f }W nl tk
r|   x|D ]}t|dk s&tq&W ddd |D }|s^d| }td|  }t ||f< Y nX g }x||| j| j	}|dkr| j	| j
krP n0| }|| j
kr|| j| j	|  || _	P || j| j	d  |  sP qW d|}	|	S )z Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
           rO   c             S   s   g | ]}d t | qS )z\x%02x)ra   )r   cr   r   r   r   N  s    z5HTMLUnicodeInputStream.charsUntil.<locals>.<listcomp>z^%sz[%s]+N)charsUntilRegExKeyErrorra   r(   r4   recompilerm   r%   rQ   rP   endr1   r\   )
r!   Z
charactersZoppositecharsro   Zregexr5   mrt   rr   r   r   
charsUntil@  s2    
 

z!HTMLUnicodeInputStream.charsUntilc             C   sT   |d k	rP| j dkr.|| j | _|  jd7  _n"|  j d8  _ | j| j  |ksPtd S )Nr   r	   )rQ   r%   rP   r(   )r!   r]   r   r   r   ungeto  s    
zHTMLUnicodeInputStream.unget)N)F)r6   r7   r8   r9   r`   r"   rN   rL   rY   r    r]   r\   rH   rI   rx   ry   r   r   r   r   r@      s    
&
/r@   c               @   sL   e Zd ZdZdddZdd Zd	d
 ZdddZdd Zdd Z	dd Z
dS )rA   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    Nwindows-1252Tc             C   sn   |  || _t| | j d| _d| _|| _|| _|| _|| _	|| _
| || _| jd dk	sbt|   dS )a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        i   d   r   N)rL   	rawStreamr@   r"   numBytesMetanumBytesChardetoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermineEncodingrK   r(   rN   )r!   rB   r   r   r   r   r   Z
useChardetr   r   r   r"     s    zHTMLBinaryInputStream.__init__c             C   s&   | j d j| jd| _t|  d S )Nr   rb   )rK   Z
codec_infostreamreaderr|   rM   r@   rN   )r!   r   r   r   rN     s    zHTMLBinaryInputStream.resetc             C   sD   t |dr|}nt|}y||  W n   t|}Y nX |S )zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r/   )r>   r   r+   r&   r   )r!   rB   r   r   r   r   rL     s    
z HTMLBinaryInputStream.openStreamc             C   s  |   df}|d d k	r|S t| jdf}|d d k	r:|S t| jdf}|d d k	rX|S |  df}|d d k	rt|S t| jdf}|d d k	r|d jds|S t| jdf}|d d k	r|S |rryddl	m
} W n tk
r   Y nX g }| }xF|js<| j| j}t|tst|s&P || || qW |  t|jd }| jd |d k	rr|dfS t| jdf}|d d k	r|S tddfS )NrE   r   Z	tentativezutf-16)UniversalDetectorencodingzwindows-1252)	detectBOMrJ   r   r   detectEncodingMetar   name
startswithr   Zchardet.universaldetectorr   ImportErrorZdoner|   r/   r~   r<   r.   r(   r1   Zfeedcloseresultr+   r   )r!   ZchardetrK   r   ZbuffersZdetectorr   r   r   r   r   r     sR    


z'HTMLBinaryInputStream.determineEncodingc             C   s   | j d dkstt|}|d kr&d S |jdkrFtd}|d k	stnT|| j d krf| j d df| _ n4| jd |df| _ |   td| j d |f d S )Nr	   rE   )zutf-16bezutf-16lezutf-8r   zEncoding changed from %s to %s)rK   r(   rJ   r   r|   r+   rN   r   )r!   ZnewEncodingr   r   r   changeEncoding  s    

z$HTMLBinaryInputStream.changeEncodingc          
   C   s   t jdt jdt jdt jdt jdi}| jd}t|t	s<t
||dd }d}|s~||}d}|s~||dd	 }d	}|r| j| t|S | jd
 dS dS )zAttempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return Nonezutf-8zutf-16lezutf-16bezutf-32lezutf-32be   N   rg   r   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr|   r/   r<   r.   r(   getr+   rJ   )r!   ZbomDictstringr   r+   r   r   r   r     s$    
zHTMLBinaryInputStream.detectBOMc             C   sV   | j | j}t|tstt|}| j d | }|dk	rR|j	dkrRt
d}|S )z9Report the encoding declared by the meta element
        r   N)zutf-16bezutf-16lezutf-8)r|   r/   r}   r<   r.   r(   EncodingParserr+   getEncodingr   rJ   )r!   r   parserr   r   r   r   r   9  s    z(HTMLBinaryInputStream.detectEncodingMeta)NNNNrz   T)T)r6   r7   r8   r9   r"   rN   rL   r   r   r   r   r   r   r   r   rA     s     
(
>"rA   c               @   s   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zeee
Zdd ZeeZefddZdd Zdd Zdd ZdS )EncodingByteszString-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raisedc             C   s   t |tstt| | S )N)r<   r.   r(   __new__lower)r!   valuer   r   r   r   L  s    zEncodingBytes.__new__c             C   s
   d| _ d S )Nr   )rY   )r!   r   r   r   r   r"   P  s    zEncodingBytes.__init__c             C   s   | S )Nr   )r!   r   r   r   __iter__T  s    zEncodingBytes.__iter__c             C   s>   | j d  }| _ |t| kr"tn|dk r.t| ||d  S )Nr	   r   )rY   r#   StopIterationr?   )r!   pr   r   r   __next__W  s    zEncodingBytes.__next__c             C   s   |   S )N)r   )r!   r   r   r   next_  s    zEncodingBytes.nextc             C   sB   | j }|t| krtn|dk r$t|d  | _ }| ||d  S )Nr   r	   )rY   r#   r   r?   )r!   r   r   r   r   previousc  s    zEncodingBytes.previousc             C   s   | j t| krt|| _ d S )N)rY   r#   r   )r!   r    r   r   r   setPositionl  s    zEncodingBytes.setPositionc             C   s*   | j t| krt| j dkr"| j S d S d S )Nr   )rY   r#   r   )r!   r   r   r   getPositionq  s
    
zEncodingBytes.getPositionc             C   s   | | j | j d  S )Nr	   )r    )r!   r   r   r   getCurrentByte{  s    zEncodingBytes.getCurrentBytec             C   sL   | j }x:|t| k r@| ||d  }||kr6|| _|S |d7 }qW || _dS )zSkip past a list of charactersr	   N)r    r#   rY   )r!   ru   r   ro   r   r   r   rl     s    zEncodingBytes.skipc             C   sL   | j }x:|t| k r@| ||d  }||kr6|| _|S |d7 }qW || _d S )Nr	   )r    r#   rY   )r!   ru   r   ro   r   r   r   	skipUntil  s    zEncodingBytes.skipUntilc             C   s>   | j }| ||t|  }||}|r:|  j t|7  _ |S )zLook for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone)r    r#   r   )r!   r.   r   r2   r5   r   r   r   
matchBytes  s    
zEncodingBytes.matchBytesc             C   sR   | | j d |}|dkrJ| jdkr,d| _|  j|t| d 7  _dS tdS )zLook for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the matchNr   r   r	   T)r    findrY   r#   r   )r!   r.   ZnewPositionr   r   r   jumpTo  s    
zEncodingBytes.jumpToN)r6   r7   r8   r9   r   r"   r   r   r   r   r   r   propertyr    r   currentBytespaceCharactersBytesrl   r   r   r   r   r   r   r   r   H  s    	
r   c               @   sX   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )r   z?Mini parser for detecting character encoding from meta elementsc             C   s   t || _d| _dS )z3string - the data to work on for encoding detectionN)r   r2   r   )r!   r2   r   r   r   r"     s    
zEncodingParser.__init__c          
   C   s   d| j fd| jfd| jfd| jfd| jfd| jff}x^| jD ]T}d}xD|D ]<\}}| j|rJy| }P W qJ tk
r   d}P Y qJX qJW |s<P q<W | jS )	Ns   <!--s   <metas   </s   <!s   <?r   TF)	handleComment
handleMetahandlePossibleEndTaghandleOtherhandlePossibleStartTagr2   r   r   r   )r!   ZmethodDispatchrf   ZkeepParsingkeymethodr   r   r   r     s&    zEncodingParser.getEncodingc             C   s   | j dS )zSkip over commentss   -->)r2   r   )r!   r   r   r   r     s    zEncodingParser.handleCommentc             C   s   | j jtkrdS d}d }x|  }|d kr.dS |d dkr^|d dk}|r|d k	r|| _dS q|d dkr|d }t|}|d k	r|| _dS q|d dkrtt|d }| }|d k	rt|}|d k	r|r|| _dS |}qW d S )	NTFr   s
   http-equivr	   s   content-types   charsets   content)	r2   r   r   getAttributer   rJ   ContentAttrParserr   parse)r!   Z	hasPragmaZpendingEncodingattrZtentativeEncodingcodecZcontentParserr   r   r   r     s:    zEncodingParser.handleMetac             C   s
   |  dS )NF)handlePossibleTag)r!   r   r   r   r     s    z%EncodingParser.handlePossibleStartTagc             C   s   t | j | dS )NT)r   r2   r   )r!   r   r   r   r     s    
z#EncodingParser.handlePossibleEndTagc             C   sf   | j }|jtkr(|r$|  |   dS |t}|dkrD|  n|  }x|d k	r`|  }qNW dS )NTr   )r2   r   asciiLettersBytesr   r   r   spacesAngleBracketsr   )r!   ZendTagr2   ro   r   r   r   r   r     s    



z EncodingParser.handlePossibleTagc             C   s   | j dS )Nr   )r2   r   )r!   r   r   r   r     s    zEncodingParser.handleOtherc             C   s   | j }|ttdgB }|dks2t|dks2t|dkr>dS g }g }xt|dkrX|rXP nX|tkrl| }P nD|dkrd|dfS |tkr||	  n|dkrdS || t
|}qHW |dkr|  d|dfS t
| | }|dkrR|}xt
|}||kr(t
| d|d|fS |tkrB||	  q|| qW nJ|d	krjd|dfS |tkr||	  n|dkrdS || x^t
|}|tkrd|d|fS |tkr||	  n|dkrdS || qW dS )
z_Return a name,value pair for the next attribute in the stream,
        if one is found, or None   /Nr	   )r   N   =)r   r   r3   )   '   "r   )r2   rl   r   	frozensetr#   r(   r4   asciiUppercaseBytesr1   r   r   r   r   )r!   r2   ro   ZattrNameZ	attrValueZ	quoteCharr   r   r   r     sh    










zEncodingParser.getAttributeN)r6   r7   r8   r9   r"   r   r   r   r   r   r   r   r   r   r   r   r   r     s   $r   c               @   s   e Zd Zdd Zdd ZdS )r   c             C   s   t |tst|| _d S )N)r<   r.   r(   r2   )r!   r2   r   r   r   r"   f  s    zContentAttrParser.__init__c             C   s  y| j d | j  jd7  _| j   | j jdks8d S | j  jd7  _| j   | j jdkr| j j}| j  jd7  _| j j}| j |r| j || j j S d S nF| j j}y| j t | j || j j S  tk
r   | j |d  S X W n tk
r    d S X d S )Ns   charsetr	   r   )r   r   )r2   r   r    rl   r   r   r   r   )r!   Z	quoteMarkZoldPositionr   r   r   r   j  s.    

zContentAttrParser.parseN)r6   r7   r8   r"   r   r   r   r   r   r   e  s   r   c             C   s`   t | tr.y| d} W n tk
r,   dS X | dk	rXy
t| S  tk
rT   dS X ndS dS )z{Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.r   N)r<   r   decodeUnicodeDecodeErrorwebencodingslookupAttributeError)r   r   r   r   rJ     s    

rJ   )3Z
__future__r   r   r   Zsixr   r   Z	six.movesr   r   r   rr   r   Z	constantsr
   r   r   r   r   rO   r   ior   r   r   r   r   r   r   r   Zinvalid_unicode_no_surrogaterF   rW   r(   rs   evalrd   setrk   Zascii_punctuation_rerp   objectr   rD   r@   rA   r.   r   r   r   rJ   r   r   r   r   <module>   sX   
"








J g Ih 6'