B
    ;4\	F              p   @   sH  d Z ddlmZ ddlZddlZddlZddlmZmZ ddlm	Z	 ddl
mZmZ ddl
mZ ddlmZ dd	lmZ ddlmZ dd
lmZ ddlmZ ddlmZ ddlmZ e	jZeeZee	j d e	j d e	j d gZ!e	j d Z"e	j d Z#dddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~ddddgpZ$G dd de%Z&G dd deZ'G dd deZ(dd Z)dd Z*dd Z+e,dZ-dd Z.G dd deZ/dS )z
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
    )unicode_literalsN)
HTMLParsergetTreeWalker)	constants)
namespacesprefixes)_ReparseException)Filter)allowed_protocols)HTMLInputStream)HTMLSerializer)HTMLTokenizer)TrieZStartTagZEndTagZEmptyTagZ
CharactersZ
ParseErroraZabbrZaddressZareaZarticleZasideZaudiobbaseZbdiZbdoZ
blockquoteZbodybrZbuttonZcanvasZcaptionZcitecodecolZcolgroupdataZdatalistZdddelZdetailsZdfnZdialogdivZdlZdtZemZembedZfieldsetZ
figcaptionZfigureZfooterZformZh1Zh2Zh3Zh4Zh5Zh6headheaderZhgroupZhrZhtmliZiframeZimginputZinsZkbdZkeygenZlabelZlegendZlilinkmapZmarkZmenumetaZmeterZnavZnoscriptobjectZolZoptgroupZoptionoutputpZparamZpictureZpreZprogressqZrpZrtZrubysZsampZscriptZsectionZselectZslotZsmallsourcespanZstrongZstylesubZsummaryZsuptableZtbodyZtdtemplateZtextareaZtfootZthZtheadZtimetitleZtrZtrackuZulvarZvideoZwbrc               @   sf   e Zd ZdZdd Zedd Zedd Zedd	 Zd
d Z	dddZ
dd Zdd Zdd ZdS )InputStreamWithMemoryzWraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    c             C   s$   || _ | j j| _| j j| _g | _d S )N)_inner_streamresetZposition_buffer)selfZinner_stream r1   3lib/python3.7/site-packages/bleach/html5lib_shim.py__init__   s    

zInputStreamWithMemory.__init__c             C   s   | j jS )N)r-   errors)r0   r1   r1   r2   r4      s    zInputStreamWithMemory.errorsc             C   s   | j jS )N)r-   charEncoding)r0   r1   r1   r2   r5      s    z"InputStreamWithMemory.charEncodingc             C   s   | j jS )N)r-   changeEncoding)r0   r1   r1   r2   r6      s    z$InputStreamWithMemory.changeEncodingc             C   s   | j  }|r| j| |S )N)r-   charr/   append)r0   cr1   r1   r2   r7      s    
zInputStreamWithMemory.charFc             C   s$   | j j||d}| jt| |S )N)opposite)r-   
charsUntilr/   extendlist)r0   Z
charactersr:   charsr1   r1   r2   r;      s    z InputStreamWithMemory.charsUntilc             C   s   | j r| j d | j|S )N)r/   popr-   unget)r0   r7   r1   r1   r2   rA      s    zInputStreamWithMemory.ungetc             C   s   t d| jS )zReturns the stream history since last '<'

        Since the buffer starts at the last '<' as as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

         )sixZ	text_typejoinr/   )r0   r1   r1   r2   get_tag   s    zInputStreamWithMemory.get_tagc             C   s   dg| _ dS )zResets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        <N)r/   )r0   r1   r1   r2   	start_tag   s    zInputStreamWithMemory.start_tagN)F)__name__
__module____qualname____doc__r3   propertyr4   r5   r6   r7   r;   rA   rE   rG   r1   r1   r1   r2   r,      s   

r,   c                   sT   e Zd ZdZd fdd	Z fddZd fdd		Z fd
dZ fddZ  Z	S )BleachHTMLTokenizerz1Tokenizer that doesn't consume character entitiesFc                s(   t t| jf | || _t| j| _d S )N)superrM   r3   consume_entitiesr,   stream)r0   rO   kwargs)	__class__r1   r2   r3      s    zBleachHTMLTokenizer.__init__c             #   s   d }xt t|  D ]}|d k	r|d dkrd|d tkrd|drddd |d D |d< d }|V  q|d dkr|d   | jjkr| j	
 |d< t|d< d }|V  q|d tkr|V  |}q|V  |V  d }q|d tkr|}q|V  qW |r|V  d S )Nr   z#invalid-character-in-attribute-nametypec             S   s4   g | ],}d |d krd|d krd|d kr|qS )"r   'rF   r1   ).0itemr1   r1   r2   
<listcomp>   s    z0BleachHTMLTokenizer.__iter__.<locals>.<listcomp>z!expected-closing-tag-but-got-char)rN   rM   __iter__TAG_TOKEN_TYPESgetlowerstripparsertagsrP   rE   CHARACTERS_TYPEPARSEERROR_TYPE)r0   Zlast_error_tokentoken)rR   r1   r2   rY      s8    

zBleachHTMLTokenizer.__iter__Nc                sN   | j rtt| ||S |r8| jd d d  d7  < n| jtdd d S )Nr   r?      &)rS   r   )rO   rN   rM   consumeEntitycurrentToken
tokenQueuer8   r`   )r0   ZallowedCharZfromAttribute)rR   r1   r2   re   -  s
    z!BleachHTMLTokenizer.consumeEntityc                s   | j   tt|  S )N)rP   rG   rN   rM   tagOpenState)r0   )rR   r1   r2   rh   ?  s    
z BleachHTMLTokenizer.tagOpenStatec                s   | j }| jjd k	rr|d tkrr|d  | jjkrr| jjr@d}n
| j }t|d}|| _ | j	
| | j| _d S tt|   d S )NrS   namerB   )rS   r   )rf   r^   r_   rZ   r\   r]   rP   rE   r`   rg   r8   Z	dataStatestaterN   rM   emitCurrentToken)r0   rb   Znew_dataZ	new_token)rR   r1   r2   rk   G  s    
z$BleachHTMLTokenizer.emitCurrentToken)F)NF)
rH   rI   rJ   rK   r3   rY   re   rh   rk   __classcell__r1   r1   )rR   r2   rM      s   >rM   c                   s*   e Zd ZdZ fddZdddZ  ZS )	BleachHTMLParserz$Parser that uses BleachHTMLTokenizerc                s>   |dk	rdd |D nd| _ || _|| _tt| jf | dS )a  
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        Nc             S   s   g | ]}|  qS r1   )r\   )rV   tagr1   r1   r2   rX   v  s    z-BleachHTMLParser.__init__.<locals>.<listcomp>)r_   r]   rO   rN   rm   r3   )r0   r_   r]   rO   rQ   )rR   r1   r2   r3   l  s    
zBleachHTMLParser.__init__Fr   c             K   sj   || _ || _|| _tf || j| d|| _|   y|   W n$ tk
rd   |   |   Y nX d S )N)rP   rO   r^   )	ZinnerHTMLMode	container	scriptingrM   rO   Z	tokenizerr.   ZmainLoopReparseException)r0   rP   Z	innerHTMLro   rp   rQ   r1   r1   r2   _parse{  s    
zBleachHTMLParser._parse)Fr   F)rH   rI   rJ   rK   r3   rr   rl   r1   r1   )rR   r2   rm   j  s   rm   c             C   sT   | d dkrH| d dkr0t t| dd dS t t| dd dS t| dS )	a9  Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    r   #rc   )xX   N   
   )rC   ZunichrintENTITIESr[   )valuer1   r1   r2   convert_entity  s
    r|   c             C   s   d| kr| S g }xxt | D ]l}|s$q|dr|t|}|dk	r|t|}|dk	r||| |t|d d }|r|| q|| qW d|S )zConverts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    rd   Nrv   rB   )next_possible_entity
startswithmatch_entityr|   r8   lenrD   )textZnew_textpartentityZ	convertedZ	remainderr1   r1   r2   convert_entities  s$    


r   c             C   s   | d dkrt d| dd } t| } d}dtj }| r| d dkrd}| d | rx| d d	krxd
}|| d7 }nd}x0| r| d |kr| d}||krP ||7 }q~W |r| r| d dkr|S dS x2| r| d |kr| d}t|sP ||7 }qW |r| r| d dkr|S dS )aH  Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignoresambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    r   rd   zStream should begin with "&"rc   NrB   z<&=;rs   )rt   ru   Z0123456789abcdefABCDEF
0123456789;)
ValueErrorr=   stringZ
whitespacer@   ENTITIES_TRIEZhas_keys_with_prefix)rP   Zpossible_entityZend_charactersZallowedr9   r1   r1   r2   r     s:    




r   z(&)c             c   sF   x@t t| D ].\}}|dkr(|V  q|d dkrd| V  qW dS )zTakes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    r   rv   rd   N)	enumerateAMP_SPLIT_REsplit)r   r   r   r1   r1   r2   r}     s
    	r}   c                   s*   e Zd ZdZdd Zd fdd	Z  ZS )BleachHTMLSerializerz3HTMLSerializer that undoes & -> &amp; in attributesc             c   s   | dd}xxt|D ]l}|s q|drtt|}|dk	rtt|dk	rtd| d V  |t|d d }|r|V  q| ddV  qW dS )z,Escapes just bare & in HTML attribute valuesz&amp;rd   Nr   rv   )replacer}   r~   r   r|   r   )r0   stokenr   r   r1   r1   r2   escape_base_amp  s    
z$BleachHTMLSerializer.escape_base_ampNc             #   s   d}d}xt t| ||D ]l}|rt|dkr2d}n:|r`|dkrlx| |D ]
}|V  qJW d}qn|dkrld}|V  q|drd}|V  qW dS )zWrap HTMLSerializer.serialize and conver & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        F>rT   =TrF   N)rN   r   	serializer   r~   )r0   Z
treewalkerencodingZin_tagZafter_equalsr   r   )rR   r1   r2   r   3  s$    

zBleachHTMLSerializer.serialize)N)rH   rI   rJ   rK   r   r   rl   r1   r1   )rR   r2   r     s   r   )0rK   Z
__future__r   rer   rC   Zbleach._vendor.html5libr   r   r   Z!bleach._vendor.html5lib.constantsr   r   r   rq   Z$bleach._vendor.html5lib.filters.baser	   Z)bleach._vendor.html5lib.filters.sanitizerr
   ZSanitizerFilterZ$bleach._vendor.html5lib._inputstreamr   Z"bleach._vendor.html5lib.serializerr   Z"bleach._vendor.html5lib._tokenizerr   Zbleach._vendor.html5lib._trier   Zentitiesrz   r   setZ
tokenTypesrZ   r`   ra   Z	HTML_TAGSr   r,   rM   rm   r|   r   r   compiler   r}   r   r1   r1   r1   r2   <module>   s"  

> %#:
