B
    03\Q                 @   s  d dl mZ d dlmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZ dd	d
dddddddddgZddgdgdgdZg ZdddgZddd eed dedded d!D Zed"e d# ejZd$ZG d%d& d&eZd'd( ZG d)d* d*ejZdS )+    )unicode_literals)chainN)urlparse)unescape)html5lib_shim)alphabetize_attributesforce_unicodeaabbracronymbZ
blockquotecodeZemiZliZolZstrongZulhreftitle)r	   r
   r   httpZhttpsZmailto c             C   s   g | ]}t |qS  )chr).0cr   r   /lib/python3.7/site-packages/bleach/sanitizer.py
<listcomp>-   s    r   	                []?c               @   s0   e Zd ZdZeeeedddfddZdd Z	dS )	Cleanera  Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to use to transform content to be used in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!


    FTNc             C   sn   || _ || _|| _|| _|| _|| _|p*g | _tj| j | jddd| _	t
d| _tjddddddd| _dS )a  Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        F)tagsstripZconsume_entitiesZnamespaceHTMLElementsZetreealwaysT)Zquote_attr_valuesZomit_optional_tagsZescape_lt_in_attrsZresolve_entitiesZsanitizeZalphabetical_attributesN)r"   
attributesstyles	protocolsr#   strip_commentsfiltersr   ZBleachHTMLParserparserZgetTreeWalkerwalkerZBleachHTMLSerializer
serializer)selfr"   r%   r&   r'   r#   r(   r)   r   r   r   __init__W   s(    

zCleaner.__init__c          
   C   s   t |tjs$dj|jjd}t||s,dS t|}| j	|}t
| || j| j| j| j| j| jg d}x| jD ]}||d}qrW | j|S )zCleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        z9argument cannot be of '{name}' type, must be of text type)namer   )sourcer%   strip_disallowed_elementsstrip_html_commentsallowed_elementsallowed_css_propertiesallowed_protocolsallowed_svg_properties)r0   )
isinstancesixZstring_typesformat	__class____name__	TypeErrorr   r*   ZparseFragmentBleachSanitizerFilterr+   r%   r#   r(   r"   r&   r'   r)   r,   Zrender)r-   textmessageZdomZfilteredZfilter_classr   r   r   clean   s(    
zCleaner.clean)
r;   
__module____qualname____doc__ALLOWED_TAGSALLOWED_ATTRIBUTESALLOWED_STYLESALLOWED_PROTOCOLSr.   r@   r   r   r   r   r!   :   s
   <r!   c                sL   t  r S t tr& fdd}|S t tr@ fdd}|S tddS )a0  Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    c                s`   |  kr0 |  }t |r$|| ||S ||kr0dS d kr\ d }t |rT|| ||S ||kS dS )NT*F)callable)tagattrvalueZattr_val)r%   r   r   _attr_filter   s    z.attribute_filter_factory.<locals>._attr_filterc                s   | kS )Nr   )rJ   rK   rL   )r%   r   r   rM      s    z3attributes needs to be a callable, a list or a dictN)rI   r7   dictlist
ValueError)r%   rM   r   )r%   r   attribute_filter_factory   s    

rQ   c                   sp   e Zd ZdZeddf fdd	Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Z  ZS )r=   zmhtml5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    FTc                s*   t || _|| _|| _tt| j|f|S )a   Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        )rQ   attr_filterr1   r2   superr=   r.   )r-   r0   r%   r1   r2   kwargs)r:   r   r   r.      s    
zBleachSanitizerFilter.__init__c             c   sH   xB|D ]:}|  |}|sqt|tr:x|D ]
}|V  q*W q|V  qW d S )N)sanitize_tokenr7   rO   )r-   token_iteratortokenZretZsubtokenr   r   r   sanitize_stream  s    



z%BleachSanitizerFilter.sanitize_streamc             c   s   g }xn|D ]f}|rR|d dkr,| | q
qjddd |D dd}g }|V  n|d dkrj| | q
|V  q
W ddd |D dd}|V  dS )	z/Merge consecutive Characters tokens in a streamtype
Charactersr   c             S   s   g | ]}|d  qS )datar   )r   
char_tokenr   r   r   r   (  s    z:BleachSanitizerFilter.merge_characters.<locals>.<listcomp>)r[   rY   c             S   s   g | ]}|d  qS )r[   r   )r   r\   r   r   r   r   5  s    N)appendjoin)r-   rV   Zcharacters_bufferrW   Z	new_tokenr   r   r   merge_characters  s"    



z&BleachSanitizerFilter.merge_charactersc             C   s   |  | tj| S )N)r_   rX   r   ZFilter__iter__)r-   r   r   r   r`   :  s    zBleachSanitizerFilter.__iter__c             C   s   |d }|dkrV|d | j kr(| |S | jr2dS d|krJt|d |d< | |S n.|dkrn| jsh|S dS n|dkr| |S |S dS )a  Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name and
        value. It should return true of false.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        rY   )StartTagEndTagEmptyTagr/   Nr[   CommentrZ   )r3   allow_tokenr1   r   disallowed_tokenr2   sanitize_characters)r-   rW   
token_typer   r   r   rU   =  s     

z$BleachSanitizerFilter.sanitize_tokenc             C   s   | dd}|s|S tt|}||d< d|kr4|S g }xt|D ]}|sNqD|drt|}|dk	r|dkr|ddd n|d|d	 |t	|d
 d }|rD|d|d qD|d|d qDW |S )a  Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        r[   r   &NZamprZ   )rY   r[   ZEntity)rY   r/      )
getINVISIBLE_CHARACTERS_REsubINVISIBLE_REPLACEMENT_CHARr   Znext_possible_entity
startswithZmatch_entityr]   len)r-   rW   r[   Z
new_tokenspartZentityZ	remainderr   r   r   rg   h  s.    

z)BleachSanitizerFilter.sanitize_charactersc             C   s   t |}tdd|}|dd}| }yt|}W n tk
rL   dS X |jrd|j|kr|S n8|	drr|S d|kr|
dd |kr|S d|kr|S dS )	zChecks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        z[`\000-\040\177-\240\s]+r   u   �N#:r   r   )r   convert_entitiesrerm   replacelowerr   rP   Zschemero   split)r-   rL   r5   	new_valueZparsedr   r   r   sanitize_uri_value  s*    


z(BleachSanitizerFilter.sanitize_uri_valuec       	      C   s   d|kri }x|d   D ]\}}|\}}| |d ||s>q|| jkrd| || j}|dkr`q|}|| jkrtddt|}|	 }|sqn|}d|d f| j
kr|dtjd dfgkrtd	|rq|d
kr| |}|||< qW t||d< |S )z-Handles the case where we're allowing the tagr[   r/   Nzurl\s*\(\s*[^#\s][^)]+?\) )Nr   Zxlinkr   z
^\s*[^#\s])Nstyle)itemsrR   Zattr_val_is_urirz   r5   Zsvg_attr_val_allows_refru   rm   r   r#   Zsvg_allow_local_hrefr   Z
namespacessearchsanitize_cssr   )	r-   rW   attrsnamespaced_nameval	namespacer/   ry   Znew_valr   r   r   re     s:    



z!BleachSanitizerFilter.allow_tokenc             C   s  |d }|dkr"d|d  |d< n|d r|dks6t g }xj|d  D ]Z\\}}}|rf|sf|| }}|d ksx|tjkr~|}ndtj| |f }|d||f  qHW d	|d d
|f |d< nd|d  |d< |dr|d d d d |d< d|d< |d= |S )NrY   rb   z</%s>r/   r[   )ra   rc   z%s:%sz %s="%s"z<%s%s>r   z<%s>ZselfClosingz/>rZ   )AssertionErrorr}   r   prefixesr]   r^   rk   )r-   rW   rh   r   nsr/   vr   r   r   r   rf     s,    

z&BleachSanitizerFilter.disallowed_tokenc             C   s   t |}tdd|}|d}td}x|D ]}||s6dS q6W td|s\dS g }xhtd|D ]X\}}|s|qn| | j	kr|
|d | d  qn| | jkrn|
|d | d  qnW d|S )	zSanitizes css in style tagszurl\s*\(\s*[^\s)]+?\s*\)\s*r{   ;zI^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$r   z ^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$z([-\w]+)\s*:\s*([^:;]*)z: )r   rt   ru   compilerm   rx   matchfindallrw   r4   r]   r6   r^   )r-   r|   partsZgauntletrq   r@   ZproprL   r   r   r   r   F  s&    



z"BleachSanitizerFilter.sanitize_css)r;   rA   rB   rC   rE   r.   rX   r_   r`   rU   rg   rz   re   rf   r   __classcell__r   r   )r:   r   r=      s   +=;=)r=   )Z
__future__r   	itertoolsr   ru   r8   Zsix.moves.urllib.parser   Zxml.sax.saxutilsr   Zbleachr   Zbleach.utilsr   r   rD   rE   rF   rG   r^   rangeZINVISIBLE_CHARACTERSr   UNICODErl   rn   objectr!   rQ   ZSanitizerFilterr=   r   r   r   r   <module>   sB   

.
 )