B
    ߜwa"                 @   s\   d Z ddlZddlZddlZdgZeddZG dd dZG dd dZ	G d	d
 d
Z
dS )a%   robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
    NRobotFileParserRequestRatezrequests secondsc               @   sj   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd ZdS )r   zs This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

     c             C   s,   g | _ d | _d| _d| _| | d| _d S )NFr   )entriesdefault_entrydisallow_all	allow_allset_urllast_checked)selfurl r   #lib/python3.7/urllib/robotparser.py__init__   s    
zRobotFileParser.__init__c             C   s   | j S )zReturns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        )r
   )r   r   r   r   mtime$   s    zRobotFileParser.mtimec             C   s   ddl }|  | _dS )zYSets the time the robots.txt file was last fetched to the
        current time.

        r   N)timer
   )r   r   r   r   r   modified-   s    zRobotFileParser.modifiedc             C   s&   || _ tj|dd \| _| _dS )z,Sets the URL referring to a robots.txt file.      N)r   urllibparseurlparseZhostpath)r   r   r   r   r   r	   5   s    zRobotFileParser.set_urlc          
   C   s   yt j| j}W nR t jjk
rd } z0|jdkr:d| _n|jdkrT|jdk rTd| _W dd}~X Y nX |	 }| 
|d  dS )z4Reads the robots.txt URL and feeds it to the parser.)i  i  Ti  i  Nzutf-8)r   ZrequestZurlopenr   errorZ	HTTPErrorcoder   r   readr   decode
splitlines)r   ferrrawr   r   r   r   :   s    
zRobotFileParser.readc             C   s,   d|j kr| jd kr(|| _n| j| d S )N*)
useragentsr   r   append)r   entryr   r   r   
_add_entryG   s    

zRobotFileParser._add_entryc             C   s6  d}t  }|   x|D ]}|sT|dkr8t  }d}n|dkrT| | t  }d}|d}|dkrr|d| }| }|sq|dd}t|dkr|d   |d< tj	
|d  |d< |d dkr |dkr| | t  }|j|d  d}q|d dkr4|dkr|jt|d d	 d}q|d d
krh|dkr|jt|d d d}q|d dkr|dkr|d   rt|d |_d}q|d dkr|dkr|d d}t|dkr|d   r|d   rtt|d t|d |_d}qW |dkr2| | dS )zParse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        r   r      #N:z
user-agentZdisallowFZallowTzcrawl-delayzrequest-rate/)Entryr   r%   findstripsplitlenlowerr   r   unquoter"   r#   	rulelinesRuleLineisdigitintdelayr   req_rate)r   linesstater$   lineiZnumbersr   r   r   r   P   sd    






 
zRobotFileParser.parsec             C   s   | j r
dS | jrdS | jsdS tjtj|}tjdd|j|j	|j
|jf}tj|}|sfd}x"| jD ]}||rn||S qnW | jr| j|S dS )z=using the parsed robots.txt decide if useragent can fetch urlFTr   r)   )r   r   r
   r   r   r   r0   
urlunparser   ZparamsZqueryZfragmentquoter   
applies_to	allowancer   )r   	useragentr   Z
parsed_urlr$   r   r   r   	can_fetch   s$    
zRobotFileParser.can_fetchc             C   s>   |   sd S x| jD ]}||r|jS qW | jr:| jjS d S )N)r   r   r=   r5   r   )r   r?   r$   r   r   r   crawl_delay   s    

zRobotFileParser.crawl_delayc             C   s>   |   sd S x| jD ]}||r|jS qW | jr:| jjS d S )N)r   r   r=   r6   r   )r   r?   r$   r   r   r   request_rate   s    

zRobotFileParser.request_ratec             C   s0   | j }| jd k	r|| jg }dtt|d S )N
)r   r   joinmapstr)r   r   r   r   r   __str__   s    
zRobotFileParser.__str__N)r   )__name__
__module____qualname____doc__r   r   r   r	   r   r%   r   r@   rA   rB   rG   r   r   r   r   r      s   
		C

c               @   s(   e Zd ZdZdd Zdd Zdd ZdS )	r2   zoA rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path.c             C   s<   |dkr|sd}t jt j|}t j|| _|| _d S )Nr   T)r   r   r;   r   r<   r   r>   )r   r   r>   r   r   r   r      s
    zRuleLine.__init__c             C   s   | j dkp|| j S )Nr!   )r   
startswith)r   filenamer   r   r   r=      s    zRuleLine.applies_toc             C   s   | j r
dndd | j S )NZAllowZDisallowz: )r>   r   )r   r   r   r   rG      s    zRuleLine.__str__N)rH   rI   rJ   rK   r   r=   rG   r   r   r   r   r2      s   r2   c               @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )r*   z?An entry has one or more user-agents and zero or more rulelinesc             C   s   g | _ g | _d | _d | _d S )N)r"   r1   r5   r6   )r   r   r   r   r      s    zEntry.__init__c             C   s   g }x| j D ]}|d|  qW | jd k	r@|d| j  | jd k	rj| j}|d|j d|j  |tt| j	 |d d
|S )NzUser-agent: zCrawl-delay: zRequest-rate: r)   r   rC   )r"   r#   r5   r6   ZrequestsZsecondsextendrE   rF   r1   rD   )r   ZretagentZrater   r   r   rG      s    


zEntry.__str__c             C   sF   | dd  }x.| jD ]$}|dkr*dS | }||krdS qW dS )z2check if this entry applies to the specified agentr)   r   r!   TF)r-   r/   r"   )r   r?   rO   r   r   r   r=      s    zEntry.applies_toc             C   s$   x| j D ]}||r|jS qW dS )zZPreconditions:
        - our agent applies to this entry
        - filename is URL decodedT)r1   r=   r>   )r   rM   r9   r   r   r   r>      s    

zEntry.allowanceN)rH   rI   rJ   rK   r   rG   r=   r>   r   r   r   r   r*      s
   r*   )rK   collectionsZurllib.parser   Zurllib.request__all__
namedtupler   r   r2   r*   r   r   r   r   <module>   s    6