B
     \P%                 @   s  d Z ddlmZmZmZ ddlZddlZdZdZdZ	dZ
dZG d	d
 d
eZG dd deZddddddddedddedZeZddddddddedddedZe Zeddi e Zeded  dddddddde	eddde	d!Ze Zee
e
d" e Zed#di e Zed$di e Zee
e
dd% e Zed&ddd' eeeeeeeeeeed(Zd2d*d+Zd3d,d-Zd4d.d/Zd0d1 Z dS )5z<
- Parse jit compile info
- Compute warp occupany histogram
    )divisionabsolute_importprint_functionNi @  i   i   i  i  c               @   sD   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dS )OccupancyThreadKeyc             C   s"   |\| _ | _| j d| j f| _d S )N   )	occupancyZthreads
comparison)selfitem r   :lib/python3.7/site-packages/numba/cuda/cudadrv/autotune.py__init__   s    zOccupancyThreadKey.__init__c             C   s   | j |j k S )N)r   )r	   otherr   r   r   __lt__   s    zOccupancyThreadKey.__lt__c             C   s   | j |j kS )N)r   )r	   r   r   r   r   __eq__   s    zOccupancyThreadKey.__eq__c             C   s   | j |j kS )N)r   )r	   r   r   r   r   __ne__   s    zOccupancyThreadKey.__ne__c             C   s   | j |j kS )N)r   )r	   r   r   r   r   __gt__!   s    zOccupancyThreadKey.__gt__c             C   s   | j |j kS )N)r   )r	   r   r   r   r   __le__$   s    zOccupancyThreadKey.__le__c             C   s   | j |j kS )N)r   )r	   r   r   r   r   __ge__'   s    zOccupancyThreadKey.__ge__N)
__name__
__module____qualname__r   r   r   r   r   r   r   r   r   r   r   r      s   r   c               @   sZ   e Zd ZdZdddZedd Zedd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )	AutoTunerz<Autotune a kernel based upon the theoretical occupancy.
    Nr   c             C   sD   || _ || _t||d| _tttdd | j D t	d| _
d S )N)infoccc             s   s   | ]\}\}}||fV  qd S )Nr   ).0tpbZoccupfactorr   r   r   	<genexpr>2   s   z%AutoTuner.__init__.<locals>.<genexpr>)key)r   dynsmemwarp_occupancy_tablelistreversedsortedtableitemsr   _by_occupancy)r	   r   r   smem_configr    r   r   r   r   .   s    zAutoTuner.__init__c             C   s   | j S )zlA dict with thread-per-block as keys and tuple-2 of
        (occupency, limiting factor) as values.
        )r"   )r	   r   r   r   r&   7   s    zAutoTuner.tablec             C   s   | j S )zA list of tuple-2 of (occupancy, thread-per-block) sorted in
        descending.

        The first item has the highest occupancy and the lowest number of
        thread-per-block.
        )r(   )r	   r   r   r   by_occupancy>   s    zAutoTuner.by_occupancyc             C   s   |   S )N)max_occupancy_min_blocks)r	   r   r   r   bestH   s    zAutoTuner.bestc             C   s   | j d d S )zReturns the thread-per-block that optimizes for
        maximum occupancy and minimum blocks.

        Maximum blocks allows for the best utilization of parallel execution
        because each block can be executed concurrently on different SM.
        r   r   )r*   )r	   r   r   r   r+   K   s    z"AutoTuner.max_occupancy_min_blocksc             C   s,   t | j d }t||}| j|dgd S )z.Find the occupancy of the closest tpb
        thread_per_warpr   )PHYSICAL_LIMITSr   ceilr&   get)r	   r   warpsizer   r   r   closestT   s    
zAutoTuner.closestc             C   s@   t | j d }tt||}tt||}| jt||d | S )z=Returns the best tpb in the given range inclusively.
        r-   r   )r.   r   intr/   floorpreferrange)r	   ZmintpbZmaxtpbr1   r   r   r   best_within^   s    zAutoTuner.best_withinc             G   sN   g }x,|D ]$}|  |}|dkr
|||f q
W |rJt|tdd d S dS )zPrefer the thread-per-block with the highest warp occupancy
        and the lowest thread-per-block.

        May return None if all threads-per-blocks are invalid
        r   )r   r   N)r2   appendr%   r   )r	   Ztpblistbinr   Zoccr   r   r   r5   f   s    

zAutoTuner.prefer)Nr   )r   r   r   __doc__r   propertyr&   r*   r,   r+   r2   r7   r5   r   r   r   r   r   +   s   
	
	
r       0   i      i   @   warp?         i   )r-   warp_per_smthread_per_smblock_per_sm	registersreg_alloc_unitreg_alloc_granreg_per_threadsmem_per_smsmem_alloc_unitwarp_alloc_granmax_block_sizedefault_smem_configi            rK      i   )rH   rP   )r-   rE   rF   rG   rH   rI   rJ   rK   rL   Zsmem_per_blockrM   rN   rO   rP   )rL   rP   rH   rN   )rL   rP   rN   i   )rF   rE   rN   ))rD   r   )rD   r   )   r   )rU      )rU      )rV   r   )rV   rD   )rV   rU   )   r   )rX   r   )rX   rD   r   c             C   s   |t | |  S )N)mathr/   )xsr   r   r   r/      s    r/   c             C   s   |t | |  S )N)rY   r4   )rZ   r[   r   r   r   r4      s    r4   c       	   	   C   s   i }yt | }W n0 tk
r@   tdddd |D  Y nX |dkrR|d }|d }| j}x>t||d |D ]*}t|| j| j||d	}|d
 rr|||< qrW |S )zReturns a dictionary of {threadperblock: occupancy, factor}

    Only threadperblock of multiple of warpsize is used.
    Only threadperblock of non-zero occupancy is returned.
    z(%s is not a supported compute capability.c             s   s   | ]}t |V  qd S )N)str)r   cr   r   r   r      s    z!warp_occupancy.<locals>.<genexpr>NrP   r-   r   )r   regsmemr)   limitsr   )	r.   KeyError
ValueErrorjoinZ
maxthreadsr6   compute_warp_occupancyregsZshared)	r   r   r)   Zretra   r1   Z
max_threadr   resultr   r   r   r!      s&    r!   c             C   s2  |d dkst d|d }|d }|d }|d }|d }	t|d	 |}
|d
 }|d }|d }t| | }|}|}|}t||}t|t|| }t|	t|| | |}||krdn|dkrt|| n|}|dkrt|
| n|}t|||}||krd}n||krd}nd}|| }|| }||fS )NrJ   rA   z+assume warp register allocation granularityrG   rE   r-   rK   rH   rL   rM   rI   rN   r   Zwarpsrf   r`   )AssertionErrorminr/   r4   )r   r_   r`   r)   ra   Zlimit_block_per_smZlimit_warp_per_smZlimit_thread_per_warpZlimit_reg_per_threadZlimit_total_regsZlimit_total_smemZmy_smem_alloc_unitrI   rN   Zmy_warp_per_blockZmy_reg_countZmy_reg_per_blockZmy_smemZmy_smem_per_blockZlimit_blocks_due_to_warpsZc39Zlimit_blocks_due_to_regsZlimit_blocks_due_to_smemZactive_block_per_smr   Zactive_warps_per_smr   r   r   r   re   	  sL    


re   )r   )r   )N)!r;   Z
__future__r   r   r   rY   reZSMEM16KZSMEM48KZSMEM64KZSMEM96KZSMEM112Kobjectr   r   ZLIMITS_CC_20ZLIMITS_CC_21ZLIMITS_CC_30copyZLIMITS_CC_35updateZLIMITS_CC_37ZLIMITS_CC_50ZLIMITS_CC_52ZLIMITS_CC_53ZLIMITS_CC_60ZLIMITS_CC_61ZLIMITS_CC_62r.   r/   r4   r!   re   r   r   r   r   <module>   s   Q









