ó
\K]c           @` sV  d  Z  d d l m Z m Z m Z d d l Z d d l Z d d5 Z d d6 Z d d7 Z	 d	 d8 Z
 d
 d9 Z d e f d „  ƒ  YZ d e f d „  ƒ  YZ i d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6e d 6d d 6d d  6d! d" 6e d# 6Z e Z i d d 6d d 6d$ d 6d d 6d% d 6d& d 6d d 6d d 6e d 6d& d 6d' d  6d! d" 6e d# 6Z e j ƒ  Z e j i d( d 6ƒ e j ƒ  Z e j i d) d 6e d# 6ƒ i d d 6d d 6d$ d 6d d 6d% d 6d& d 6d d 6d( d 6e	 d 6e d* 6d& d 6d' d  6d! d" 6e	 d# 6Z e j ƒ  Z e j i e
 d 6e
 d# 6ƒ e j ƒ  Z e j i d d 6ƒ e j ƒ  Z e j i d d  6ƒ e j ƒ  Z e j i e
 d 6e
 d# 6d' d  6ƒ e j ƒ  Z e j i d+ d 6d d 6d' d  6ƒ i e d: 6e d; 6e d< 6e d= 6e d> 6e d? 6e d@ 6e dA 6e dB 6e dC 6e dD 6Z d, d1 „ Z d, d2 „ Z d d3 „ Z  d4 „  Z! d S(E   s<   
- Parse jit compile info
- Compute warp occupany histogram
i    (   t   divisiont   absolute_importt   print_functionNi   i   i
   i0   i@   i`   ip   t   OccupancyThreadKeyc           B` sG   e  Z d  „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   c         C` s/   | \ |  _  |  _ |  j  d |  j f |  _ d  S(   Ni   (   t	   occupancyt   threadst
   comparison(   t   selft   item(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   __init__   s    c         C` s   |  j  | j  k  S(   N(   R   (   R   t   other(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   __lt__   s    c         C` s   |  j  | j  k S(   N(   R   (   R   R
   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   __eq__   s    c         C` s   |  j  | j  k S(   N(   R   (   R   R
   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   __ne__   s    c         C` s   |  j  | j  k S(   N(   R   (   R   R
   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   __gt__!   s    c         C` s   |  j  | j  k S(   N(   R   (   R   R
   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   __le__$   s    c         C` s   |  j  | j  k S(   N(   R   (   R   R
   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   __ge__'   s    (	   t   __name__t
   __module__R	   R   R   R   R   R   R   (    (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR      s   						t	   AutoTunerc           B` sh   e  Z d  Z d
 d d „ Z e d „  ƒ Z e d „  ƒ Z d „  Z d „  Z	 d „  Z
 d „  Z d	 „  Z RS(   s<   Autotune a kernel based upon the theoretical occupancy.
    i    c         C` sb   | |  _  | |  _ t d | d | ƒ |  _ t t t d „  |  j j ƒ  Dƒ d t	 ƒƒ ƒ |  _
 d  S(   Nt   infot   ccc         s` s'   |  ] \ } \ } } | | f Vq d  S(   N(    (   t   .0t   tpbt   occupt   factor(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pys	   <genexpr>2   s   t   key(   R   t   dynsmemt   warp_occupancyt   _tablet   listt   reversedt   sortedt   tablet   itemsR   t   _by_occupancy(   R   R   R   t   smem_configR   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR	   .   s    		c         C` s   |  j  S(   sl   A dict with thread-per-block as keys and tuple-2 of
        (occupency, limiting factor) as values.
        (   R   (   R   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR!   7   s    c         C` s   |  j  S(   s¾   A list of tuple-2 of (occupancy, thread-per-block) sorted in
        descending.

        The first item has the highest occupancy and the lowest number of
        thread-per-block.
        (   R#   (   R   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   by_occupancy>   s    c         C` s
   |  j  ƒ  S(   N(   t   max_occupancy_min_blocks(   R   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   bestH   s    c         C` s   |  j  d d S(   sý   Returns the thread-per-block that optimizes for
        maximum occupancy and minimum blocks.

        Maximum blocks allows for the best utilization of parallel execution
        because each block can be executed concurrently on different SM.
        i    i   (   R%   (   R   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR&   K   s    c         C` s:   t  |  j d } t | | ƒ } |  j j | d g ƒ d S(   s.   Find the occupancy of the closest tpb
        t   thread_per_warpi    (   t   PHYSICAL_LIMITSR   t   ceilR!   t   get(   R   R   t   warpsize(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   closestT   s    c         C` sX   t  |  j d } t t | | ƒ ƒ } t t | | ƒ ƒ } |  j t | | d | ƒ Œ  S(   s=   Returns the best tpb in the given range inclusively.
        R(   i   (   R)   R   t   intR*   t   floort   prefert   range(   R   t   mintpbt   maxtpbR,   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   best_within^   s    c         G` sj   g  } x? | D]7 } |  j  | ƒ } | d k r | j | | f ƒ q q W| rf t | d t ƒd d Sd S(   s¬   Prefer the thread-per-block with the highest warp occupancy
        and the lowest thread-per-block.

        May return None if all threads-per-blocks are invalid
        i    R   iÿÿÿÿi   N(   R-   t   appendR    R   (   R   t   tpblistt   binR   t   occ(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR0   f   s    N(   R   R   t   __doc__t   NoneR	   t   propertyR!   R%   R'   R&   R-   R4   R0   (    (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR   +   s   	
				
	i    R(   t   warp_per_smi   t   thread_per_smi   t   block_per_smi €  t	   registerst   reg_alloc_unitt   warpt   reg_alloc_grani?   t   reg_per_threadt   smem_per_smi€   t   smem_alloc_unitt   warp_alloc_grani   t   max_block_sizet   default_smem_configi   i   i   i   iÿ   i   t   smem_per_blocki   i   i   i   i   i   c         C` s   | t  j |  | ƒ S(   N(   t   mathR*   (   t   xt   s(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR*   å   s    c         C` s   | t  j |  | ƒ S(   N(   RJ   R/   (   RK   RL   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR/   é   s    c   	      C` sà   i  } y t  | } Wn4 t k
 rJ t d d j d „  | Dƒ ƒ ƒ ‚ n X| d k rd | d } n  | d } |  j } xb t | | d | ƒ D]J } t d | d |  j d	 |  j	 d
 | d | ƒ } | d rŽ | | | <qŽ qŽ W| S(   sµ   Returns a dictionary of {threadperblock: occupancy, factor}

    Only threadperblock of multiple of warpsize is used.
    Only threadperblock of non-zero occupancy is returned.
    s(   %s is not a supported compute capabilityt   .c         s` s   |  ] } t  | ƒ Vq d  S(   N(   t   str(   R   t   c(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pys	   <genexpr>ø   s    RH   R(   i   R   t   regt   smemR$   t   limitsi    N(
   R)   t   KeyErrort
   ValueErrort   joinR:   t
   maxthreadsR1   t   compute_warp_occupancyt   regst   shared(	   R   R   R$   t   retRR   R,   t
   max_threadR   t   result(    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyR   í   s&    !
				
c         C` sŸ  | d d k s t  d ƒ ‚ | d } | d } | d } | d } | d }	 t | d	 | ƒ }
 | d
 } | d } | d } t |  | ƒ } | } | } | } t | | ƒ } t | t | | ƒ ƒ } t |	 t | | | ƒ | ƒ } | | k rû d n | d k rt | | ƒ n | } | d k r9t |
 | ƒ n | } t | | | ƒ } | | k rfd } n | | k r{d } n d } | | } | | } | | f S(   NRB   RA   s+   assume warp register allocation granularityR>   R<   R(   RC   R?   RD   RE   R@   RF   i    t   warpsRX   RQ   (   t   AssertionErrort   minR*   R/   (   R   RP   RQ   R$   RR   t   limit_block_per_smt   limit_warp_per_smt   limit_thread_per_warpt   limit_reg_per_threadt   limit_total_regst   limit_total_smemt   my_smem_alloc_unitR@   RF   t   my_warp_per_blockt   my_reg_countt   my_reg_per_blockt   my_smemt   my_smem_per_blockt   limit_blocks_due_to_warpst   c39t   limit_blocks_due_to_regst   limit_blocks_due_to_smemt   active_block_per_smR   t   active_warps_per_smR   (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyRW   	  sL    	







				

i   i   i   i   i   (   i   i    (   i   i   (   i   i    (   i   i   (   i   i   (   i   i    (   i   i   (   i   i   (   i   i    (   i   i   (   i   i   ("   R9   t
   __future__R    R   R   RJ   t   ret   SMEM16Kt   SMEM48Kt   SMEM64Kt   SMEM96Kt   SMEM112Kt   objectR   R   t   LIMITS_CC_20t   LIMITS_CC_21t   LIMITS_CC_30t   copyt   LIMITS_CC_35t   updatet   LIMITS_CC_37t   LIMITS_CC_50t   LIMITS_CC_52t   LIMITS_CC_53t   LIMITS_CC_60t   LIMITS_CC_61t   LIMITS_CC_62R)   R*   R/   R:   R   RW   (    (    (    s:   lib/python2.7/site-packages/numba/cuda/cudadrv/autotune.pyt   <module>   sÂ   




P

		
					
