ó
\K]c           @   sL   d  d l  m Z d  d l m Z d  d l  m Z d  d l Z d d „ Z d S(   iÿÿÿÿ(   t   cuda(   t   driver(   t   numpy_supportNc            sW  t  |  d d ƒ } | ss |  j \ } } |  j j | |  j j f } t j j j | | f | d |  j d | ƒ} n  t j	 |  j ƒ ‰  t
 j ƒ  j } t t j d t j | d ƒ d ƒ ƒ } t | | ƒ } | | d f ‰ t j ‡  ‡ f d †  ƒ }	 t | j d | d ƒ t | j d | d ƒ f }
 | | f } |	 |
 | | f |  | ƒ | S(   sá  Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    t   streami    t   dtypei   i   c   	         s  t  j j d ˆ d ˆ  ƒ } t  j j } t  j j } t  j j t  j j } t  j j t  j j } | | } | | } | | |  j d k  rÆ | | |  j d k  rÆ |  | | | | f | | | f <n  t  j	 ƒ  | | j d k  r| | j d k  r| | | f | | | f <n  d  S(   Nt   shapeR   i    i   (
   R    t   sharedt   arrayt	   threadIdxt   xt   yt   blockIdxt   blockDimR   t   syncthreads(	   t   inputt   outputt   tilet   txt   tyt   bxt   byR	   R
   (   t   dtt
   tile_shape(    s;   lib/python2.7/site-packages/numba/cuda/kernels/transpose.pyt   kernel(   s    

.%
&(   t   getattrR   R   t   itemsizeR    t   cudadrvt   devicearrayt   DeviceNDArrayt   npst
   from_dtypeR   t
   get_devicet   MAX_THREADS_PER_BLOCKt   intt   matht   powt   logt   jit(   t   at   bR   t   colst   rowst   stridest   tpbt
   tile_widtht   tile_heightR   t   blockst   threads(    (   R   R   s;   lib/python2.7/site-packages/numba/cuda/kernels/transpose.pyt	   transpose   s&    		(6(	   t   numbaR    t   numba.cuda.cudadrv.driverR   R   R   R"   t   NoneR0   (    (    (    s;   lib/python2.7/site-packages/numba/cuda/kernels/transpose.pyt   <module>   s   