B
     \                 @   s:   d dl mZ d dlmZ d dl mZ d dlZdddZdS )    )cuda)driver)numpy_supportNc                s   t | dd}|sJ| j\}}| jj| | jjf}tjjj||f|| j|d}t	| j t
 j}ttdt|dd }t|| }||d ftj fdd}	t|jd | d t|jd | d f}
||f}|	|
||f | | |S )a  Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    streamr   )dtyper         c       	         s   t jj d}t jj}t jj}t jjt jj }t jjt jj }|| }|| }|| | jd k r|| | jd k r| || || f |||f< t 	  ||jd k r||jd k r|||f |||f< d S )N)shaper   r   r   )
r   ZsharedZarrayZ	threadIdxxyZblockIdxZblockDimr	   Zsyncthreads)	inputoutputZtileZtxZtyZbxZbyr
   r   )dt
tile_shape ;lib/python3.7/site-packages/numba/cuda/kernels/transpose.pykernel(   s    $ztranspose.<locals>.kernel)getattrr	   r   itemsizer   ZcudadrvZdevicearrayZDeviceNDArraynpsZ
from_dtyper   Z
get_deviceZMAX_THREADS_PER_BLOCKintmathpowlogZjit)abr   ZcolsZrowsstridesZtpbZ
tile_widthZtile_heightr   ZblocksZthreadsr   )r   r   r   	transpose   s&    

,r   )N)Znumbar   Znumba.cuda.cudadrv.driverr   r   r   r   r   r   r   r   r   <module>   s   