B
     \\                 @   s~  d Z ddlmZmZmZ ddlZddlZddlZddlZddl	m
Z
 ddlmZ ddlZddl	Z	ddlmZ ddlmZ dd	l	mZmZmZ dd
lmZ yeeddZW n ek
r   dd ZY nX dd Zdd Zdd ZG dd deZG dd deZ edd Z!G dd deZ"G dd deZ#G dd deej$Z%d-dd Z&d.d!d"Z'd#d$ Z(d%Z)d&d' Z*d/d)d*Z+d+d, Z,dS )0z
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object.  If it exists and evaluate to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
    )print_functionabsolute_importdivisionN)six)c_void_p   )driver)devices)
dummyarraytypesnumpy_support)to_fixed_tuple	lru_cachec             C   s   | S )N )funcr   r   =lib/python3.7/site-packages/numba/cuda/cudadrv/devicearray.pyr      s    c             C   s   t | ddS )z$Check if an object is a CUDA ndarray__cuda_ndarray__F)getattr)objr   r   r   is_cuda_ndarray   s    r   c                sD   t    fdd}|dt |dt |dtj |dtj dS )z,Verify the CUDA ndarray interface for an objc                s6   t  | st| tt | |s2td| |f d S )Nz%s must be of type %s)hasattrAttributeError
isinstancer   )attrtyp)r   r   r   requires_attr(   s    
z4verify_cuda_ndarray_interface.<locals>.requires_attrshapestridesdtypesizeN)require_cuda_ndarraytuplenpr   r   integer_types)r   r   r   )r   r   verify_cuda_ndarray_interface$   s    

r$   c             C   s   t | stddS )z9Raises ValueError is is_cuda_ndarray(obj) evaluates Falsezrequire an cuda ndarray objectN)r   
ValueError)r   r   r   r   r    4   s    r    c               @   s   e Zd ZdZdZdZd%ddZedd Zd&d	d
Z	edd Z
d'ddZdd Zedd Zedd Zejd(ddZejd)ddZd*ddZd+ddZdd Zdd  Zd,d!d"Zd#d$ ZdS )-DeviceNDArrayBasez$A on GPU NDArray representation
    Tr   Nc             C   s
  t |tjr|f}t |tjr$|f}t|| _t|| jkrDtdtjd|||j	| _
t|| _t|| _t|| _tt| j| _| jdkr|dkrt| j| j| jj	| _t | j}qt|| _ntjt tddd}d| _|| _|| _|| _dS )aN  
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype.
        stream
            cuda stream.
        writeback
            Deprecated.
        gpu_data
            user provided device memory for the ndarray data buffer
        zstrides not match ndimr   N)contextZpointerr   )r   r   r#   lenndimr%   r
   ArrayZ	from_descitemsize_dummyr!   r   r   r"   r   intprodr   _driverZmemory_size_from_info
alloc_sizer	   get_contextZmemallocZdevice_memory_sizeZMemoryPointerr   gpu_data_DeviceNDArrayBase__writebackstream)selfr   r   r   r4   	writebackr2   r   r   r   __init__@   s4    





zDeviceNDArrayBase.__init__c             C   s(   t | jt | j| jjdf| jjddS )NFr   )r   r   dataZtypestrversion)r!   r   r   device_ctypes_pointervaluer   str)r5   r   r   r   __cuda_array_interface__t   s
    
z*DeviceNDArrayBase.__cuda_array_interface__c             C   s   t  | }||_|S )zBind a CUDA stream to this object so that all subsequent operation
        on this array defaults to the given stream.
        )copyr4   )r5   r4   Zcloner   r   r   bind~   s    
zDeviceNDArrayBase.bindc             C   s   |   S )N)	transpose)r5   r   r   r   T   s    zDeviceNDArrayBase.Tc             C   sx   |rt |t t| jkr| S | jdkr2tdnB|d k	r`t|tt| jkr`td|f nddlm} || S d S )N   z2transposing a non-2D DeviceNDArray isn't supportedzinvalid axes list %rr   )r@   )r!   ranger)   NotImplementedErrorsetr%   Znumba.cuda.kernels.transposer@   )r5   Zaxesr@   r   r   r   r@      s    

zDeviceNDArrayBase.transposec             C   s   |s
| j S |S )N)r4   )r5   r4   r   r   r   _default_stream   s    z!DeviceNDArrayBase._default_streamc             C   s   t | j}t|| jdS )zn
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        A)r   
from_dtyper   r   r*   r)   )r5   r   r   r   r   _numba_type_   s    zDeviceNDArrayBase._numba_type_c             C   s   | j dkrtdS | j jS dS )z:Returns the ctypes pointer to the GPU data buffer
        Nr   )r2   r   r:   )r5   r   r   r   r:      s    
z'DeviceNDArrayBase.device_ctypes_pointerc             C   s   |j dkrdS t|  | |}t| t| }}t|rdt| t|| tj| || j|d nFt	j
||jd rxdndd|jd  d	}t|| tj| || j|d dS )
zCopy `ary` to `self`.

        If `ary` is a CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a a host-to-device transfer.
        r   N)r4   C_CONTIGUOUSCFTZ	WRITEABLE)ordersubokr>   )r   sentry_contiguousrF   
array_corer/   is_device_memorycheck_array_compatibilityZdevice_to_devicer0   r"   arrayflagsZhost_to_device)r5   aryr4   Z	self_coreZary_corer   r   r   copy_to_device   s     




z DeviceNDArrayBase.copy_to_devicec             C   s   t dd | jD r(d}t|| j| jdks:td| |}|dkr`tj| jtj	d}nt
| | |}| jdkrtj|| | j|d |dkr| jdkrtj| j| j|d	}ntj| j| j| j|d
}|S )a^  Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part as the given stream.  Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        c             s   s   | ]}|d k V  qdS )r   Nr   ).0sr   r   r   	<genexpr>   s    z1DeviceNDArrayBase.copy_to_host.<locals>.<genexpr>z2D->H copy not implemented for negative strides: {}r   zNegative memory sizeN)r   r   )r4   )r   r   buffer)r   r   r   rZ   )anyr   rD   formatr0   AssertionErrorrF   r"   emptyZbyterR   r/   device_to_hostr   ndarrayr   r   )r5   rU   r4   msghostaryr   r   r   copy_to_host   s$    




zDeviceNDArrayBase.copy_to_hostc             C   s<   |  |}tdt | jd kr(td| j| j|d d S )Nz+to_host() is deprecated and will be removedzno associated writeback array)r4   )rF   warningswarnDeprecationWarningr3   r%   rc   )r5   r4   r   r   r   to_host   s    

zDeviceNDArrayBase.to_hostc             c   s   |  |}| jdkrtd| jd | jjkr6tdttt	| j
| }| j}| jj}x^t|D ]R}|| }t|| | j
}|| f}	| j|| || }
t|	|| j||
dV  qfW dS )zSplit the array into equal partition of the `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.
        r   zonly support 1d arrayr   zonly support unit stride)r   r4   r2   N)rF   r)   r%   r   r   r+   r-   mathZceilfloatr   rC   minr2   viewDeviceNDArray)r5   Zsectionr4   Znsectr   r+   iZbeginendr   r2   r   r   r   split   s    


zDeviceNDArrayBase.splitc             C   s   | j S )zEReturns a device memory object that is used as the argument.
        )r2   )r5   r   r   r   as_cuda_arg  s    zDeviceNDArrayBase.as_cuda_argc             C   s0   t  | j}t| j| j| jd}t||dS )z
        Returns a *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        )r   r   r   )
ipc_handle
array_desc)	r	   r1   get_ipc_handler2   dictr   r   r   IpcArrayHandle)r5   ZipchZdescr   r   r   rs     s    z DeviceNDArrayBase.get_ipc_handlec             C   s2   | j j|d\}}t|j|j| j| || jdS )a(  
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an axis with
            size greater than one is selected. If `None`, all axes with size one are
            removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        )axis)r   r   r   r4   r2   )r,   squeezerl   r   r   r   rF   r2   )r5   rv   r4   Z	new_dummy_r   r   r   rw   #  s    zDeviceNDArrayBase.squeezec             C   s:   t |}|j| jjkr tdt| j| j|| j| jdS )zeReturns a new object by reinterpretting the dtype without making a
        copy of the data.
        z new dtype itemsize doesn't match)r   r   r   r4   r2   )	r"   r   r+   	TypeErrorrl   r   r   r4   r2   )r5   r   r   r   r   rk   ?  s    
zDeviceNDArrayBase.view)r   NN)r   )N)r   )Nr   )r   )r   )Nr   )__name__
__module____qualname____doc__Z__cuda_memory__r   r7   propertyr=   r?   rA   r@   rF   rI   r:   r	   require_contextrV   rc   rg   ro   rp   rs   rw   rk   r   r   r   r   r&   :   s,    
3


		-


r&   c                   s:   e Zd ZdZd
 fdd	Zedd Zedd	 Z  ZS )DeviceRecordz
    An on-GPU record type
    r   Nc                s$   d}d}t t| ||||| d S )Nr   )superr   r7   )r5   r   r4   r2   r   r   )	__class__r   r   r7   S  s    zDeviceRecord.__init__c             C   s   t | jjS )z
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        )rt   r,   rT   )r5   r   r   r   rT   Y  s    zDeviceRecord.flagsc             C   s   t | jS )zn
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        )r   rH   r   )r5   r   r   r   rI   c  s    zDeviceRecord._numba_type_)r   N)	rz   r{   r|   r}   r7   r~   rT   rI   __classcell__r   r   )r   r   r   O  s   
r   c                s$   ddl m   j fdd}|S )z
    A separate method so we don't need to compile code every assignment (!).

    :param ndim: We need to have static array sizes for cuda.local.array, so
        bake in the number of dimensions into the kernel
    r   )cudac                s     d}d}x t| jD ]}|| j| 9 }qW ||kr<d S  jjdftjd}x`td ddD ]L}|| j|  |d|f< || j|  |j| dk |d|f< || j|  }qdW |t|d  | t|d < d S )Nr   rB   )r   r   r   )	ZgridrC   r)   r   ZlocalrS   r   int64r   )lhsrhslocation
n_elementsrm   idx)r   r)   r   r   kernelv  s    

$z_assign_kernel.<locals>.kernel)numbar   Zjit)r)   r   r   )r   r)   r   _assign_kernell  s    r   c               @   s   e Zd ZdZdd Zedd Zdd Zdd	d
Zdd Z	dd Z
d ddZejdd Zd!ddZd"ddZejdd Zd#ddZd$ddZdS )%rl   z
    An on-GPU array type
    c             C   s   | j jS )zA
        Return true if the array is Fortran-contiguous.
        )r,   Zis_f_contig)r5   r   r   r   is_f_contiguous  s    zDeviceNDArray.is_f_contiguousc             C   s   t | jjS )z
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        )rt   r,   rT   )r5   r   r   r   rT     s    zDeviceNDArray.flagsc             C   s   | j jS )z;
        Return true if the array is C-contiguous.
        )r,   Zis_c_contig)r5   r   r   r   is_c_contiguous  s    zDeviceNDArray.is_c_contiguousNc             C   s   |   |S )zE
        :return: an `numpy.ndarray`, so copies to the host.
        )rc   	__array__)r5   r   r   r   r   r     s    zDeviceNDArray.__array__c             C   s
   | j d S )Nr   )r   )r5   r   r   r   __len__  s    zDeviceNDArray.__len__c             O   s   t |dkr&t|d ttfr&|d }t| }|| jkrP|| j| j| j| jdS | j	j
||\}}|| j	jgkr||j|j| j| jdS tddS )z
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        r   r   )r   r   r   r2   zoperation requires copyingN)r(   r   r!   listtyper   r   r   r2   r,   reshapeextentrD   )r5   ZnewshapeZkwsclsnewarrextentsr   r   r   r     s    


zDeviceNDArray.reshaperK   r   c             C   sX   |  |}t| }| jj|d\}}|| jjgkrL||j|j| j| j|dS t	ddS )zr
        Flatten the array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`.
        )rM   )r   r   r   r2   r4   zoperation requires copyingN)
rF   r   r,   ravelr   r   r   r   r2   rD   )r5   rM   r4   r   r   r   r   r   r   r     s    

zDeviceNDArray.ravelc             C   s
   |  |S )N)_do_getitem)r5   itemr   r   r   __getitem__  s    zDeviceNDArray.__getitem__c             C   s   |  ||S )z0Do `__getitem__(item)` with CUDA stream
        )r   )r5   r   r4   r   r   r   getitem  s    zDeviceNDArray.getitemc             C   s   |  |}| j|}t| }t| }t|dkr| jj|d  }|j	szt
jd| jd}tj||| jj|d |d S ||j|j| j||dS n&| jj|j }||j|j| j||dS d S )Nr   r   )r   )dstsrcr   r4   )r   r   r   r2   r4   )rF   r,   r   r   Ziter_contiguous_extentr   r(   r2   rk   Zis_arrayr"   r^   r   r/   r_   r+   r   r   r   )r5   r   r4   arrr   r   newdatarb   r   r   r   r     s"    


zDeviceNDArray._do_getitemc             C   s   |  ||S )N)_do_setitem)r5   keyr;   r   r   r   __setitem__  s    zDeviceNDArray.__setitem__c             C   s   |  |||S )z6Do `__setitem__(key, value)` with CUDA stream
        )Z_so_getitem)r5   r   r;   r4   r   r   r   setitem  s    zDeviceNDArray.setitemc             C   s2  |  |}| j|}| jj|j }t|tjr@d}| j	j
f}n|j}|j}t| ||| j	||d}t||d\}	}
|	j|jkrtd|	j|jf tj|jtjd}|	j||	j d < |	j| }	xDtt|j|	jD ].\}\}}|dkr||krtd|||f qW t|j}t|jj||d||	 d S )N)r   )r   r   r   r2   r4   )r4   z$Can't assign %s-D array to %s-D self)r   r   zCCan't copy sequence with size %d to array axis %d with dimension %d)rF   r,   r   r2   rk   r   r   r
   ZElementr   r+   r   r   r   auto_devicer)   r%   r"   Zonesr   r   	enumeratezipr.   r   Zforall)r5   r   r;   r4   r   r   r   r   r   r   rx   Z	rhs_shaperm   lrr   r   r   r   r     s<    

 zDeviceNDArray._do_setitem)N)rK   r   )r   )r   )r   )r   )rz   r{   r|   r}   r   r~   rT   r   r   r   r   r   r	   r   r   r   r   r   r   r   r   r   r   r   rl     s   





rl   c               @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )ru   a"  
    An IPC array handle that can be serialized and transfer to another process
    in the same machine for share a GPU allocation.

    On the destination process, use the *.open()* method to creates a new
    *DeviceNDArray* object that shares the allocation from the original process.
    To release the resources, call the *.close()* method.  After that, the
    destination can no longer use the shared array object.  (Note: the
    underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    c             C   s   || _ || _d S )N)_array_desc_ipc_handle)r5   rq   rr   r   r   r   r7   I  s    zIpcArrayHandle.__init__c             C   s$   | j t }tf d|i| jS )z
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process.  Must not be used on the original process.
        r2   )r   openr	   r1   rl   r   )r5   Zdptrr   r   r   r   M  s    zIpcArrayHandle.openc             C   s   | j   dS )z5
        Closes the IPC handle to the array.
        N)r   close)r5   r   r   r   r   U  s    zIpcArrayHandle.closec             C   s   |   S )N)r   )r5   r   r   r   	__enter__[  s    zIpcArrayHandle.__enter__c             C   s   |    d S )N)r   )r5   r   r;   	tracebackr   r   r   __exit__^  s    zIpcArrayHandle.__exit__N)	rz   r{   r|   r}   r7   r   r   r   r   r   r   r   r   ru   6  s   ru   c               @   s   e Zd ZdZdddZdS )MappedNDArrayz4
    A host array that uses CUDA mapped memory.
    r   c             C   s
   || _ d S )N)r2   )r5   r2   r4   r   r   r   device_setupg  s    zMappedNDArray.device_setupN)r   )rz   r{   r|   r}   r   r   r   r   r   r   b  s   r   c             C   s.   | j dkr| d} t| j| j| j| ||dS )z/Create a DeviceNDArray object that is like ary.r   r   )r6   r4   r2   )r)   r   rl   r   r   r   )rU   r4   r2   r   r   r   from_array_likek  s    

r   c             C   s   t | j||dS )z.Create a DeviceRecord object that is like rec.)r4   r2   )r   r   )Zrecr4   r2   r   r   r   from_record_likes  s    r   c             C   sD   | j s
| S g }x(| j D ]}||dkr*dntd qW | t| S )aG  
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    r   N)r   appendslicer!   )rU   Z
core_indexZstrider   r   r   rP   x  s    
rP   zArray contains non-contiguous buffer and cannot be transferred as a single memory region. Please ensure contiguous buffer with numpy .ascontiguousarray()c             C   s(   t | }|jd s$|jd s$ttd S )NrJ   ZF_CONTIGUOUS)rP   rT   r%   errmsg_contiguous_buffer)rU   Zcorer   r   r   rO     s    rO   Tc             C   s   t | r| dfS t| dr,tj| dfS t| tjrFt	| |d}n$tj
| ddd} t|  t| |d}|r||j| |d |dfS dS )z
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    Fr=   )r4   T)r>   rN   N)r/   rQ   r   r   r   Zas_cuda_arrayr   r"   Zvoidr   rS   rO   r   rV   )r   r4   r>   Zdevobjr   r   r   r     s    

r   c             C   sv   |   |   }}| j|jkr2td| j|jf |j|jkrRtd| j|jf |j|jkrrtd| j|jf d S )Nzincompatible dtype: %s vs. %szincompatible shape: %s vs. %szincompatible strides: %s vs. %s)rw   r   ry   r   r%   r   )Zary1Zary2Zary1sqZary2sqr   r   r   rR     s    rR   )r   N)r   N)r   T)-r}   Z
__future__r   r   r   rd   rh   	functoolsr>   r   r   Zctypesr   Znumpyr"    r   r/   r	   r
   r   r   Znumba.unsafe.ndarrayr   r   r   r   r   r$   r    objectr&   r   r   rl   ru   r`   r   r   r   rP   r   rO   r   rR   r   r   r   r   <module>   sH     % &,	


