B
     \1                 @   sL  d Z ddlmZmZmZ ddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlmZ ddlmZmZmZmZmZmZmZmZ ddlZddlZddlmZmZ ddlmZmZ dd	l m!Z!m"Z" dd
l#m$Z$ ddl#m%Z% ddl&m'Z'm#Z#m(Z( ddlm)Z)m*Z* ddl+m,Z- e.ej/0ddZ1dZ2ej34dZ5dd Z6G dd de7Z8G dd de7Z9G dd de"Z:dd Z;dZ<dZ=dd Z>d d! Z?d"d# Z@d$d% ZAe@ ZBd&ZCG d'd( d(eDZEeE ZFd)d* ZGeG ZHG d+d, d,eDZId-d. ZJG d/d0 d0eDZKeK ZKG d1d2 d2eDZLed3d4ZMG d5d6 d6eDZNd7d8 ZOd9d: ZPd;d< ZQd=d> ZRd?d@ ZSdAdB ZTdCdD ZUG dEdF dFeDZVG dGdH dHeDZWG dIdJ dJeDZXG dKdL dLeDZYG dMdN dNeYZZG dOdP dPeZZ[G dQdR dRej\Z]G dSdT dTeDZ^G dUdV dVe^ej\Z_G dWdX dXeDZ`G dYdZ dZeDZad[d\ ZbG d]d^ d^eDZced_d`dadbdcddgZdG dedf dfeDZedgdh Zfe'jge'jhe'jie'jje'jkdiZlG djdk dkeDZmdldm Zndndo Zodpdq Zpdrds Zqdtdu Zrdvdw Zsdxdy Ztdd{d|Zud}d~ Zvdd Zwdd Zxdd Zydd Zzdd Z{dd Z|dd Z}dddZ~dddZdddZdddZdd Zdd Zejdd ZdS )a  
CUDA driver bridge implementation

NOTE:
The new driver implementation uses a *_PendingDeallocs* that help prevents a
crashing the system (particularly OSX) when the CUDA context is corrupted at
resource deallocation.  The old approach ties resource management directly
into the object destructor; thus, at corruption of the CUDA context,
subsequent deallocation could further corrupt the CUDA context and causes the
system to freeze in some cases.

    )absolute_importprint_functiondivisionN)product)c_intbyrefc_size_tc_charc_char_p	addressofc_void_pc_float)
namedtupledeque)utilsmviewbuf   )CudaSupportErrorCudaDriverError)API_PROTOTYPES)cu_occupancy_b2d_size)enumsdrvapi_extras)config	serialize)longintZNUMBAPRO_VERBOSE_CU_JIT_LOG)   r   Zlinuxc              C   s   t t} t| sttj }t	t |d }t
|ts>t j}| | tjr|t tj}d}|t j|d | | n| t   | S )Nz;== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s)fmt)loggingZ	getLogger__name__r   Zlogger_hasHandlersstrr   ZCUDA_LOG_LEVELuppergetattr
isinstanceintZCRITICALZsetLevelZStreamHandlersysstderrZsetFormatterZ	FormatterZ
addHandlerZNullHandler)ZloggerZlvlZhandlerr    r(   8lib/python3.7/site-packages/numba/cuda/cudadrv/driver.py_make_logger,   s    



r*   c               @   s   e Zd ZdS )DeadMemoryErrorN)r    
__module____qualname__r(   r(   r(   r)   r+   D   s   r+   c               @   s   e Zd ZdS )LinkerErrorN)r    r,   r-   r(   r(   r(   r)   r.   H   s   r.   c                   s$   e Zd Z fddZdd Z  ZS )CudaAPIErrorc                s"   || _ || _tt| || d S )N)codemsgsuperr/   __init__)selfr0   r1   )	__class__r(   r)   r3   M   s    zCudaAPIError.__init__c             C   s   d| j | jf S )Nz[%s] %s)r0   r1   )r4   r(   r(   r)   __str__R   s    zCudaAPIError.__str__)r    r,   r-   r3   r6   __classcell__r(   r(   )r5   r)   r/   L   s   r/   c              C   s|  t jdd } | dkrt  tjdkr:tj}dg}dg}n4tjdkrXtj}dg}dg}ntj}d	d
g}ddg}| d k	ryt j	
| } W n  tk
r   td|  Y nX t j	| std|  | g}n|dd t||D  }g }g }x`|D ]X}y||}W nB tk
r@ }	 z"|t j	|  ||	 W d d }	~	X Y qX |S qW t|r\t  nddd |D }
t|
 d S )NZNUMBAPRO_CUDA_DRIVER0Zwin32z\windows\system32z
nvcuda.dlldarwinz/usr/local/cuda/libzlibcuda.dylibz/usr/libz
/usr/lib64z
libcuda.sozlibcuda.so.1z+NUMBAPRO_CUDA_DRIVER %s is not a valid pathzrNUMBAPRO_CUDA_DRIVER %s is not a valid file path.  Note it must be a filepath of the .so/.dll/.dylib or the driverc             S   s   g | ]\}}t j||qS r(   )ospathjoin).0xyr(   r(   r)   
<listcomp>y   s   zfind_driver.<locals>.<listcomp>
c             s   s   | ]}t |V  qd S )N)r!   )r=   er(   r(   r)   	<genexpr>   s    zfind_driver.<locals>.<genexpr>)r:   environget_raise_driver_not_foundr&   platformctypesZWinDLLZCDLLr;   abspath
ValueErrorisfiler   OSErrorappendallr<   _raise_driver_error)ZenvpathZdlloaderZdldirZdlnamesZ
candidatesZpath_not_existZdriver_load_errorr;   ZdllrB   errmsgr(   r(   r)   find_driverV   sN    



rQ   z
CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBAPRO_CUDA_DRIVER
with the file path of the CUDA driver shared library.
zM
Possible CUDA driver libraries are found but error occurred during load:
%s
c               C   s   t td S )N)r   DRIVER_NOT_FOUND_MSGr(   r(   r(   r)   rF      s    rF   c             C   s   t t|  d S )N)r   DRIVER_LOAD_ERROR_MSG)rB   r(   r(   r)   rO      s    rO   c              C   s>   d} t  }x,ttD ] }|| rtt|}|||< qW |S )NZ
CUDA_ERROR)r   
UniqueDictdirr   
startswithr#   )prefixmapnamer0   r(   r(   r)   _build_reverse_error_map   s    

rZ   c               C   s   t  S )N)r:   getpidr(   r(   r(   r)   _getpid   s    r\   z9driver missing function: %s.
Requires CUDA 8.0 or above.
c               @   s   e Zd ZdZdZdd Zdd Zdd Zd	d
 Ze	dd Z
dd Zdd Zdd Zdd Zd ddZdd Zdd Zdd Zdd ZdS )!Driverz0
    Driver API functions are lazily bound.
    Nc             C   s&   | j }|d k	r|S t| }|| _ |S )N)
_singletonobject__new__)clsobjr(   r(   r)   r`      s    
zDriver.__new__c          
   C   sn   t  | _d| _d | _d | _ytjr0d}t|t	 | _
W n. tk
rh } zd| _|| _W d d }~X Y nX d S )NFzzCUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 in the environment, or because CUDA is unsupported on 32-bit systems.T)r   rT   devicesis_initializedinitialization_errorpidr   ZDISABLE_CUDAr   rQ   lib)r4   r1   rB   r(   r(   r)   r3      s    
zDriver.__init__c          
   C   sn   t  ad| _ytd | d W n4 tk
rX } z|| _td| W d d }~X Y n
X t | _	| 
  d S )NTZinitr   zError at driver init: 
%s:)r*   _loggerrd   infoZcuInitr/   re   r   r\   rf   _initialize_extras)r4   rB   r(   r(   r)   
initialize   s    
zDriver.initializec             C   sl   t d t}|tj}|| d t tt tj	t tj
t j}|tj}d|_| d|}|| _d S )NcuIpcOpenMemHandlecall_cuIpcOpenMemHandle)rH   Z	CFUNCTYPEr   r   set_cuIpcOpenMemHandle	_find_apir   ZPOINTERr   cu_device_ptrcu_ipc_mem_handleZc_uintrm   r    _wrap_api_callrl   )r4   Z	set_protorn   Z
call_protorm   	safe_callr(   r(   r)   rj      s    



zDriver._initialize_extrasc             C   s   | j s|   | jd kS )N)rd   rk   re   )r4   r(   r(   r)   is_available   s    zDriver.is_availablec             C   s   yt | }W n tk
r(   t|Y nX |d }|dd  }| jsL|   | jd k	rdtd| j | |}||_||_	| 
||}t| || |S )Nr   r   zError at driver init: 
%s:)r   KeyErrorAttributeErrorrd   rk   re   r   ro   restypeargtypesrr   setattr)r4   fnameprotorw   rx   libfnrs   r(   r(   r)   __getattr__  s"    


zDriver.__getattr__c                s   t  fdd}|S )Nc                 s&   t dj |  } | d S )Nzcall driver api: %s)rh   debugr    _check_error)argsretcode)rz   r|   r4   r(   r)   safe_cuda_api_call  s    z1Driver._wrap_api_call.<locals>.safe_cuda_api_call)	functoolswraps)r4   rz   r|   r   r(   )rz   r|   r4   r)   rr     s    zDriver._wrap_api_callc                sd   yt | j d S  tk
r$   Y nX yt | j S  tk
rF   Y nX  fdd}t|  | |S )NZ_v2c                 s   t t  d S )N)r   MISSING_FUNCTION_ERRMSG)r   Zkws)rz   r(   r)   absent_function4  s    z)Driver._find_api.<locals>.absent_function)r#   rg   rv   ry   )r4   rz   r   r(   )rz   r)   ro   %  s    zDriver._find_apic             C   sx   |t jkrtt|d}d||f }t| |t jkrj| jd k	rjt | jkrjd}t	|t | j t
dt||d S )NZUNKNOWN_CUDA_ERRORzCall to %s results in %sz0pid %s forked from pid %s after CUDA driver initzCUDA initialized before forking)r   ZCUDA_SUCCESS	ERROR_MAPrE   rh   errorZCUDA_ERROR_NOT_INITIALIZEDrf   r\   Zcriticalr   r/   )r4   rz   r   Zerrnamer1   r(   r(   r)   r   :  s    


zDriver._check_errorr   c             C   s0   | j |}|d kr&t|}|| j |< t|S )N)rc   rE   Deviceweakrefproxy)r4   devnumdevr(   r(   r)   
get_deviceG  s
    
zDriver.get_devicec             C   s   t  }| t| |jS )N)r   ZcuDeviceGetCountr   value)r4   countr(   r(   r)   get_device_countN  s    zDriver.get_device_countc             C   s   t | j S )z)Returns a list of active devices
        )listrc   values)r4   r(   r(   r)   list_devicesS  s    zDriver.list_devicesc             C   s    x| j  D ]}|  qW dS )zReset all devices
        N)rc   r   reset)r4   r   r(   r(   r)   r   X  s    zDriver.resetc             C   s&   t d}tt| |js"dS |S )zqGet current active context in CUDA driver runtime.
        Note: Lowlevel calls that returns the handle.
        r   N)r   
cu_contextdriverZcuCtxGetCurrentr   r   )r4   handler(   r(   r)   get_context^  s
    
zDriver.get_context)r   )r    r,   r-   __doc__r^   r`   r3   rk   rj   propertyrt   r}   rr   ro   r   r   r   r   r   r   r(   r(   r(   r)   r]      s    	
r]   c              C   sF   d} t  }x4ttD ](}|| rtt|||t| d  < qW |S )NZCU_DEVICE_ATTRIBUTE_)r   rT   rU   r   rV   r#   len)rW   rX   rY   r(   r(   r)   _build_reverse_device_attrsl  s    
r   c               @   sx   e Zd ZdZedd Zdd Zdd Zedd	 Z	d
d Z
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )r   z
    The device object owns the CUDA contexts.  This is owned by the driver
    object.  User should not construct devices directly.
    c             C   sF   x@t t D ]}t|}| |kr|S qW d|}t|dS )zgCreate Device object from device identity created by
        ``Device.get_device_identity()``.
        zKNo device of {} is found. Target device may not be visible in this process.N)ranger   r   r   get_device_identityformatRuntimeError)r4   ZidentityZdeviddrP   r(   r(   r)   from_identity}  s    
zDevice.from_identityc             C   s   t  }tt|| ||jks(td|j| _i | _t  }t  }tt|t|| j |j|jf| _	d}t
|  }t||| j |j| _d | _d S )NzDriver returned another device   )r   r   ZcuDeviceGetr   r   AssertionErroridZ
attributesZcuDeviceComputeCapabilitycompute_capabilityr	   ZcuDeviceGetNamerY   primary_context)r4   r   Z
got_devnumZcc_majorZcc_minorZbufszZbufr(   r(   r)   r3     s    
zDevice.__init__c             C   s   | j | j| jdS )N)Zpci_domain_idZ
pci_bus_idZpci_device_id)ZPCI_DOMAIN_IDZ
PCI_BUS_IDZPCI_DEVICE_ID)r4   r(   r(   r)   r     s    zDevice.get_device_identityc             C   s   t dt | jS )z,
        For backward compatibility
        zADeprecated attribute 'COMPUTE_CAPABILITY'; use lower case version)warningswarnDeprecationWarningr   )r4   r(   r(   r)   COMPUTE_CAPABILITY  s    zDevice.COMPUTE_CAPABILITYc             C   s   d| j | jf S )Nz<CUDA device %d '%s'>)r   rY   )r4   r(   r(   r)   __repr__  s    zDevice.__repr__c             C   sX   yt | }W n tk
r(   t|Y nX t }tt||| j t| ||j	 |j	S )zRead attributes lazily
        )
DEVICE_ATTRIBUTESru   rv   r   r   ZcuDeviceGetAttributer   r   ry   r   )r4   attrr0   r   r(   r(   r)   r}     s    zDevice.__getattr__c             C   s
   t | jS )N)hashr   )r4   r(   r(   r)   __hash__  s    zDevice.__hash__c             C   s   t |tr| j|jkS dS )NF)r$   r   r   )r4   otherr(   r(   r)   __eq__  s    
zDevice.__eq__c             C   s
   | |k S )Nr(   )r4   r   r(   r(   r)   __ne__  s    zDevice.__ne__c             C   sL   | j dk	r| j S t|  t }tt|| j tt	
| |}|| _ |S )zo
        Returns the primary context for the device.
        Note: it is not pushed to the CPU thread.
        N)r   met_requirement_for_devicer   r   r   ZcuDevicePrimaryCtxRetainr   r   Contextr   r   )r4   ZhctxZctxr(   r(   r)   get_primary_context  s    
zDevice.get_primary_contextc             C   s   t | j d| _dS )z6
        Release reference to primary context
        N)r   ZcuDevicePrimaryCtxReleaser   r   )r4   r(   r(   r)   release_primary_context  s    zDevice.release_primary_contextc          	   C   s4   z | j d k	r| j   |   W d t| j X d S )N)r   r   r   r   ZcuDevicePrimaryCtxResetr   )r4   r(   r(   r)   r     s
    

zDevice.resetN)r    r,   r-   r   classmethodr   r3   r   r   r   r   r}   r   r   r   r   r   r   r(   r(   r(   r)   r   x  s   	r   c             C   s   | j tk rtd| tf d S )Nz%s has compute capability < %s)r   MIN_REQUIRED_CCr   )devicer(   r(   r)   r     s    
r   c               @   s    e Zd ZdZdd Zdd ZdS )_SizeNotSetzC
    Dummy object for _PendingDeallocs when *size* is not set.
    c             C   s   dS )N?r(   )r4   r(   r(   r)   r6     s    z_SizeNotSet.__str__c             C   s   dS )Nr   r(   )r4   r(   r(   r)   __int__  s    z_SizeNotSet.__int__N)r    r,   r-   r   r6   r   r(   r(   r(   r)   r     s   r   c               @   sZ   e Zd ZdZdd Zedd ZefddZdd	 Z	e
jd
d Zedd Zdd ZdS )_PendingDeallocszd
    Pending deallocations of a context (or device since we are using the primary
    context).
    c             C   s   t  | _d| _d| _|| _d S )Nr   )r   _cons_disable_count_size_memory_capacity)r4   Zcapacityr(   r(   r)   r3     s    z_PendingDeallocs.__init__c             C   s   t | jtj S )N)r%   r   r   ZCUDA_DEALLOCS_RATIO)r4   r(   r(   r)   _max_pending_bytes  s    z#_PendingDeallocs._max_pending_bytesc             C   s\   t d|j| | j|||f |  jt|7  _t| jtj	ksP| j| j
krX|   dS )a_  
        Add a pending deallocation.

        The *dtor* arg is the destructor function that takes an argument,
        *handle*.  It is used as ``dtor(handle)``.  The *size* arg is the
        byte size of the resource added.  It is an optional argument.  Some
        resources (e.g. CUModule) has an unknown memory footprint on the device.
        z add pending dealloc: %s %s bytesN)rh   ri   r    r   rM   r   r%   r   r   ZCUDA_DEALLOCS_COUNTr   clear)r4   dtorr   sizer(   r(   r)   add_item  s    	z_PendingDeallocs.add_itemc             C   sD   | j s@x2| jr8| j \}}}td|j| || qW d| _dS )zh
        Flush any pending deallocations unless it is disabled.
        Do nothing if disabled.
        zdealloc: %s %s bytesr   N)is_disabledr   popleftrh   ri   r    r   )r4   r   r   r   r(   r(   r)   r   !  s    z_PendingDeallocs.clearc          	   c   s<   |  j d7  _ z
dV  W d|  j d8  _ | j dks6tX dS )zs
        Context manager to temporarily disable flushing pending deallocation.
        This can be nested.
        r   Nr   )r   r   )r4   r(   r(   r)   disable-  s
    
z_PendingDeallocs.disablec             C   s
   | j dkS )Nr   )r   )r4   r(   r(   r)   r   :  s    z_PendingDeallocs.is_disabledc             C   s
   t | jS )z:
        Returns number of pending deallocations.
        )r   r   )r4   r(   r(   r)   __len__>  s    z_PendingDeallocs.__len__N)r    r,   r-   r   r3   r   r   r   r   r   
contextlibcontextmanagerr   r   r   r(   r(   r(   r)   r     s   r   _MemoryInfoz
free,totalc               @   s   e Zd ZdZdd Zdd Zdd Zd8d	d
Zd9ddZdd Z	dd Z
dd Zdd Zd:ddZd;ddZdd Zdd Zdd Zd<d!d"Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd=d.d/Zd0d1 Zd2d3 Zd4d5 Zd6d7 ZdS )>r   zs
    This object wraps a CUDA Context resource.

    Contexts should not be constructed directly by user code.
    c             C   s0   || _ || _t | _d | _t | _i | _d S )N)r   r   r   rT   allocationsdeallocationsmodulesZextras)r4   r   r   r(   r(   r)   r3   O  s    

zContext.__init__c             C   sB   t d| jj t d| jj | j  | j  | j  dS )z?
        Clean up all owned resources in this context.
        zreset context of device %sN)rh   ri   r   r   r   r   r   r   )r4   r(   r(   r)   r   Y  s
    

zContext.resetc             C   s0   t  }t  }tt|t| t|j|jdS )z>Returns (free, total) memory in bytes in the context.
        )freetotal)r   r   ZcuMemGetInfor   r   r   )r4   r   r   r(   r(   r)   get_memory_infoe  s    zContext.get_memory_infoNc             C   s@   t  }|s"tt||j|| ntt||j||| |jS )zReturn occupancy of a function.
        :param func: kernel for which occupancy is calculated
        :param blocksize: block size the kernel is intended to be launched with
        :param memsize: per-block dynamic shared memory usage intended, in bytes)r   r   Z+cuOccupancyMaxActiveBlocksPerMultiprocessorr   r   Z4cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsr   )r4   func	blocksizememsizeflagsretvalr(   r(   r)   $get_active_blocks_per_multiprocessorm  s
    z,Context.get_active_blocks_per_multiprocessorc       	   	   C   sd   t  }t  }t|}|s8tt|t||j||| n tt|t||j|||| |j|jfS )a  Suggest a launch configuration with reasonable occupancy.
        :param func: kernel for which occupancy is calculated
        :param b2d_func: function that calculates how much per-block dynamic shared memory 'func' uses based on the block size.
        :param memsize: per-block dynamic shared memory usage intended, in bytes
        :param blocksizelimit: maximum block size the kernel is designed to handle)r   r   r   Z cuOccupancyMaxPotentialBlockSizer   r   Z)cuOccupancyMaxPotentialBlockSizeWithFlagsr   )	r4   r   Zb2d_funcr   Zblocksizelimitr   Zgridsizer   Zb2d_cbr(   r(   r)   get_max_potential_block_sizez  s    

z$Context.get_max_potential_block_sizec             C   s*   t | j | jdkr&t|  j| _dS )z@
        Pushes this context on the current CPU Thread.
        N)r   ZcuCtxPushCurrentr   r   r   r   r   )r4   r(   r(   r)   push  s    
zContext.pushc             C   s,   t  }tt| |j| jjks(tdS )z
        Pops this context off the current CPU thread. Note that this context must
        be at the top of the context stack, otherwise an error will occur.
        N)r   r   r   ZcuCtxPopCurrentr   r   r   r   )r4   Zpoppedr(   r(   r)   pop  s    zContext.popc          
   C   sR   y
|  W nB t k
rL } z$|jtjkr:| j  |  n W dd}~X Y nX dS )z
        Attempt allocation by calling *allocator*.  If a out-of-memory error
        is raised, the pending deallocations are flushed and the allocation
        is retried.  If it fails in the second attempt, the error is reraised.
        N)r/   r0   r   ZCUDA_ERROR_OUT_OF_MEMORYr   r   )r4   	allocatorrB   r(   r(   r)   _attempt_allocation  s    

zContext._attempt_allocationc                sT   t   fdd}| | t|  }tt|  |}|| jj< |	 S )Nc                  s   t t  d S )N)r   Z
cuMemAllocr   r(   )bytesizeptrr(   r)   r     s    z#Context.memalloc.<locals>.allocator)
r   rp   r   _alloc_finalizerAutoFreePointerr   r   r   r   own)r4   r   r   	finalizermemr(   )r   r   r)   memalloc  s    
zContext.memallocFc       	         s   t  d|rtjO |r&tjO |r4tjO  fdd}|rT| | n|  d }t|  |}|rtt	| | |d}|| j
|jj< | S tt	| | |d}|S d S )Nr   c                  s   t t  d S )N)r   ZcuMemHostAllocr   r(   )r   r   pointerr(   r)   r     s    z'Context.memhostalloc.<locals>.allocator)r   )r   r   ZCU_MEMHOSTALLOC_DEVICEMAPZCU_MEMHOSTALLOC_PORTABLEZCU_MEMHOSTALLOC_WRITECOMBINEDr   _hostalloc_finalizerMappedMemoryr   r   r   r   r   r   PinnedMemory)	r4   r   mappedZportableZwcr   ownerr   r   r(   )r   r   r   r)   memhostalloc  s,    


zContext.memhostallocc                s   t ttfrt|r0| jjs0td| j d |rB tjO   fdd}|rb| 	| n|  t
| |}|rtt| ||d}|| j|jj< | S tt| ||d}|S d S )Nz%s cannot map host memoryr   c                  s   t   d S )N)r   ZcuMemHostRegisterr(   )r   r   r   r(   r)   r     s    z!Context.mempin.<locals>.allocator)r   )r$   r%   longr   r   ZCAN_MAP_HOST_MEMORYr   r   ZCU_MEMHOSTREGISTER_DEVICEMAPr   _pin_finalizerr   r   r   r   r   r   r   r   )r4   r   r   r   r   r   r   r   r(   )r   r   r   r)   mempin  s(    
zContext.mempinc             C   s   t d S )N)NotImplementedError)r4   r   r(   r(   r)   memunpin  s    zContext.memunpinc             C   sZ   t stdt }tt||jj	 | j
 }|j	j|jj	j }t|||j||dS )z>
        Returns a *IpcHandle* from a GPU allocation.
        zOS does not support CUDA IPC)offset)SUPPORTS_IPCrL   r   rq   r   ZcuIpcGetMemHandlerH   r   r   r   r   r   r   	IpcHandler   )r4   ZmemoryZ	ipchandlesource_infor   r(   r(   r)   get_ipc_handle  s    

zContext.get_ipc_handlec             C   s2   t  }d}tt||| tt| ||dS )Nr   )contextr   r   )r   rp   r   rl   r   MemoryPointerr   r   )r4   r   r   dptrr   r(   r(   r)   open_ipc_handle  s
    zContext.open_ipc_handler   c             C   s    |dkst dt|| dS )zLEnable peer access between the current context and the peer context
        r   z$*flags* is reserved and MUST be zeroN)r   r   ZcuCtxEnablePeerAccess)r4   Zpeer_contextr   r(   r(   r)   enable_peer_access  s    zContext.enable_peer_accessc             C   s$   t  }tt|| jj| t|S )zsReturns a bool indicating whether the peer access between the
        current and peer device is possible.
        )r   r   ZcuDeviceCanAccessPeerr   r   r   bool)r4   Zpeer_devicecan_access_peerr(   r(   r)   r   %  s    zContext.can_access_peerc             C   s&   t |tr|d}t|}| |S )Nutf8)r$   r!   encoder
   create_module_image)r4   ptximager(   r(   r)   create_module_ptx1  s    

zContext.create_module_ptxc             C   s"   t | |}|| j|jj< t|S )N)load_module_imager   r   r   r   r   )r4   r  moduler(   r(   r)   r  7  s    
zContext.create_module_imagec             C   s   | j |jj= d S )N)r   r   r   )r4   r  r(   r(   r)   unload_module<  s    zContext.unload_modulec             C   s2   t  }tt|d tt| |t| j	|S )Nr   )
r   Z	cu_streamr   ZcuStreamCreater   Streamr   r   _stream_finalizerr   )r4   r   r(   r(   r)   create_stream?  s    zContext.create_streamTc             C   sF   t  }d}|s|tjO }tt|| tt	| |t
| j|dS )Nr   )r   )r   Zcu_eventr   ZCU_EVENT_DISABLE_TIMINGr   ZcuEventCreater   Eventr   r   _event_finalizerr   )r4   Ztimingr   r   r(   r(   r)   create_eventE  s    
zContext.create_eventc             C   s   t   d S )N)r   ZcuCtxSynchronize)r4   r(   r(   r)   synchronizeN  s    zContext.synchronizec             C   s   d| j | jjf S )Nz<CUDA context %s of device %d>)r   r   r   )r4   r(   r(   r)   r   Q  s    zContext.__repr__c             C   s   t |tr| j|jkS tS d S )N)r$   r   r   NotImplemented)r4   r   r(   r(   r)   r   T  s    
zContext.__eq__c             C   s   |  | S )N)r   )r4   r   r(   r(   r)   r   Z  s    zContext.__ne__)N)N)FFF)F)r   )T)r    r,   r-   r   r3   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r	  r  r  r  r   r   r   r(   r(   r(   r)   r   H  s4   


		
 
#	

	r   c             C   s  t tjdd}t|  }t|  }tjt|tjt	|tj
t|tjt	|tjt	ti}tjt| |  }t	t| |  }t }ytt||t||| W n> tk
r }	 z d|jd }
t|	j|
W dd}	~	X Y nX |j}tt| ||t| |S )z!
    image must be a pointer
    NUMBAPRO_CUDA_LOG_SIZEi   zcuModuleLoadDataEx error:
%sr  N)r%   r:   rD   rE   r	   r   CU_JIT_INFO_LOG_BUFFERr   !CU_JIT_INFO_LOG_BUFFER_SIZE_BYTESr   CU_JIT_ERROR_LOG_BUFFER"CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTESCU_JIT_LOG_VERBOSEVERBOSE_JIT_LOGr   cu_jit_optionr   keysr   Z	cu_moduler   ZcuModuleLoadDataExr   r/   r   decoder0   Moduler   r   _module_finalizer)r   r  logszZjitinfoZ	jiterrorsoptionsoption_keysoption_valsr   rB   r1   info_logr(   r(   r)   r  ^  s(    





r  c                s"   | j  | j fdd}|S )Nc                  s     r j = tj d S )N)r   r   r   Z	cuMemFreer(   )r   r   r   r   r(   r)   core  s    z_alloc_finalizer.<locals>.core)r   r   )r   r   r   r#  r(   )r   r   r   r   r)   r     s    r   c                s,   | j  | jst fdd}|S )a[  
    Finalize page-locked host memory allocated by `context.memhostalloc`.

    This memory is managed by CUDA, and finalization entails deallocation. The
    issues noted in `_pin_finalizer` are not relevant in this case, and the
    finalization is placed in the `context.deallocations` queue along with
    finalization of device objects.

    c                  s$   r r j = tj d S )N)r   r   r   ZcuMemFreeHostr(   )r   r   r   r   r   r(   r)   r#    s    z"_hostalloc_finalizer.<locals>.core)r   r   r   )r   r   r   r   r#  r(   )r   r   r   r   r   r)   r     s    
r   c                s   | j   fdd}|S )aB  
    Finalize temporary page-locking of host memory by `context.mempin`.

    This applies to memory not otherwise managed by CUDA. Page-locking can
    be requested multiple times on the same memory, and must therefore be
    lifted as soon as finalization is requested, otherwise subsequent calls to
    `mempin` may fail with `CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`, leading
    to unexpected behavior for the context managers `cuda.{pinned,mapped}`.
    This function therefore carries out finalization immediately, bypassing the
    `context.deallocations` queue.

    c                  s   r r j = t d S )N)r   r   ZcuMemHostUnregisterr(   )r   r   r   r(   r)   r#    s    z_pin_finalizer.<locals>.core)r   )r   r   r   r#  r(   )r   r   r   r)   r     s    r   c                s    fdd}|S )Nc                  s     tj d S )N)r   r   ZcuEventDestroyr(   )deallocsr   r(   r)   r#    s    z_event_finalizer.<locals>.corer(   )r$  r   r#  r(   )r$  r   r)   r    s    r  c                s    fdd}|S )Nc                  s     tj d S )N)r   r   ZcuStreamDestroyr(   )r$  r   r(   r)   r#    s    z_stream_finalizer.<locals>.corer(   )r$  r   r#  r(   )r$  r   r)   r    s    r  c                s    | j  | j fdd}|S )Nc                 s$   t j  fdd} |  d S )Nc                s"    s| j  kstt|  d S )N)r   r   r   ZcuModuleUnload)r   )r   shutting_downr(   r)   module_unload  s    z6_module_finalizer.<locals>.core.<locals>.module_unload)r   r%  r   )r&  )deallocr   r   )r%  r)   r#    s    z_module_finalizer.<locals>.core)r   r   )r   r   r#  r(   )r'  r   r   r)   r    s    r  c               @   s(   e Zd ZdZdd Zdd Zdd ZdS )	_CudaIpcImplzjImplementation of GPU IPC using CUDA driver API.
    This requires the devices to be peer accessible.
    c             C   s*   |j | _ |j| _|j| _|j| _d | _d S )N)baser   r   r   _opened_mem)r4   parentr(   r(   r)   r3     s
    z_CudaIpcImpl.__init__c             C   sP   | j dk	rtd| jdk	r$td|| j| j| j }|| _| | jS )zT
        Import the IPC memory and returns a raw CUDA memory pointer object
        Nz'opening IpcHandle from original processzIpcHandle is already opened)	r)  rJ   r*  r   r   r   r   r   view)r4   r   r   r(   r(   r)   open  s    

z_CudaIpcImpl.openc             C   s*   | j d krtdt| j j d | _ d S )NzIpcHandle not opened)r*  rJ   r   ZcuIpcCloseMemHandler   )r4   r(   r(   r)   close  s    
z_CudaIpcImpl.closeN)r    r,   r-   r   r3   r-  r.  r(   r(   r(   r)   r(    s   r(  c               @   s(   e Zd ZdZdd Zdd Zdd ZdS )	_StagedIpcImplzImplementation of GPU IPC using custom staging logic to workaround
    CUDA IPC limitation on peer accessibility between devices.
    c             C   s(   || _ |j| _|j| _|j| _|| _d S )N)r+  r)  r   r   r   )r4   r+  r   r(   r(   r)   r3     s
    z_StagedIpcImpl.__init__c          	   C   s   ddl m} t| j}t| jd}|j|j  |	|j
 }W d Q R X || j}t||| j |j|j  |  W d Q R X | S )Nr   )cuda)r+  )numbar0  r   r   r   r(  r+  Zgpusr   r-  rc   r   r   r   device_to_devicer.  r   )r4   r   r0  ZsrcdevimplZ
source_ptrZnewmemr(   r(   r)   r-  
  s    z_StagedIpcImpl.openc             C   s   d S )Nr(   )r4   r(   r(   r)   r.     s    z_StagedIpcImpl.closeN)r    r,   r-   r   r3   r-  r.  r(   r(   r(   r)   r/    s   r/  c               @   sh   e Zd ZdZdddZdd Zdd	 Zd
d Zdd Zdd Z	dddZ
dd Zdd Zedd ZdS )r   a  
    Internal IPC handle.

    Serialization of the CUDA IPC handle object is implemented here.

    The *base* attribute is a reference to the original allocation to keep it
    alive.  The *handle* is a ctypes object of the CUDA IPC handle. The *size*
    is the allocation size.
    Nr   c             C   s(   || _ || _|| _|| _d | _|| _d S )N)r)  r   r   r   _implr   )r4   r)  r   r   r   r   r(   r(   r)   r3   /  s    zIpcHandle.__init__c             C   s   | j d krtdd S )Nz#IPC handle doesn't have source info)r   r   )r4   r(   r(   r)   _sentry_source_info7  s    
zIpcHandle._sentry_source_infoc             C   s4   |    | j|j krdS t| j}||jS )zdReturns a bool indicating whether the active context can peer
        access the IPC handle
        T)r5  r   r   r   r   r   r   r   )r4   r   Zsource_devicer(   r(   r)   r   ;  s
    zIpcHandle.can_access_peerc             C   s4   |    | jdk	rtdt| | j| _| j|S )zCOpen the IPC by allowing staging on the host memory first.
        NzIpcHandle is already opened)r5  r4  rJ   r/  r   r-  )r4   r   r(   r(   r)   open_stagedE  s
    
zIpcHandle.open_stagedc             C   s(   | j dk	rtdt| | _ | j |S )zT
        Import the IPC memory and returns a raw CUDA memory pointer object
        NzIpcHandle is already opened)r4  rJ   r(  r-  )r4   r   r(   r(   r)   open_directP  s    

zIpcHandle.open_directc             C   s*   | j dks| |r| j}n| j}||S )a  Open the IPC handle and import the memory for usage in the given
        context.  Returns a raw CUDA memory pointer object.

        This is enhanced over CUDA IPC that it will work regardless of whether
        the source device is peer-accessible by the destination device.
        If the devices are peer-accessible, it uses .open_direct().
        If the devices are not peer-accessible, it uses .open_staged().
        N)r   r   r7  r6  )r4   r   fnr(   r(   r)   r-  Z  s    	zIpcHandle.openc             C   s6   ddl m} |dkr|j}| |}|j||||dS )zC
        Simliar to `.open()` but returns an device array.
        r   )devicearrayN)shapestridesdtypeZgpu_data) r9  itemsizer-  ZDeviceNDArray)r4   r   r:  r<  r;  r9  r   r(   r(   r)   
open_arrayi  s    
zIpcHandle.open_arrayc             C   s&   | j d krtd| j   d | _ d S )NzIpcHandle not opened)r4  rJ   r.  )r4   r(   r(   r)   r.  w  s    

zIpcHandle.closec             C   s*   t | j}| j|| j| j| jf}tj|fS )N)tupler   r5   r   r   r   r   Z_rebuild_reduction)r4   Zpreprocessed_handler   r(   r(   r)   
__reduce__}  s    
zIpcHandle.__reduce__c             C   s   t j| }| d ||||dS )N)r)  r   r   r   r   )r   rq   )ra   Z
handle_aryr   r   r   r   r(   r(   r)   _rebuild  s    
zIpcHandle._rebuild)Nr   )N)r    r,   r-   r   r3   r5  r   r6  r7  r-  r?  r.  rA  r   rB  r(   r(   r(   r)   r   %  s   	



r   c               @   sZ   e Zd ZdZdZdddZedd Zdd	 Zd
d Z	dddZ
dddZedd ZdS )r   ao  A memory pointer that owns the buffer with an optional finalizer.

    When an instance is deleted, the finalizer will be called regardless
    of the `.refct`.

    An instance is created with `.refct=1`.  The buffer lifetime
    is tied to the MemoryPointer instance's lifetime.  The finalizer is invoked
    only if the MemoryPointer instance's lifetime ends.
    TNc             C   sP   || _ || _|| _|| _|d k	| _d| _| j| _|| _|d k	rLt	| || _
d S )Nr   )r   device_pointerr   _cuda_memsize_
is_managedrefctr   _ownerr   finalize
_finalizer)r4   r   r   r   r   r   r(   r(   r)   r3     s    
zMemoryPointer.__init__c             C   s   | j d kr| S | j S )N)rG  )r4   r(   r(   r)   r     s    zMemoryPointer.ownerc             C   s   t t| S )N)OwnedPointerr   r   )r4   r(   r(   r)   r     s    zMemoryPointer.ownc             C   s.   | j r*| jjstd|   | jjr*tdS )z8
        Forces the device memory to the trash.
        zFreeing dead memoryN)rE  rI  aliver   r   )r4   r(   r(   r)   r     s
    zMemoryPointer.freer   c             C   s@   |d kr| j n|}|r,t| j|||j nt| j|| d S )N)r   r   cuMemsetD8AsyncrC  r   
cuMemsetD8)r4   Zbyter   streamr(   r(   r)   memset  s
    
zMemoryPointer.memsetc             C   s   |d kr| j | }n|| }| jjd kr>|dkr8td| }n:| jj| }|dk rZtdt|}t| j||| jd}t	| jtt
frt
t| j|S |S d S )Nr   z non-empty slice into empty slicezsize cannot be negative)r   )r   rC  r   r   r   rp   r   r   r   r$   rJ  r   r   )r4   startstopr   r,  r)  r   r(   r(   r)   r,    s    
zMemoryPointer.viewc             C   s   | j S )N)rC  )r4   r(   r(   r)   device_ctypes_pointer  s    z#MemoryPointer.device_ctypes_pointer)NN)Nr   )N)r    r,   r-   r   __cuda_memory__r3   r   r   r   r   rO  r,  rR  r(   r(   r(   r)   r     s   	



r   c                   s    e Zd ZdZ fddZ  ZS )r   zModifies the ownership semantic of the MemoryPointer so that the
    instance lifetime is directly tied to the number of references.

    When `.refct` reaches zero, the finalizer is invoked.
    c                s$   t t| j|| |  jd8  _d S )Nr   )r2   r   r3   rF  )r4   r   kwargs)r5   r(   r)   r3     s    zAutoFreePointer.__init__)r    r,   r-   r   r3   r7   r(   r(   )r5   r)   r     s   r   c                   s*   e Zd ZdZd fdd	Zdd Z  ZS )r   TNc                sb   || _ || _t }tt||d || _tt	| j
||||d | j| _| j| _| jj| _d S )Nr   )r   )ownedhost_pointerr   rp   r   ZcuMemHostGetDevicePointerr   rC  r2   r   r3   r   r   _buflen_r   _bufptr_)r4   r   r   Zhostpointerr   r   devptr)r5   r(   r)   r3     s    zMappedMemory.__init__c             C   s   t t| S )N)MappedOwnedPointerr   r   )r4   r(   r(   r)   r      s    zMappedMemory.own)N)r    r,   r-   rS  r3   r   r7   r(   r(   )r5   r)   r     s   r   c               @   s   e Zd ZdddZdd ZdS )r   Nc             C   sT   || _ || _|| _|| _|d k	| _| j| _| j| _| jj| _|d k	rPt	
| | d S )N)r   rU  r   rV  rE  r   rW  r   rX  r   rH  )r4   r   r   r   r   r   r(   r(   r)   r3     s    

zPinnedMemory.__init__c             C   s   | S )Nr(   )r4   r(   r(   r)   r     s    zPinnedMemory.own)N)r    r,   r-   r3   r   r(   r(   r(   r)   r     s   
r   c               @   s   e Zd ZdddZdd ZdS )rJ  Nc                sZ   || _ |d kr| j | _n|jr"t|| _| j   fdd}| j  jd7  _t| | d S )Nc                  sL   y2  j d8  _  j dkst j dkr0   W n tk
rF   Y nX d S )Nr   r   )rF  r   r   ReferenceErrorr(   )r   r(   r)   deref$  s    
z$OwnedPointer.__init__.<locals>.derefr   )Z_mem_viewrE  r   rF  r   rH  )r4   Zmemptrr,  r\  r(   )r   r)   r3     s    


zOwnedPointer.__init__c             C   s   t | j|S )z$Proxy MemoryPointer methods
        )r#   r]  )r4   rz   r(   r(   r)   r}   1  s    zOwnedPointer.__getattr__)N)r    r,   r-   r3   r}   r(   r(   r(   r)   rJ    s   
rJ  c               @   s   e Zd ZdS )rZ  N)r    r,   r-   r(   r(   r(   r)   rZ  7  s   rZ  c               @   s:   e Zd Zdd Zdd Zdd Zdd Zejd	d
 Z	dS )r
  c             C   s$   || _ || _|d k	r t| | d S )N)r   r   r   rH  )r4   r   r   r   r(   r(   r)   r3   <  s    zStream.__init__c             C   s   | j jS )N)r   r   )r4   r(   r(   r)   r   B  s    zStream.__int__c             C   s   d| j j| jf S )Nz<CUDA stream %d on %s>)r   r   r   )r4   r(   r(   r)   r   E  s    zStream.__repr__c             C   s   t | j dS )zy
        Wait for all commands in this stream to execute. This will commit any
        pending memory transfers.
        N)r   ZcuStreamSynchronizer   )r4   r(   r(   r)   r  H  s    zStream.synchronizec             c   s   | V  |    dS )z
        A context manager that waits for all commands in this stream to execute
        and commits any pending memory transfers upon exiting the context.
        N)r  )r4   r(   r(   r)   auto_synchronizeO  s    zStream.auto_synchronizeN)
r    r,   r-   r3   r   r   r  r   r   r^  r(   r(   r(   r)   r
  ;  s
   r
  c               @   sB   e Zd ZdddZdd ZdddZd	d
 ZdddZdd ZdS )r  Nc             C   s$   || _ || _|d k	r t| | d S )N)r   r   r   rH  )r4   r   r   r   r(   r(   r)   r3   Z  s    zEvent.__init__c          
   C   sN   yt | j W n4 tk
rD } z|jtjkr2dS  W dd}~X Y nX dS dS )zy
        Returns True if all work before the most recent record has completed;
        otherwise, returns False.
        FNT)r   ZcuEventQueryr   r/   r0   r   ZCUDA_ERROR_NOT_READY)r4   rB   r(   r(   r)   query`  s    zEvent.queryr   c             C   s    |r
|j nd}t| j | dS )a  
        Set the record point of the event to the current point in the given
        stream.

        The event will be considered to have occurred when all work that was
        queued in the stream at the time of the call to ``record()`` has been
        completed.
        r   N)r   r   ZcuEventRecord)r4   rN  hstreamr(   r(   r)   recordo  s    	zEvent.recordc             C   s   t | j dS )zN
        Synchronize the host thread for the completion of the event.
        N)r   ZcuEventSynchronizer   )r4   r(   r(   r)   r  {  s    zEvent.synchronizec             C   s&   |r
|j nd}d}t|| j | dS )zZ
        All future works submitted to stream will wait util the event completes.
        r   N)r   r   ZcuStreamWaitEvent)r4   rN  r`  r   r(   r(   r)   wait  s    z
Event.waitc             C   s
   t | |S )N)event_elapsed_time)r4   evtendr(   r(   r)   elapsed_time  s    zEvent.elapsed_time)N)r   )r   )	r    r,   r-   r3   r_  ra  r  rb  re  r(   r(   r(   r)   r  Y  s   


r  c             C   s"   t  }tt|| j|j |jS )zF
    Compute the elapsed time between two events in milliseconds.
    )r   r   ZcuEventElapsedTimer   r   r   )Zevtstartrd  Zmsecr(   r(   r)   rc    s    rc  c               @   s.   e Zd Zd
ddZdd Zdd Zdd	 ZdS )r  Nc             C   s,   || _ || _|| _|d k	r(t| || _d S )N)r   r   r"  r   rH  rI  )r4   r   r   r"  r   r(   r(   r)   r3     s
    zModule.__init__c             C   s   | j |  d S )N)r   r	  )r4   r(   r(   r)   unload  s    zModule.unloadc             C   s4   t  }tt|| j|d tt	| ||S )Nr  )
r   Zcu_functionr   ZcuModuleGetFunctionr   r   r  Functionr   r   )r4   rY   r   r(   r(   r)   get_function  s    zModule.get_functionc             C   sD   t  }t  }tt|t|| j|d t| j	|||j
fS )Nr  )r   rp   r   r   ZcuModuleGetGlobalr   r   r  r   r   r   )r4   rY   r   r   r(   r(   r)   get_global_symbol  s
    zModule.get_global_symbol)N)r    r,   r-   r3   rf  rh  ri  r(   r(   r(   r)   r    s   
r  FuncAttrregssharedlocalconst
maxthreadsc               @   sd   e Zd ZdZdZdZdZdd Zdd Zddd	Z	dd
dZ
dd Zedd Zdd Zdd ZdS )rg  )r   r   r   r   c             C   s    || _ || _|| _|  | _d S )N)r  r   rY   _read_func_attr_allZattrs)r4   r  r   rY   r(   r(   r)   r3     s    zFunction.__init__c             C   s
   d| j  S )Nz<CUDA function %s>)rY   )r4   r(   r(   r)   r     s    zFunction.__repr__Fc             C   sH   |p
|o
|}|rt j}n|r$t j}n|r0t j}nt j}t| j| d S )N)r   ZCU_FUNC_CACHE_PREFER_EQUALZCU_FUNC_CACHE_PREFER_L1ZCU_FUNC_CACHE_PREFER_SHAREDZCU_FUNC_CACHE_PREFER_NONEr   ZcuFuncSetCacheConfigr   )r4   Zprefer_equalZprefer_cacheZprefer_sharedflagr(   r(   r)   cache_config  s    zFunction.cache_configc             C   sf   xt |dk r|d7 }qW xt |dk r2|d7 }qW t| }||_||_||_|r\||_nd|_|S )N   )r   r   )r   copygriddimblockdim	sharedmemrN  )r4   ru  rv  rw  rN  Zinstr(   r(   r)   	configure  s    
zFunction.configurec             G   s2   | j r| j j}nd}t| j| j| j| j|| dS )zS
        *args -- Must be either ctype objects of DevicePointer instances.
        N)rN  r   launch_kernelru  rv  rw  )r4   r   Zstreamhandler(   r(   r)   __call__  s
    
zFunction.__call__c             C   s
   | j jjS )N)r  r   r   )r4   r(   r(   r)   r     s    zFunction.devicec             C   s    t  }tt||| j |jS )z,
        Read CUfunction attributes
        )r   r   ZcuFuncGetAttributer   r   r   )r4   Zattridr   r(   r(   r)   _read_func_attr  s    zFunction._read_func_attrc             C   sN   |  tj}|  tj}|  tj}|  tj}|  tj}t|||||dS )N)rk  rn  rm  rl  ro  )r{  r   ZCU_FUNC_ATTRIBUTE_NUM_REGSZ"CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTESZ"CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTESZ#CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTESZ'CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCKrj  )r4   ZnregsZcmemZlmemZsmemZmaxtpbr(   r(   r)   rp    s    
zFunction._read_func_attr_allN)FFF)r   r   )r    r,   r-   ru  rv  rN  rw  r3   r   rr  rx  rz  r   r   r{  rp  r(   r(   r(   r)   rg    s    

rg  c             C   s   |\}}}|\}	}
}g }x6|D ].}t |r>|tt| q|t| qW tt| | }t| ||||	|
||||d  d S )N)is_device_memoryrM   r   rR  r   r   r   ZcuLaunchKernel)Zcufunc_handleru  rv  rw  r`  r   ZgxZgyZgzZbxZbyZbzZ
param_valsargZparamsr(   r(   r)   ry    s    


ry  )or  acubinZfatbinc               @   sP   e Zd ZdddZedd Zedd Zdd	d
Zdd Zdd Z	dd Z
dS )Linkerr   c             C   s  t tjdd}t|  }t|  }tjt|tjt	|tj
t|tjt	|tjt	di}|rnt	||tj< t| tjg }t| }~tjt| | }t	t| | }	t  | _}
tt|||	t| j t| tj|
 || _|| _||||	g| _d S )Nr  i   r   ) r%   r:   rD   rE   r	   r   r  r   r  r   r  r  r  ZCU_JIT_MAX_REGISTERSr   r  ZCU_JIT_TARGET_FROM_CUCONTEXTr   r   r  r   Zcu_link_stater   r   ZcuLinkCreater   r   rH  ZcuLinkDestroylinker_info_buflinker_errors_buf_keep_alive)r4   Zmax_registersr  Z
linkerinfoZlinkererrorsr  Zraw_keysZ
raw_valuesr   r!  r   r(   r(   r)   r3   #  s,    





zLinker.__init__c             C   s   | j jdS )Nr  )r  r   r  )r4   r(   r(   r)   r"  D  s    zLinker.info_logc             C   s   | j jdS )Nr  )r  r   r  )r4   r(   r(   r)   	error_logH  s    zLinker.error_log<cudapy-ptx>c          
   C   s   t |}t |d}|  j||g7  _y$t| jtj|t||dd d  W n4 t	k
r } zt
d|| jf W d d }~X Y nX d S )Nr  r   z%s
%s)r
   r  r  r   ZcuLinkAddDatar   r   CU_JIT_INPUT_PTXr   r/   r.   r  )r4   r  rY   ZptxbufZnamebufrB   r(   r(   r)   add_ptxL  s    zLinker.add_ptxc          
   C   sn   t |d}| j| yt| j||dd d  W n4 tk
rh } ztd|| j	f W d d }~X Y nX d S )Nr  r   z%s
%s)
r
   r  r  rM   r   ZcuLinkAddFiler   r/   r.   r  )r4   r;   kindZpathbufrB   r(   r(   r)   add_fileV  s    zLinker.add_filec             C   s(   | ddd }t| }| || d S )N.r   )rsplitFILE_EXTENSION_MAPr  )r4   r;   Zextr  r(   r(   r)   add_file_guess_ext_  s    zLinker.add_file_guess_extc          
   C   s   t d}td}yt| jt|t| W n4 tk
r` } ztd|| jf W dd}~X Y nX |j	}|dksxt
d| jdd= ||fS )z
        Returns (cubin, size)
            cubin is a pointer to a internal buffer of cubin owned
            by the linker; thus, it should be loaded before the linker
            is destroyed.
        r   z%s
%sNz"linker returned a zero sized cubin)r   r   r   ZcuLinkCompleter   r   r/   r.   r  r   r   r  )r4   r  r   rB   r(   r(   r)   completed  s    $zLinker.completeN)r   )r  )r    r,   r-   r3   r   r"  r  r  r  r  r  r(   r(   r(   r)   r  "  s   
!

	r  c             C   s&   t t||t| }t |d dS )z*Query attribute on the device pointer
    z!Failed to query pointer attributeN)r   cuPointerGetAttributer   rR  Zcheck_error)devmemr   Zodatar   r(   r(   r)   _device_pointer_attr|  s    
r  c             C   s<   t d}t| tj| tjdtjdtjdtjdi}||j S )zAQuery the device pointer type: host, device, array, unified?
    r   Zhostr   ZarrayZunified)	r   r  r   Z CU_POINTER_ATTRIBUTE_MEMORY_TYPEZCU_MEMORYTYPE_HOSTZCU_MEMORYTYPE_DEVICEZCU_MEMORYTYPE_ARRAYZCU_MEMORYTYPE_UNIFIEDr   )r  ZptrtyperX   r(   r(   r)   device_pointer_type  s    
r  c             C   s$   t d}tj}tt|||  |S )zZQuery the device pointer usable in the current context from an arbitrary
    pointer.
    r   )r   r   Z#CU_POINTER_ATTRIBUTE_DEVICE_POINTERr   r  r   )r   rY  r   r(   r(   r)   get_devptr_for_active_ctx  s    r  c             C   sF   t  }t }t| }tt|t|| |j|j }}||| fS )a  Find the extents (half open begin and end pointer) of the underlying
    device memory allocation.

    NOTE: it always returns the extents of the allocation but the extents
    of the device memory view that can be a subsection of the entire allocation.
    )r   rp   r   rR  r   ZcuMemGetAddressRanger   r   )r  snrY  r(   r(   r)   device_extents  s    r  c             C   sH   t | dd}|dkr.t| \}}|| }|| _|dksDtd||S )zCheck the memory size of the device memory.
    The result is cached in the device memory object.
    It may query the driver for the memory size of the device memory allocation.
    rD  Nr   z{} length array)r#   r  rD  r   r   )r  Zszr  rB   r(   r(   r)   device_memory_size  s    r  c             C   s   t | dd}|dk	o|jdkS )z?Returns True if the obj.dtype is datetime64 or timedelta64
    r<  NZMm)r#   char)rb   r<  r(   r(   r)   _is_datetime_dtype  s    r  c             C   s   t | r| tj} | S )z^Workaround for numpy#4983: buffer protocol doesn't support
    datetime64 or timedelta64.
    )r  r,  npZint64)rb   r(   r(   r)   _workaround_for_datetime  s    r  Fc             C   sD   t | ttfr| S d}|s.t | tjp,t| }t| } t| ||S )a  Get host pointer from an obj.

    If `readonly` is False, the buffer must be writable.

    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    F)	r$   r%   r   r  Zvoidr  r  r   Zmemoryview_get_buffer)rb   readonlyZforcewritabler(   r(   r)   rV    s    	rV  c             C   s   t | } t| S )zHReturns (start, end) the start and end pointer of the array (half open).)r  r   Zmemoryview_get_extents)rb   r(   r(   r)   host_memory_extents  s    r  c             C   s<   t | t |kstdt | }t| |||\}}|| S )z_Get the byte size of a contiguous memory buffer given the shape, strides
    and itemsize.
    z# dim mismatch)r   r   r   Zmemoryview_get_extents_info)r:  r;  r>  ndimr  rB   r(   r(   r)   memory_size_from_info  s    r  c             C   s$   t | \}}||kstd|| S )zGet the size of the memoryzmemory extend of negative size)r  r   )rb   r  rB   r(   r(   r)   host_memory_size  s    r  c             C   s
   t | jS )z$Get the device pointer as an integer)rR  r   )rb   r(   r(   r)   rC    s    rC  c             C   s   | dkrt dS t|  | jS )z,Get the ctypes object for the device pointerNr   )r   require_device_memoryrR  )rb   r(   r(   r)   rR    s    rR  c             C   s   t | ddS )ah  All CUDA memory object is recognized as an instance with the attribute
    "__cuda_memory__" defined and its value evaluated to True.

    All CUDA memory object should also define an attribute named
    "device_pointer" which value is an int(or long) object carrying the pointer
    value of the device memory address.  This is not tested in this method.
    rS  F)r#   )rb   r(   r(   r)   r|     s    r|  c             C   s   t | stddS )z9A sentry for methods that accept CUDA memory object.
    zNot a CUDA memory object.N)r|  	Exception)rb   r(   r(   r)   r    s    r  c             G   s   t | dg }|| dS )zAdd dependencies to the device memory.

    Mainly used for creating structures that points to other device memory,
    so that the referees are not GC and released.
    Z	_depends_N)r#   extend)r  ZobjsZdepsetr(   r(   r)   device_memory_depends  s    r  c             C   sR   g }|r*t |tsttj}||j ntj}|t| t	|dd|f|  dS )z
    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    T)r  N)
r$   r
  r   r   ZcuMemcpyHtoDAsyncrM   r   ZcuMemcpyHtoDrC  rV  )dstsrcr   rN  varargsr8  r(   r(   r)   host_to_device  s    r  c             C   sN   g }|r*t |tsttj}||j ntj}|t| t	||f|  dS )z
    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    N)
r$   r
  r   r   ZcuMemcpyDtoHAsyncrM   r   ZcuMemcpyDtoHrV  rC  )r  r  r   rN  r  r8  r(   r(   r)   device_to_host.  s    r  c             C   sN   g }|r*t |tsttj}||j ntj}|t| t||f|  dS )z
    NOTE: The underlying data pointer from the host data buffer is used and
    it should not be changed until the operation which can be asynchronous
    completes.
    N)	r$   r
  r   r   ZcuMemcpyDtoDAsyncrM   r   ZcuMemcpyDtoDrC  )r  r  r   rN  r  r8  r(   r(   r)   r2  @  s    r2  c             C   sJ   g }|r*t |tsttj}||j ntj}|t| ||f|  dS )zMemset on the device.
    If stream is not zero, asynchronous mode is used.

    dst: device memory
    val: byte value to be written
    size: number of byte to be written
    stream: a CUDA stream
    N)	r$   r
  r   r   rL  rM   r   rM  rC  )r  valr   rN  r  r8  r(   r(   r)   device_memsetR  s    	r  c               C   s   t   dS )z;
    Enable profile collection in the current context.
    N)r   ZcuProfilerStartr(   r(   r(   r)   profile_startg  s    r  c               C   s   t   dS )z<
    Disable profile collection in the current context.
    N)r   ZcuProfilerStopr(   r(   r(   r)   profile_stopn  s    r  c               c   s   t   dV  t  dS )z]
    Context manager that enables profiling on entry and disables profiling on
    exit.
    N)r  r  r(   r(   r(   r)   	profilingu  s    r  )F)r   )r   )r   )r   )r   Z
__future__r   r   r   r&   r:   rH   r   r   rt  r   r   	itertoolsr   r   r   r   r	   r
   r   r   r   r   Znumpyr  collectionsr   r   r1  r   r   r   r   r   r   r   r   r=  r   r   r   r   Znumba.utilsr   r   r%   rD   rE   r  r   rG   rV   r   r*   r   r+   r.   r/   rQ   rR   rS   rF   rO   rZ   r\   r   r   r_   r]   r   r   r   r   r   r   r   r   r   r  r   r   r   r  r  r  r(  r/  r   r   r   r   ZMemAllocr   rJ  rZ  r
  r  rc  r  rj  rg  ry  ZCU_JIT_INPUT_OBJECTr  ZCU_JIT_INPUT_LIBRARYZCU_JIT_INPUT_CUBINZCU_JIT_INPUT_FATBINARr  r  r  r  r  r  r  r  r  rV  r  r  r  rC  rR  r|  r  r  r  r  r2  r  r  r  r   r  r(   r(   r(   r)   <module>   s   (
A
 .	w
C
  "$&kQ4	P
Z







