B
     \|:                 @   s  d dl mZmZ d dlZd dlmZ d dlZd dlZd dl	m
Z
 d dlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ d dlmZmZ ddlmZ d dl	mZ d dlmZ d dlmZ d dl m!Z! e!dd Z"d2ddZ#d3ddZ$dd Z%G dd de&Z'G dd de&Z(dd Z)d d! Z*G d"d# d#e&Z+ed$d%d&d'gZ,G d(d) d)e&Z-G d*d+ d+e+Z.d,d- Z/d.d/ Z0G d0d1 d1e+Z1dS )4    )print_functionabsolute_importN)
namedtuple)ConcreteTemplate)typescompiler   )hlc)devicesdriverenumsdrvapi)HsaKernelLaunchError)gcn_occupancy)hsadgpu_present)devicearray)AbstractTemplate)ctypes_support)config)global_compiler_lockc       
   	   C   sh   ddl m} |j}|j}t }|d |d |d tj||| |||i d}|j	}	|	
  |S )Nr   )HSATargetDescZ
no_compileZno_cpython_wrapperZnrt)	typingctx	targetctxfuncargsreturn_typeflagslocals)
descriptorr   r   r   r   ZFlagssetZunsetZcompile_extralibraryfinalize)
pyfuncr   r   debugr   r   r   r   cresr!    r&   1lib/python3.7/site-packages/numba/roc/compiler.pycompile_hsa   s"    


r(   Fc             C   sN   t | tj||d}|j|jj}|j||j	j
}t|j|j|j	j
d}|S )N)r$   )llvm_modulenameargtypes)r(   r   Zvoidr!   get_functionfndescllvm_func_nametarget_contextZprepare_hsa_kernel	signaturer   	HSAKernelmoduler*   )r#   r   r$   r%   r   kernelZhsakernr&   r&   r'   compile_kernel3   s    r4   c                sv   t | |||d  j jj} j| t G  fdddt} j	
|  jg} j
 j| S )N)r$   c                   s   e Zd ZZ jgZdS )z0compile_device.<locals>.device_function_templateN)__name__
__module____qualname__keyr0   Zcasesr&   )r%   devfnr&   r'   device_function_templateC   s   r:   )r(   r!   r,   r-   r.   r/   mark_hsa_deviceDeviceFunctionr   Ztyping_contextinsert_user_function)r#   r   r   r$   r   r:   libsr&   )r%   r9   r'   compile_device=   s    r?   c                s>   ddl m} t|  G  fdddt}|j}| |  S )z%Compile a DeviceFunctionTemplate
    r   )r   c                   s   e Zd Z Z fddZdS )z9compile_device_template.<locals>.device_function_templatec                s   |rt  |S )N)AssertionErrorcompile)selfr   Zkws)dftr&   r'   genericW   s    zAcompile_device_template.<locals>.device_function_template.genericN)r5   r6   r7   r8   rD   r&   )rC   r&   r'   r:   T   s   r:   )r   r   DeviceFunctionTemplater   r   r=   )r#   r   r:   r   r&   )rC   r'   compile_device_templateM   s    rF   c               @   s"   e Zd ZdZdddZdd ZdS )	rE   z#Unmaterialized device function
    Fc             C   s   || _ || _i | _d S )N)py_funcr$   _compileinfos)rB   r#   r$   r&   r&   r'   __init__c   s    zDeviceFunctionTemplate.__init__c             C   s   || j krt| jd|| jd}|j|jj}|j	| | j  }|| j |< |jg}|rl|j
| |j| q|j| |j| n
| j | }|jS )zCompile the function for the given argument types.

        Each signature is compiled once by caching the compiled function inside
        this object.
        N)r$   )rH   r(   rG   r$   r!   r,   r-   r.   r/   r;   r=   Zadd_user_functionr0   )rB   r   r%   r   Zfirst_definitionr>   r&   r&   r'   rA   i   s    


zDeviceFunctionTemplate.compileN)F)r5   r6   r7   __doc__rI   rA   r&   r&   r&   r'   rE   `   s   
rE   c               @   s   e Zd Zdd ZdS )r<   c             C   s
   || _ d S )N)r%   )rB   r%   r&   r&   r'   rI      s    zDeviceFunction.__init__N)r5   r6   r7   rI   r&   r&   r&   r'   r<      s   r<   c             C   s    t | ttfs| gS t| S d S )N)
isinstancetuplelist)valr&   r&   r'   _ensure_list   s    rO   c             C   s*   t | }xt||D ]}| d qW d S )Nr   )lenrangeappend)rN   sizen_r&   r&   r'   _ensure_size_or_append   s    rV   c               @   s<   e Zd ZdZdd Zdd ZdddZdd
dZdd ZdS )HSAKernelBasez.Define interface for configurable kernels
    c             C   s   d| _ d| _d | _d S )N)r   )global_size
local_sizestream)rB   r&   r&   r'   rI      s    zHSAKernelBase.__init__c             C   s
   t  | S )N)copy)rB   r&   r&   r'   r[      s    zHSAKernelBase.copyNc             C   sl   t |}|dk	r>t |}tt|t|}t|| t|| |  }t||_|r\t|nd|_||_|S )zCConfigure the OpenCL kernel
        local_size can be None
        N)	rO   maxrP   rV   r[   rL   rX   rY   rZ   )rB   rX   rY   rZ   rS   Zcloner&   r&   r'   	configure   s    


zHSAKernelBase.configure@   c             C   s   | j |t|||dS )z6Simplified configuration for 1D kernel launch
        )rZ   )r]   min)rB   ZnelemrY   rZ   r&   r&   r'   forall   s    zHSAKernelBase.forallc             C   sl   t |d }t |d }tt|t|}t|| t|| dd t||D }| j||f|dd  S )a*  Mimick CUDA python's square-bracket notation for configuration.
        This assumes a the argument to be:
            `griddim, blockdim, stream`
        The blockdim maps directly to local_size.
        The actual global_size is computed by multiplying the local_size to
        griddim.
        r   r   c             S   s   g | ]\}}|| qS r&   r&   ).0glr&   r&   r'   
<listcomp>   s    z-HSAKernelBase.__getitem__.<locals>.<listcomp>   N)rO   r\   rP   rV   zipr]   )rB   r   ZgriddimZblockdimrS   Zgsr&   r&   r'   __getitem__   s    

zHSAKernelBase.__getitem__)NN)r^   N)	r5   r6   r7   rJ   rI   r[   r]   r`   rg   r&   r&   r&   r'   rW      s   

rW   Z_CachedEntrysymbol
executablekernarg_regionc               @   s   e Zd Zdd Zdd ZdS )_CachedProgramc             C   s   || _ || _i | _d S )N)_entry_name_binary_cache)rB   
entry_namebinaryr&   r&   r'   rI      s    z_CachedProgram.__init__c             C   s  t  }| j|}|d krd| j}|j}t| j}t	j
t| j }||}t }tjt	|t| jd t	| t|}	t }
|
||	 |
  |
||}|jj}x$|D ]}|jr|tjr|}P qW |d k	stt ||
|d}|| j|< ||fS )Nz{0})rh   ri   rj   )!r
   get_contextrn   getformatrl   agent	bytearrayrm   ctypesc_byterP   Zfrom_bufferr   Zhsa_code_object_tr   r   Zhsa_code_object_deserializeZ	addressofZbyrefZ
CodeObjectZ
ExecutableloadZfreezeZ
get_symbolregionsglobalsZhost_accessibleZsupportsr   ZHSA_REGION_GLOBAL_FLAG_KERNARGr@   _CacheEntry)rB   ctxresultrh   rt   ZbaZbblobZbasZcode_ptrcodeexZsymobjry   Zregrj   r&   r&   r'   rr      s<    





z_CachedProgram.getN)r5   r6   r7   rI   rr   r&   r&   r&   r'   rk      s   rk   c                   sH   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	  Z
S )r1   z
    A HSA kernel object
    c                sX   t t|   || _|  \| _| _|| _t|| _	g | _
t| j| jd| _|   d S )N)ro   rp   )superr1   rI   _llvm_module_generateGCNassemblyrp   ro   rL   argument_typesZ_arglocrk   
_cacheprog_parse_kernel_resource)rB   r)   r*   r+   )	__class__r&   r'   rI     s    
zHSAKernel.__init__c             C   s@   t d| j}t|d| _t d| j}t|d| _dS )z9
        Temporary workaround for register limit
        z"\bwavefront_sgpr_count\s*=\s*(\d+)r   z!\bworkitem_vgpr_count\s*=\s*(\d+)N)researchr   intgroup_wavefront_sgpr_count_workitem_vgpr_count)rB   mr&   r&   r'   r     s    z HSAKernel._parse_kernel_resourcec             C   sH   t | j}tj|| j| jd}|jrDd}|d	|j
}t|d S )N)
group_sizeZvgpr_per_workitemZsgpr_per_wavez2insufficient resources to launch kernel due to:
{}
)npZprodrY   r   Zget_limiting_factorsr   r   Zreasonsrs   joinZsuggestionsr   )rB   r   ZlimitsZfmtmsgr&   r&   r'   _sentry_resource_limit  s    
z HSAKernel._sentry_resource_limitc             C   s    t  }|t| j | S )N)r	   ZModuleZ	load_llvmstrr   ZgenerateGCN)rB   Zhlcmodr&   r&   r'   r   )  s    zHSAKernel._generateGCNc             C   sP   | j  \}}|jjdkr<ttj|jj }|j|}nd}||j||jfS )z'
        Bind kernel to device
        r   N)	r   rr   rh   Zkernarg_segment_sizerv   sizeofrw   rj   allocate)rB   r|   entryZszkernargsr&   r&   r'   bind.  s    

zHSAKernel.bindc                sh  |    |  \}} g }g }x&t| j|D ]\}}t|||| q.W d}xX|D ]P}	t|	}
t|
|}||7 }j| }t	|t
t|	}|	|d< ||
7 }qRW |j}| jd krt  d }| jd k	rtd}|| j  |j|| j| j|d | jd k	r| j| x|D ]}|  qW d k	rd| jd krN  n| j fdd d S )Nr   r   )Zworkgroup_sizeZ	grid_sizesignalc                  s
     S )N)freer&   )rj   r   r&   r'   <lambda>s  s    z$HSAKernel.__call__.<locals>.<lambda>)r   r   rf   r   _unpack_argumentrv   r   _calc_padding_for_alignmentvaluecastZPOINTERtypeZdefault_queuerZ   r   Zimplicit_syncZcreate_signalZinsert_barrierZ_get_last_signaldispatchrY   rX   Z_add_signalr   Z_add_callback)rB   r   r|   rh   Zexpanded_valuesretrtyrN   baseavalignZpadZoffsetedZasptrZqqr   wbr&   )rj   r   r'   __call__<  s@    








zHSAKernel.__call__)r5   r6   r7   rJ   rI   r   r   r   r   r   __classcell__r&   r&   )r   r'   r1     s   	r1   c                s  t | tjrtj}trLtt	 \ }|rD|
 fdd  j}ntjj}td }}|j}	|jj}
|
| |
| |
|	 |
|
 |
| x&tjD ]}|
|j|  qW x&tjD ]}|
|j|  qW nt | tjr0ttd|  }|
| n| tjkrRt}|
| n| tjkrtt}|
| n| tjkrtt}|
| nn| tjkr|
tj |
tj  n<| tj!kr|
tj |
tj  n
t"| dS )z>
    Convert arguments to ctypes and append to kernelargs
    c                  s
     S )N)Zcopy_to_hostr&   )devaryrN   r&   r'   r     s    z"_unpack_argument.<locals>.<lambda>r   zc_%sN)#rK   r   ZArrayrv   Z	c_ssize_tr   r   Zauto_devicer
   rq   rR   Zdevice_ctypes_pointerZc_void_pdatarS   ZdtypeitemsizerQ   ndimshapestridesZIntegergetattrZfloat64Zc_doubleZfloat32Zc_floatZbooleanZc_uint8r   Z	complex64realimagZ
complex128NotImplementedError)r   rN   Z
kernelargsr   Zc_intpZconvr   ZmeminfoparentZnitemsr   ZaxZcvalr&   )r   rN   r'   r   v  sN    







r   c             C   s$   t ||  }|dkrdS | | S dS )zV
    Returns byte padding required to move the base pointer into proper alignment
    r   N)r   )r   r   Zrmdrr&   r&   r'   r     s    r   c                   s,   e Zd Z fddZdd Zdd Z  ZS )AutoJitHSAKernelc                s2   t t|   || _i | _ddlm} |j| _d S )Nr   )r   )r   r   rI   rG   definitionsr   r   r   )rB   r   r   )r   r&   r'   rI     s
    zAutoJitHSAKernel.__init__c             G   s*   | j | }|| j| j| j}||  d S )N)
specializer]   rX   rY   rZ   )rB   r   r3   Zcfgr&   r&   r'   r     s    
zAutoJitHSAKernel.__call__c                sD   t  fdd|D } j|}|d kr@t j|}| j|< |S )Nc                s   g | ]} j |qS r&   )r   Zresolve_argument_type)ra   a)rB   r&   r'   rd     s   z/AutoJitHSAKernel.specialize.<locals>.<listcomp>)rL   r   rr   r4   rG   )rB   r   r+   r3   r&   )rB   r'   r     s    

zAutoJitHSAKernel.specialize)r5   r6   r7   rI   r   r   r   r&   r&   )r   r'   r     s   	r   )F)F)2Z
__future__r   r   r[   collectionsr   r   Znumpyr   Znumba.typing.templatesr   Znumbar   r   r	   Zhsadrvr
   r   r   r   Zhsadrv.errorr    r   Znumba.roc.hsadrv.driverr   r   r   r   r   rv   r   Znumba.compiler_lockr   r(   r4   r?   rF   objectrE   r<   rO   rV   rW   r{   rk   r1   r   r   r   r&   r&   r&   r'   <module>   s@   


$63s9