B
      ›\U  ã            
   @   s¤  d dl mZmZmZ d dlmZ d dlZd dlmZ d dlm	  m
Z d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d	d
lmZ d	dlmZmZ eƒ ZejZedejƒdd„ ƒZedejƒdd„ ƒZedejƒdd„ ƒZedejƒdd„ ƒZ edejƒdd„ ƒZ!edejƒdd„ ƒZ"dd„ Z#x$ej$ %¡ D ]Z&ee&ƒe#e&ƒƒ q6W ed ej'ƒd!d"„ ƒZ(d a)d#d$„ Z*ed%ejej+ƒd&d'„ ƒZ,ed%ej-ej+ƒd(d)„ ƒZ.ed*ejej+ƒd+d,„ ƒZ/ed*ej-ej+ƒd-d.„ ƒZ0eej1ƒd/d0„ ƒZ2eej3ej4ƒd1d2„ ƒZ5eej6ej4ƒd3d4„ ƒZ7eej8ej4ƒd5d6„ ƒZ9eej:ƒd7d8„ ƒZ;eej<ƒd9d:„ ƒZ=eej>ƒd;d<„ ƒZ?eej@ej4ƒd=d>„ ƒZAeejBej4ej4ej4ej4ej4ƒeejBej4ej4ejCej4ej4ƒeejBej4ej4ejDej4ej4ƒeejBej4ej4ejEej4ej4ƒd?d@„ ƒƒƒƒZFeejGej4ej4ejHƒdAdB„ ƒZIeejJej4ej4ƒeejJej4ejCƒeejJej4ejDƒeejJej4ejEƒdCdD„ ƒƒƒƒZKeejLej4ej4ƒeejLej4ejCƒeejLej4ejDƒeejLej4ejEƒdEdF„ ƒƒƒƒZMeejNej+ƒdGdH„ ƒZOeejPej+ej+ej+ƒdIdJ„ ƒZQeejRejSƒdKdL„ ƒZTeejRejUƒdMdN„ ƒZVeejWej+ƒdOdP„ ƒZXeejYej+ƒdQdR„ ƒZZeej[ej+ej+ej+ƒdSdT„ ƒZ\dUdV„ Z]dWdX„ Z^eej_j`ej'ejej+ƒeej_j`ej'ej-ej+ƒeej_j`ej'ejaej+ƒe^dYdZ„ ƒƒƒƒZbeej_jcej'ejej+ƒeej_jcej'ejaej+ƒeej_jcej'ej-ej+ƒe^d[d\„ ƒƒƒƒZdeej_jeej'ejej+ƒeej_jeej'ejaej+ƒeej_jeej'ej-ej+ƒe^d]d^„ ƒƒƒƒZfeej_jgej'ej+ej+ƒd_d`„ ƒZhdadb„ Zididdde„Zjdjdgdh„ZkdS )ké    )Úprint_functionÚabsolute_importÚdivision)ÚreduceN)ÚType)ÚRegistry)Úcgutils)Úsix)Útypesé   )Únvvm)Ú	nvvmutilsÚstubszptx.grid.1dc             C   s   t |ƒdkst‚tj|ddS )Nr   )Údim)ÚlenÚAssertionErrorr   Úget_global_id)ÚcontextÚbuilderÚsigÚargs© r   ú2lib/python3.7/site-packages/numba/cuda/cudaimpl.pyÚ
ptx_grid1d   s    r   zptx.grid.2dc             C   s2   t |ƒdkst‚tj|dd\}}t |||g¡S )Nr   é   )r   )r   r   r   r   r   Ú
pack_array)r   r   r   r   Úr1Úr2r   r   r   Ú
ptx_grid2d   s    r   zptx.grid.3dc             C   s6   t |ƒdkst‚tj|dd\}}}t ||||g¡S )Nr   é   )r   )r   r   r   r   r   r   )r   r   r   r   r   r   Úr3r   r   r   Ú
ptx_grid3d"   s    r!   zptx.gridsize.1dc             C   s8   t |ƒdkst‚t |d¡}t |d¡}| ||¡}|S )Nr   zntid.xznctaid.x)r   r   r   Ú	call_sregÚmul)r   r   r   r   ÚntidxÚnctaidxZresr   r   r   Úptx_gridsize1d)   s
    r&   zptx.gridsize.2dc       
      C   sh   t |ƒdkst‚t |d¡}t |d¡}t |d¡}t |d¡}| ||¡}| ||¡}	t |||	g¡S )Nr   zntid.xznctaid.xzntid.yznctaid.y)r   r   r   r"   r#   r   r   )
r   r   r   r   r$   r%   ÚntidyÚnctaidyr   r   r   r   r   Úptx_gridsize2d3   s    r)   zptx.gridsize.3dc             C   sŽ   t |ƒdkst‚t |d¡}t |d¡}t |d¡}t |d¡}t |d¡}t |d¡}	| ||¡}
| ||¡}| ||	¡}t ||
||g¡S )Nr   zntid.xznctaid.xzntid.yznctaid.yzntid.zznctaid.z)r   r   r   r"   r#   r   r   )r   r   r   r   r$   r%   r'   r(   ZntidzZnctaidzr   r   r    r   r   r   Úptx_gridsize3dA   s    r*   c                s   ‡ fdd„}|S )Nc                s   |rt ‚t |ˆ ¡S )N)r   r   r"   )r   r   r   r   )Úsregr   r   Úptx_sreg_implV   s    z(ptx_sreg_template.<locals>.ptx_sreg_implr   )r+   r,   r   )r+   r   Úptx_sreg_templateU   s    r-   zptx.cmem.arylikec          	      sL  |j }|\}|j}‡ fdd„t |jdd¡D ƒ}tj t 	d¡|¡}t
j}	|j|jd|	d}
tj|
_d|
_||
_ˆ  |j¡}ˆ  |¡}d	|d
  ¡  |
_t |t 	d¡|	¡}|
 t t 	d¡|	¡¡}| ||g¡}ˆ  |¡ˆ |ƒ}‡ fdd„|jD ƒ}‡ fdd„|jD ƒ}ˆ j|| ||j j¡t! "||¡t! "||¡|j#|j$d d | %¡ S )Nc                s   g | ]}ˆ   tj|¡‘qS r   )Úget_constantr
   Zbyte)Ú.0Úi)r   r   r   ú
<listcomp>k   s   z$ptx_cmem_arylike.<locals>.<listcomp>ÚA)Úorderé   Z_cudapy_cmem)ÚnameÚ	addrspaceTr   r   c                s   g | ]}ˆ   tj|¡‘qS r   )r.   r
   Úintp)r/   Ús)r   r   r   r1   ƒ   s    c                s   g | ]}ˆ   tj|¡‘qS r   )r.   r
   r7   )r/   r8   )r   r   r   r1   „   s    )ÚdataÚshapeÚstridesÚitemsizeÚparentÚmeminfo)&ÚmoduleZreturn_typer	   Z	iterbytesÚtobytesÚlcÚConstantÚarrayr   Úintr   ZADDRSPACE_CONSTANTÚadd_global_variableÚtypeZLINKAGE_INTERNALÚlinkageZglobal_constantÚinitializerÚget_data_typeÚdtypeÚget_abi_sizeofÚ
bit_lengthÚalignr   Úinsert_addrspace_convÚbitcastÚpointerÚcallÚ
make_arrayr:   r;   Úpopulate_arrayr9   r   r   r<   r=   Ú	_getvalue)r   r   r   r   ÚlmodZarrÚarytyZ	constvalsZconstaryr6   ZgvÚlldtyperM   ÚconvÚaddrspaceptrZgenptrÚaryÚkshapeÚkstridesr   )r   r   Úptx_cmem_aryliked   s:    




r]   c             C   s   t d7 a d | t ¡S )zÍDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})Ú_unique_smem_idÚformat)r5   r   r   r   Ú_get_unique_smem_id“   s    r`   zptx.smem.allocc          	   C   s&   |\}}t | ||f|tdƒtjddS )NÚ_cudapy_smemT)r:   rJ   Úsymbol_namer6   Úcan_dynsized)Ú_generic_arrayr`   r   ÚADDRSPACE_SHARED)r   r   r   r   ÚlengthrJ   r   r   r   Úptx_smem_alloc_intp   s
    rg   c          	   C   s$   |\}}t | |||tdƒtjddS )Nra   T)r:   rJ   rb   r6   rc   )rd   r`   r   re   )r   r   r   r   r:   rJ   r   r   r   Úptx_smem_alloc_array¦   s
    
rh   zptx.lmem.allocc          	   C   s"   |\}}t | ||f|dtjddS )NÚ_cudapy_lmemF)r:   rJ   rb   r6   rc   )rd   r   ÚADDRSPACE_LOCAL)r   r   r   r   rf   rJ   r   r   r   Úptx_lmem_alloc_intp¯   s
    rk   c          	   C   s    |\}}t | |||dtjddS )Nri   F)r:   rJ   rb   r6   rc   )rd   r   rj   )r   r   r   r   r:   rJ   r   r   r   Úptx_lmem_alloc_array¸   s
    
rl   c             C   sD   |rt ‚d}|j}t t ¡ d¡}|j||d}| |d¡ |  ¡ S )Nzllvm.nvvm.barrier0r   )r5   )r   r?   r   ÚfunctionÚvoidÚget_or_insert_functionrQ   Úget_dummy_value)r   r   r   r   ÚfnamerU   ÚfntyÚsyncr   r   r   Úptx_syncthreadsÁ   s    rt   c             C   s>   d}|j }t t d¡t d¡f¡}|j||d}| ||¡S )Nzllvm.nvvm.barrier0.popcé    )r5   )r?   r   rm   rD   ro   rQ   )r   r   r   r   rq   rU   rr   rs   r   r   r   Úptx_syncthreads_countÌ   s
    rv   c             C   s>   d}|j }t t d¡t d¡f¡}|j||d}| ||¡S )Nzllvm.nvvm.barrier0.andru   )r5   )r?   r   rm   rD   ro   rQ   )r   r   r   r   rq   rU   rr   rs   r   r   r   Úptx_syncthreads_andÕ   s
    rw   c             C   s>   d}|j }t t d¡t d¡f¡}|j||d}| ||¡S )Nzllvm.nvvm.barrier0.orru   )r5   )r?   r   rm   rD   ro   rQ   )r   r   r   r   rq   rU   rr   rs   r   r   r   Úptx_syncthreads_orÞ   s
    rx   c             C   sD   |rt ‚d}|j}t t ¡ d¡}|j||d}| |d¡ |  ¡ S )Nzllvm.nvvm.membar.ctar   )r5   )r   r?   r   rm   rn   ro   rQ   rp   )r   r   r   r   rq   rU   rr   rs   r   r   r   Úptx_threadfence_blockç   s    ry   c             C   sD   |rt ‚d}|j}t t ¡ d¡}|j||d}| |d¡ |  ¡ S )Nzllvm.nvvm.membar.sysr   )r5   )r   r?   r   rm   rn   ro   rQ   rp   )r   r   r   r   rq   rU   rr   rs   r   r   r   Úptx_threadfence_systemò   s    rz   c             C   sD   |rt ‚d}|j}t t ¡ d¡}|j||d}| |d¡ |  ¡ S )Nzllvm.nvvm.membar.glr   )r5   )r   r?   r   rm   rn   ro   rQ   rp   )r   r   r   r   rq   rU   rr   rs   r   r   r   Úptx_threadfence_deviceý   s    r{   c             C   sD   d}|j }t t ¡ t d¡f¡}|j||d}| ||¡ |  ¡ S )Nzllvm.nvvm.bar.warp.syncru   )r5   )r?   r   rm   rn   rD   ro   rQ   rp   )r   r   r   r   rq   rU   rr   rs   r   r   r   Úptx_warp_sync  s    r|   c          
   C   sü  |\}}}}}|j d }	|	tjkr6| |t |	j¡¡}d}
|j}t t 	t d¡t d¡f¡t d¡t d¡t d¡t d¡t d¡f¡}|j
||
d}|	jdkrü| ||||||f¡}|	tjkrú| |d¡}| |d¡}| |t ¡ ¡}t |||f¡}nü| |t d¡¡}| ||  tjd¡¡}| |t d¡¡}| ||||||f¡}| ||||||f¡}| |d¡}| |d¡}| |d¡}| |t d¡¡}| |t d¡¡}| ||  tjd¡¡}| ||¡}|	tjkrè| |t ¡ ¡}t |||f¡}|S )a‹  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic function supports
    both 32 and 64 bit ints and floats, so for feature parity, i64, f32, and f64 are implemented.
    Floats by way of bitcasting the float to an int, then shuffling, then bitcasting back.
    And 64-bit values by packing them into 2 32bit values, shuffling thoose, and then packing back together.
    r   zllvm.nvvm.shfl.sync.i32ru   r   )r5   r   é@   )r   r
   Úreal_domainrO   r   rD   Úbitwidthr?   rm   Ústructro   rQ   Úfloat32Zextract_valueÚfloatr   Zmake_anonymous_structZtruncZlshrr.   Úi8ZzextZshlÚor_Úfloat64Zdouble)r   r   r   r   ÚmaskÚmodeÚvalueÚindexZclampZ
value_typerq   rU   rr   ÚfuncZretÚrvZpredZfvZvalue1Z
value_lshrZvalue2Zret1Zret2Zrv1Zrv2Zrv1_64Zrv2_64Zrv_shlr   r   r   Úptx_shfl_sync_i32  sB    

.

rŒ   c             C   s^   d}|j }t t t d¡t d¡f¡t d¡t d¡t d¡f¡}|j||d}| ||¡S )Nzllvm.nvvm.vote.syncru   r   )r5   )r?   r   rm   r€   rD   ro   rQ   )r   r   r   r   rq   rU   rr   rŠ   r   r   r   Úptx_vote_syncB  s    r   c             C   s†   |\}}|j d j}|j d tjkr6| |t |¡¡}d |¡}|j}t 	t d¡t d¡t |¡f¡}	|j
|	|d}
| |
||f¡S )Nr   zllvm.nvvm.match.any.sync.i{}ru   )r5   )r   r   r
   r~   rO   r   rD   r_   r?   rm   ro   rQ   )r   r   r   r   r†   rˆ   Úwidthrq   rU   rr   rŠ   r   r   r   Úptx_match_any_syncL  s    
"r   c             C   s–   |\}}|j d j}|j d tjkr6| |t |¡¡}d |¡}|j}t 	t 
t d¡t d¡f¡t d¡t |¡f¡}	|j|	|d}
| |
||f¡S )Nr   zllvm.nvvm.match.all.sync.i{}ru   )r5   )r   r   r
   r~   rO   r   rD   r_   r?   rm   r€   ro   rQ   )r   r   r   r   r†   rˆ   rŽ   rq   rU   rr   rŠ   r   r   r   Úptx_match_all_sync\  s    
r   c             C   s   |  |d ¡S )Nr   )Zctpop)r   r   r   r   r   r   r   Úptx_popcm  s    r‘   c             C   s
   |j |Ž S )N)Úfma)r   r   r   r   r   r   r   Úptx_fmar  s    r“   c             C   s6   |j  tj tj d¡tj d¡f¡d¡}| ||¡S )Nru   Z	__nv_brev)r?   ro   rA   r   rm   rD   rQ   )r   r   r   r   Úfnr   r   r   Úptx_brev_u4w  s    r•   c             C   s6   |j  tj tj d¡tj d¡f¡d¡}| ||¡S )Nr}   Z__nv_brevll)r?   ro   rA   r   rm   rD   rQ   )r   r   r   r   r”   r   r   r   Úptx_brev_u8‚  s    r–   c             C   s   |  |d |  tjd¡¡S )Nr   )Zctlzr.   r
   Úboolean)r   r   r   r   r   r   r   Úptx_clz  s    r˜   c             C   s   |  |d |  tjd¡¡S )Nr   )Zcttzr.   r
   r—   )r   r   r   r   r   r   r   Úptx_ffs”  s    r™   c             C   s   |\}}}|  |||¡S )N)Zselect)r   r   r   r   ZtestÚaÚbr   r   r   Úptx_selp›  s    
rœ   c                sV   |t jkr t j|dd}|g}ntjˆ |t|ƒd}‡ ‡fdd„t||ƒD ƒ}||fS )z4
    Convert integer indices into tuple of intp
    r   )rJ   Úcount)r   c                s"   g | ]\}}ˆ  ˆ ||tj¡‘qS r   )Úcastr
   r7   )r/   Útr0   )r   r   r   r   r1   ª  s   z&_normalize_indices.<locals>.<listcomp>)r
   Zinteger_domainÚUniTupler   Zunpack_tupler   Úzip)r   r   ÚindtyÚindsÚindicesr   )r   r   r   Ú_normalize_indices¡  s    
r¥   c                s   ‡ fdd„}|S )Nc                sœ   |j \}}}|\}}}	|j}
t| |||ƒ\}}|
|krFtd|
|f ƒ‚|jt|ƒkrjtd|jt|ƒf ƒ‚|  |¡| ||ƒ}t ||||¡}ˆ | ||
||	ƒS )Nzexpect %s but got %sz#indexing %d-D array with %d-D index)	r   rJ   r¥   Ú	TypeErrorÚndimr   rR   r   Úget_item_pointer)r   r   r   r   rV   r¢   ÚvaltyrZ   r£   ÚvalrJ   r¤   ÚlaryÚptr)Údispatch_fnr   r   Úimp°  s    
z_atomic_dispatcher.<locals>.impr   )r­   r®   r   )r­   r   Ú_atomic_dispatcher¯  s    r¯   c             C   s`   |t jkr&|j}| t |¡||f¡S |t jkrL|j}| t |¡||f¡S | d||d¡S d S )NÚaddÚ	monotonic)	r
   r   r?   rQ   r   Zdeclare_atomic_add_float32r…   Zdeclare_atomic_add_float64Ú
atomic_rmw)r   r   rJ   r¬   rª   rU   r   r   r   Úptx_atomic_add_tupleÆ  s    

r³   c             C   sš   |j }|tjkr&| t |¡||f¡S |tjkrF| t |¡||f¡S |tjtj	fkrh|j
d||ddS |tjtjfkrŠ|j
d||ddS td| ƒ‚d S )NÚmaxr±   )ÚorderingZumaxz&Unimplemented atomic max with %s array)r?   r
   r…   rQ   r   Zdeclare_atomic_max_float64r   Zdeclare_atomic_max_float32Úint32Úint64r²   Úuint32Úuint64r¦   )r   r   rJ   r¬   rª   rU   r   r   r   Úptx_atomic_maxÕ  s    

rº   c             C   sš   |j }|tjkr&| t |¡||f¡S |tjkrF| t |¡||f¡S |tjtj	fkrh|j
d||ddS |tjtjfkrŠ|j
d||ddS td| ƒ‚d S )NÚminr±   )rµ   Zuminz&Unimplemented atomic min with %s array)r?   r
   r…   rQ   r   Zdeclare_atomic_min_float64r   Zdeclare_atomic_min_float32r¶   r·   r²   r¸   r¹   r¦   )r   r   rJ   r¬   rª   rU   r   r   r   Úptx_atomic_minç  s    

r¼   c             C   sˆ   |j \}}}|\}}}	|j}
|  |¡| ||ƒ}|  tjd¡}t ||||f¡}|jtjkrx|j	}| 
t |¡|||	f¡S td|
 ƒ‚d S )Nr   z3Unimplemented atomic compare_and_swap with %s array)r   rJ   rR   r.   r
   r7   r   r¨   r¶   r?   rQ   r   Zdeclare_atomic_cas_int32r¦   )r   r   r   r   rV   Zoldtyr©   rZ   Úoldrª   rJ   r«   Zzeror¬   rU   r   r   r   Úptx_atomic_cas_tupleù  s    
r¾   c             C   s   t  tj| j ¡S )N)ÚllZcreate_target_datar   Zdata_layoutZaddress_size)r   r   r   r   Ú_get_target_data  s    rÀ   Fc             C   sì   t tj|ƒ}|  |¡}t ||¡}	|tjkr>tj	||	|d}
nž|j
}| |	||¡}|  |¡|_|dkr~|rttj|_qŒtdƒ‚ntj |	¡|_|tjkr¢td| ƒ‚t |t d¡|¡}| t t d¡|¡¡}| ||g¡}
t| ||
||ƒS )N)r5   r   zarray length <= 0zunsupported type: %sr4   )r   Úoperatorr#   rI   r   rC   r   rj   r   Zalloca_oncer?   rE   rK   rM   rA   ZLINKAGE_EXTERNALrG   Ú
ValueErrorrB   ZundefrH   r
   Znumber_domainr¦   r   rN   rD   rO   rP   rQ   Ú_make_array)r   r   r:   rJ   rb   r6   rc   Z	elemcountrW   ZlarytyÚdataptrrU   ZgvmemrX   rY   r   r   r   rd     s&    




rd   ÚCc          	      sø   t |ƒ}tj||dd}ˆ  |¡ˆ |ƒ}tˆ ƒ}	ˆ  |¡}
|
 |	¡}|g}x2tt|dd … ƒƒD ]\}}| 	||d  ¡ q`W dd„ t|ƒD ƒ}‡ fdd„|D ƒ}‡ fdd„|D ƒ}ˆ j
|| ||jj¡t ||¡t ||¡ˆ  tj|¡d d	 | ¡ S )
NrÅ   )rJ   r§   Úlayoutr   éÿÿÿÿc             S   s   g | ]}|‘qS r   r   )r/   r8   r   r   r   r1   I  s    z_make_array.<locals>.<listcomp>c                s   g | ]}ˆ   tj|¡‘qS r   )r.   r
   r7   )r/   r8   )r   r   r   r1   K  s    c                s   g | ]}ˆ   tj|¡‘qS r   )r.   r
   r7   )r/   r8   )r   r   r   r1   L  s    )r9   r:   r;   r<   r>   )r   r
   ÚArrayrR   rÀ   rI   Zget_abi_sizeÚ	enumerateÚreversedÚappendrS   rO   r9   rF   r   r   r.   r7   rT   )r   r   rÄ   rJ   r:   rÆ   r§   rV   rZ   Z
targetdatarW   r<   Zrstridesr0   Zlastsizer;   r[   r\   r   )r   r   rÃ   <  s&    



rÃ   )F)rÅ   )lZ
__future__r   r   r   Ú	functoolsr   rÁ   Zllvmlite.llvmpy.corer   ZllvmpyZcorerA   Zllvmlite.bindingZbindingr¿   Znumba.targets.imputilsr   Znumbar   r	   r
   Zcudadrvr   Ú r   r   ÚregistryÚlowerr7   r   r   r!   r&   r)   r*   r-   ZSREG_MAPPINGÚkeysr+   rÈ   r]   r^   r`   ZAnyrg   r    rh   rk   rl   Zsyncthreadsrt   Zsyncthreads_countZi4rv   Zsyncthreads_andrw   Zsyncthreads_orrx   Zthreadfence_blockry   Zthreadfence_systemrz   Zthreadfencer{   Zsyncwarpr|   Zshfl_sync_intrinsicrƒ   Zf4Zf8rŒ   Zvote_sync_intrinsicr—   r   Zmatch_any_syncr   Zmatch_all_syncr   Zpopcr‘   r’   r“   ZbrevZu4r•   Úu8r–   Zclzr˜   Zffsr™   Zselprœ   r¥   r¯   Zatomicr°   ZTupler³   r´   rº   r»   r¼   Zcompare_and_swapr¾   rÀ   rd   rÃ   r   r   r   r   Ú<module>   s–   
	,
							
,-
   
+