o
    Tæ«dî˜  ã                
   @   s¤  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z!m"Z" e
ƒ Z#e#j$Z$e#j%Z&e#j'Z'dd„ Z(e&e )e¡dƒdd„ ƒZ*e&e )e¡dƒdd„ ƒZ+e&e )e¡dƒdd„ ƒZ,e&e )e¡dƒdd„ ƒZ-e&e )e¡dƒdd„ ƒZ.e&e d ƒd!d"„ ƒZ/e&e d#ƒd$d%„ ƒZ0e&e d&ƒd'd(„ ƒZ1e$ej2j3ƒd)d*„ ƒZ4e$d+e!ƒd,d-„ ƒZ5e$ej6j7ej8ƒd.d/„ ƒZ9d a:d0d1„ Z;e$ej<j=ej>ej?ƒd2d3„ ƒZ@e$ej<j=ejAej?ƒe$ej<j=ejBej?ƒd4d5„ ƒƒZCe$ejDj=ej>ej?ƒd6d7„ ƒZEe$ejDj=ejAej?ƒe$ejDj=ejBej?ƒd8d9„ ƒƒZFe$ejGƒd:d;„ ƒZHe$ejIƒd<d=„ ƒZJe$ejKƒd>d?„ ƒZLe$ejMƒd@dA„ ƒZNe$ejMejOƒdBdC„ ƒZPe$ejQejOejOejOejOejOƒe$ejQejOejOejRejOejOƒe$ejQejOejOejSejOejOƒe$ejQejOejOejTejOejOƒdDdE„ ƒƒƒƒZUe$ejVejOejOejWƒdFdG„ ƒZXe$ejYejOejOƒe$ejYejOejRƒe$ejYejOejSƒe$ejYejOejTƒdHdI„ ƒƒƒƒZZe$ej[ejOejOƒe$ej[ejOejRƒe$ej[ejOejSƒe$ej[ejOejTƒdJdK„ ƒƒƒƒZ\e$ej]ƒdLdM„ ƒZ^e$ej_ƒdNdO„ ƒZ`e$ejaej?ƒdPdQ„ ƒZbe$ejcej?ej?ej?ƒdRdS„ ƒZddTdU„ ZeeejfejgƒdVdW„ ƒZheejgejfƒdXdY„ ƒZidZd[„ Zjeejfejkƒd\d]„ ƒZleejkejfƒeej>ejfƒd^d_„ ƒƒZmd`da„ Znenejojpdbƒ enejqdbƒ enejrdbƒ enejojsdcƒ enejtdcƒ enejudcƒ enejojvddƒ enejwddƒ enejxddƒ e$ejojyejfƒdedf„ ƒZze$ej{ejfƒdgdh„ ƒZ|e$ejoj}ejfƒdidj„ ƒZ~e$eejfƒdkdl„ ƒZ€e$ejojejfejfejfƒdmdn„ ƒZ‚e$ejƒejfejfƒe$ej„ejfejfƒdodp„ ƒƒZ…dqZ†drds„ Z‡e$ejojˆejfejfƒe‡dtƒƒ e$ej‰ejfejfƒe‡dtƒƒ e$ejojŠejfejfƒe‡duƒƒ e$ej‹ejfejfƒe‡duƒƒ e$ejojŒejfejfƒe‡dvƒƒ e$ejejfejfƒe‡dvƒƒ e$ejojŽejfejfƒe‡dwƒƒ e$ejejfejfƒe‡dwƒƒ e$ejojejfejfƒe‡dxƒƒ e$ej‘ejfejfƒe‡dxƒƒ e$ejoj’ejfejfƒe‡dyƒƒ e$ej“ejfejfƒe‡dyƒƒ dzd{„ Z”e”ejoj•d|dwƒ e”ejoj–d}dyƒ ej—d~ej˜diZ™e$ejšej—ƒe$ejšej˜ƒd€d„ ƒƒZ›e$ejœejƒd‚dƒ„ ƒZže$ejœejŸƒd„d…„ ƒZ e$ej¡ej?ƒd†d‡„ ƒZ¢e$ej£ejOƒe$ej£ejƒdˆd‰„ ƒƒZ¤e$ej£ejRƒe$ej£ejŸƒdŠd‹„ ƒƒZ¥e$ej¦ej?ej?ej?ƒdŒd„ ƒZ§e$e¨ejSejSƒdŽd„ ƒZ©e$e¨ejTejSƒe$e¨ejSejTƒe$e¨ejTejTƒdd‘„ ƒƒƒZªe$e«ejSejSƒd’d“„ ƒZ¬e$e«ejTejSƒe$e«ejSejTƒe$e«ejTejTƒd”d•„ ƒƒƒZ­e$e®ejSƒe$e®ejTƒd–d—„ ƒƒZ¯e$e®ejSejkƒe$e®ejTejkƒd˜d™„ ƒƒZ°dšd›„ Z±ej²dœ Z³dœej² Z´e$ejµejSƒe±e³ƒƒ e$ejµejTƒe±e³ƒƒ e$ej¶ejSƒe±e´ƒƒ e$ej¶ejTƒe±e´ƒƒ ddž„ Z·dŸd „ Z¸e$ej¹jqej8ejºej?ƒe$ej¹jqej8ejBej?ƒe$ej¹jqej8ejAej?ƒe¸d¡d¢„ ƒƒƒƒZ»e$ej¹jtej8ejºej?ƒe$ej¹jtej8ejBej?ƒe$ej¹jtej8ejAej?ƒe¸d£d¤„ ƒƒƒƒZ¼e$ej¹j½ej8ejºej?ƒe$ej¹j½ej8ejBej?ƒe$ej¹j½ej8ejAej?ƒe¸d¥d¦„ ƒƒƒƒZ¾e$ej¹j¿ej8ejºej?ƒe$ej¹j¿ej8ejBej?ƒe$ej¹j¿ej8ejAej?ƒe¸d§d¨„ ƒƒƒƒZÀd©dª„ ZÁeÁej¹jÂd«ƒ eÁej¹jÃd¬ƒ eÁej¹jÄd­ƒ e$ej¹jÅej8ejºej?ƒe$ej¹jÅej8ejBej?ƒe$ej¹jÅej8ejAej?ƒe¸d®d¯„ ƒƒƒƒZÆe$ej¹j¨ej8ejºej?ƒe$ej¹j¨ej8ejAej?ƒe$ej¹j¨ej8ejBej?ƒe¸d°d±„ ƒƒƒƒZÇe$ej¹j«ej8ejºej?ƒe$ej¹j«ej8ejAej?ƒe$ej¹j«ej8ejBej?ƒe¸d²d³„ ƒƒƒƒZÈe$ej¹jÉej8ejºej?ƒe$ej¹jÉej8ejAej?ƒe$ej¹jÉej8ejBej?ƒe¸d´dµ„ ƒƒƒƒZÊe$ej¹jËej8ejºej?ƒe$ej¹jËej8ejAej?ƒe$ej¹jËej8ejBej?ƒe¸d¶d·„ ƒƒƒƒZÌe$ej¹jÍej8ej?ej?ƒd¸d¹„ ƒZÎe$ej¹jÏej8ejºej?ej?ƒe$ej¹jÏej8ejAej?ej?ƒe$ej¹jÏej8ejBej?ej?ƒdºd»„ ƒƒƒZÐe$ejÑejÒƒd¼d½„ ƒZÓ	¾dÃd¿dÀ„ZÔe'e"ƒdÁdÂ„ ƒZÕee Ö¡ e$ƒ dS )Äé    )ÚreduceN)Úir)ÚRegistryÚ
lower_cast)Úparse_dtype)Úmodels)ÚtypesÚcgutils)Úufunc_db)Úregister_ufuncsé   )Únvvm)Úcuda)Ú	nvvmutilsÚstubsÚerrors)Údim3Ú
grid_groupÚCUDADispatcherc                 C   sB   t  | d| ¡}t  | d| ¡}t  | d| ¡}t | |||f¡S )Nz%s.xz%s.yz%s.z)r   Ú	call_sregr	   Úpack_struct)ÚbuilderÚprefixÚxÚyÚz© r   úf/home/ncw/WWW/www-new/content/articles/pi-bbp/venv/lib/python3.10/site-packages/numba/cuda/cudaimpl.pyÚinitialize_dim3   s   r   Ú	threadIdxc                 C   ó
   t |dƒS )NÚtid©r   ©Úcontextr   ÚsigÚargsr   r   r   Úcuda_threadIdx    ó   
r'   ÚblockDimc                 C   r    )NÚntidr"   r#   r   r   r   Úcuda_blockDim%   r(   r+   ÚblockIdxc                 C   r    )NÚctaidr"   r#   r   r   r   Úcuda_blockIdx*   r(   r.   ÚgridDimc                 C   r    )NÚnctaidr"   r#   r   r   r   Úcuda_gridDim/   r(   r1   Úlaneidc                 C   s   t  |d¡S )Nr2   )r   r   r#   r   r   r   Úcuda_laneid4   ó   r3   r   c                 C   ó   |  |d¡S ©Nr   ©Úextract_valuer#   r   r   r   Údim3_x9   r4   r9   r   c                 C   r5   ©Nr   r7   r#   r   r   r   Údim3_y>   r4   r;   r   c                 C   r5   )Né   r7   r#   r   r   r   Údim3_zC   r4   r=   c                 C   s(   |   tjd¡}|j}| t |¡|f¡S r:   )Úget_constantr   Úint32ÚmoduleÚcallr   Ú declare_cudaCGGetIntrinsicHandle)r$   r   r%   r&   ÚoneÚlmodr   r   r   Úcg_this_gridH   s   þrE   zGridGroup.syncc                 C   s0   |   tjd¡}|j}| t |¡g |¢|‘R ¡S r6   )r>   r   r?   r@   rA   r   Údeclare_cudaCGSynchronize)r$   r   r%   r&   ÚflagsrD   r   r   r   Úptx_sync_groupQ   s   þrH   c                 C   s   |d S r6   r   r#   r   r   r   Úcuda_const_array_like\   s   rI   c                 C   s   t d7 a d | t ¡S )zÍDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})Ú_unique_smem_idÚformat©Únamer   r   r   Ú_get_unique_smem_idf   s   rN   c              	   C   s8   |j d j}t|j d ƒ}t| ||f|tdƒtjddS )Nr   r   Ú_cudapy_smemT©ÚshapeÚdtypeÚsymbol_nameÚ	addrspaceÚcan_dynsized)r&   Úliteral_valuer   Ú_generic_arrayrN   r   ÚADDRSPACE_SHARED©r$   r   r%   r&   ÚlengthrR   r   r   r   Úcuda_shared_array_integerp   s   ýr[   c              	   C   s>   dd„ |j d D ƒ}t|j d ƒ}t| |||tdƒtjddS )Nc                 S   ó   g | ]}|j ‘qS r   ©rV   ©Ú.0Úsr   r   r   Ú
<listcomp>}   ó    z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   rO   TrP   )r&   r   rW   rN   r   rX   ©r$   r   r%   r&   rQ   rR   r   r   r   Úcuda_shared_array_tuplez   s   
ýrd   c              	   C   s4   |j d j}t|j d ƒ}t| ||f|dtjddS )Nr   r   Ú_cudapy_lmemFrP   )r&   rV   r   rW   r   ÚADDRSPACE_LOCALrY   r   r   r   Úcuda_local_array_integer…   s   ýrg   c              	   C   s:   dd„ |j d D ƒ}t|j d ƒ}t| |||dtjddS )Nc                 S   r\   r   r]   r^   r   r   r   ra   ’   rb   z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   re   FrP   )r&   r   rW   r   rf   rc   r   r   r   Úptx_lmem_alloc_array   s   
ýrh   c                 C   óD   |rJ ‚d}|j }t t ¡ d¡}t |||¡}| |d¡ |  ¡ S )Nzllvm.nvvm.membar.ctar   ©r@   r   ÚFunctionTypeÚVoidTyper	   Úget_or_insert_functionrA   Úget_dummy_value©r$   r   r%   r&   ÚfnamerD   ÚfntyÚsyncr   r   r   Úptx_threadfence_blockš   ó   rs   c                 C   ri   )Nzllvm.nvvm.membar.sysr   rj   ro   r   r   r   Úptx_threadfence_system¥   rt   ru   c                 C   ri   )Nzllvm.nvvm.membar.glr   rj   ro   r   r   r   Úptx_threadfence_device°   rt   rv   c                 C   s*   |   tjd¡}t tj¡}t| |||gƒS )Nl   ÿÿ )r>   r   r?   ÚnoneÚptx_syncwarp_mask)r$   r   r%   r&   ÚmaskÚmask_sigr   r   r   Úptx_syncwarp»   s   r{   c                 C   sD   d}|j }t t ¡ t d¡f¡}t |||¡}| ||¡ |  ¡ S )Nzllvm.nvvm.bar.warp.syncé    )	r@   r   rk   rl   ÚIntTyper	   rm   rA   rn   ro   r   r   r   rx   Â   s   rx   c              
   C   sü  |\}}}}}|j d }	|	tjv r| |t |	j¡¡}d}
|j}t t 	t d¡t d¡f¡t d¡t d¡t d¡t d¡t d¡f¡}t
 |||
¡}|	jdkr| ||||||f¡}|	tjkr}| |d¡}| |d¡}| |t ¡ ¡}t
 |||f¡}|S | |t d¡¡}| ||  tjd¡¡}| |t d¡¡}| ||||||f¡}| ||||||f¡}| |d¡}| |d¡}| |d¡}| |t d¡¡}| |t d¡¡}| ||  tjd¡¡}| ||¡}|	tjkrô| |t ¡ ¡}t
 |||f¡}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r<   zllvm.nvvm.shfl.sync.i32r|   r   r   é@   )r&   r   Úreal_domainÚbitcastr   r}   Úbitwidthr@   rk   ÚLiteralStructTyper	   rm   rA   Úfloat32r8   Ú	FloatTypeÚmake_anonymous_structÚtruncÚlshrr>   Úi8ÚzextÚshlÚor_Úfloat64Ú
DoubleType)r$   r   r%   r&   ry   ÚmodeÚvalueÚindexÚclampÚ
value_typerp   rD   rq   ÚfuncÚretÚrvÚpredÚfvÚvalue1Ú
value_lshrÚvalue2Úret1Úret2Úrv1Úrv2Úrv1_64Úrv2_64Úrv_shlr   r   r   Úptx_shfl_sync_i32Ì   sJ   

ÿþ

ñ
r¢   c                 C   s^   d}|j }t t t d¡t d¡f¡t d¡t d¡t d¡f¡}t |||¡}| ||¡S )Nzllvm.nvvm.vote.syncr|   r   )r@   r   rk   r‚   r}   r	   rm   rA   )r$   r   r%   r&   rp   rD   rq   r“   r   r   r   Úptx_vote_sync  s   ÿþr£   c                 C   s†   |\}}|j d j}|j d tjv r| |t |¡¡}d |¡}|j}t 	t d¡t d¡t |¡f¡}	t
 ||	|¡}
| |
||f¡S )Nr   zllvm.nvvm.match.any.sync.i{}r|   )r&   r   r   r   r€   r   r}   rK   r@   rk   r	   rm   rA   ©r$   r   r%   r&   ry   r   Úwidthrp   rD   rq   r“   r   r   r   Úptx_match_any_sync  s   
"r¦   c                 C   s–   |\}}|j d j}|j d tjv r| |t |¡¡}d |¡}|j}t 	t 
t d¡t d¡f¡t d¡t |¡f¡}	t ||	|¡}
| |
||f¡S )Nr   zllvm.nvvm.match.all.sync.i{}r|   )r&   r   r   r   r€   r   r}   rK   r@   rk   r‚   r	   rm   rA   r¤   r   r   r   Úptx_match_all_sync  s   
ÿþr§   c                 C   ó,   t jt  t  d¡g ¡dddd}| |g ¡S )Nr|   zactivemask.b32 $0;ú=rT©Úside_effect©r   Ú	InlineAsmrk   r}   rA   ©r$   r   r%   r&   Ú
activemaskr   r   r   Úptx_activemask/  s   ÿr°   c                 C   r¨   )Nr|   zmov.u32 $0, %lanemask_lt;r©   Trª   r¬   r®   r   r   r   Úptx_lanemask_lt6  s
   þr±   c                 C   s   |  |d ¡S r6   )Úctpopr#   r   r   r   Úptx_popc>  ó   r³   c                 C   s
   |j |Ž S ©N)Úfmar#   r   r   r   Úptx_fmaC  r(   r·   c                 C   s:   dddœ}z||  W S  t y   d| › d}t |¡‚w )N)Úf32Úf)Úf64Úd)r|   r~   z$Conversion between float16 and floatú unsupported©ÚKeyErrorr   ÚCudaLoweringError©r   ÚtypemapÚmsgr   r   r   Úfloat16_float_ty_constraintH  s   


þrÃ   c           	      C   sd   |j |j kr|S t|j ƒ\}}t |  |¡t d¡g¡}t |d|› dd|› d¡}| ||g¡S )Né   zcvt.ú.f16 $0, $1;ú=ú,h)r   rÃ   r   rk   Úget_value_typer}   r­   rA   ©	r$   r   ÚfromtyÚtotyÚvalÚtyÚ
constraintrq   Úasmr   r   r   Úfloat16_to_float_castR  s   rÐ   c           	      C   sb   |j |j kr|S t|j ƒ\}}t t d¡|  |¡g¡}t |d|› dd|› ¡}| ||g¡S )NrÄ   úcvt.rn.f16.ú $0, $1;ú=h,)r   rÃ   r   rk   r}   rÈ   r­   rA   rÉ   r   r   r   Úfloat_to_float16_cast^  s   rÔ   c                 C   s>   dddddœ}z||  W S  t y   d| › d}t |¡‚w )NÚcÚhÚrÚl)é   rÄ   r|   r~   z"Conversion between float16 and intr¼   r½   rÀ   r   r   r   Úfloat16_int_constraintj  s   

þrÚ   c           
      C   sf   |j }t|ƒ}|jrdnd}t |  |¡t d¡g¡}t |d|› |› dd|› d¡}	| |	|g¡S )Nr`   ÚurÄ   zcvt.rni.rÅ   rÆ   rÇ   )	r   rÚ   Úsignedr   rk   rÈ   r}   r­   rA   ©
r$   r   rÊ   rË   rÌ   r   rÎ   Ú
signednessrq   rÏ   r   r   r   Úfloat16_to_integer_castt  s   
þrß   c           
      C   sd   |j }t|ƒ}|jrdnd}t t d¡|  |¡g¡}t |d|› |› dd|› ¡}	| |	|g¡S )Nr`   rÛ   rÄ   rÑ   rÒ   rÓ   )	r   rÚ   rÜ   r   rk   r}   rÈ   r­   rA   rÝ   r   r   r   Úinteger_to_float16_cast  s   
ÿþrà   c                    s    t | tjtjƒ‡ fdd„ƒ}d S )Nc                    sB   t  t  d¡t  d¡t  d¡g¡}t  |ˆ › dd¡}| ||¡S )NrÄ   z.f16 $0,$1,$2;ú=h,h,h©r   rk   r}   r­   rA   ©r$   r   r%   r&   rq   rÏ   ©Úopr   r   Úptx_fp16_binary‘  s
   ÿz*lower_fp16_binary.<locals>.ptx_fp16_binary©Úlowerr   Úfloat16)Úfnrå   ræ   r   rä   r   Úlower_fp16_binary  ó   rë   ÚaddÚsubÚmulc                 C   ó4   t  t  d¡t  d¡g¡}t  |dd¡}| ||¡S )NrÄ   zneg.f16 $0, $1;ú=h,hrâ   rã   r   r   r   Úptx_fp16_hneg¤  ó   rò   c                 C   ó   t | |||ƒS rµ   )rò   r#   r   r   r   Úoperator_hneg«  r´   rõ   c                 C   rð   )NrÄ   zabs.f16 $0, $1;rñ   râ   rã   r   r   r   Úptx_fp16_habs°  ró   rö   c                 C   rô   rµ   )rö   r#   r   r   r   Úoperator_habs·  r´   r÷   c                 C   sH   t  d¡t  d¡t  d¡g}t  t  d¡|¡}t  |dd¡}| ||¡S )NrÄ   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   r}   rk   r­   rA   )r$   r   r%   r&   Úargtysrq   rÏ   r   r   r   Úptx_hfma¼  s   rù   c                 C   ó   dd„ }|   ||||¡S )Nc                 S   s   t j | |¡S rµ   )r   Úfp16Úhdiv)r   r   r   r   r   Úfp16_divÇ  s   zfp16_div_impl.<locals>.fp16_div©Úcompile_internal)r$   r   r%   r&   rý   r   r   r   Úfp16_div_implÄ  s   r   z’{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                    ó   ‡ fdd„}|S )Nc           	         sr   t  t  d¡t  d¡t  d¡g¡}t  |tjˆ dd¡}| ||¡}|  tj	d¡}| 
|t  d¡¡}| d||¡S )NrÄ   rä   rá   r   z!=)r   rk   r}   r­   Ú	_fp16_cmprK   rA   r>   r   Úint16r€   Úicmp_unsigned)	r$   r   r%   r&   rq   rÏ   ÚresultÚzeroÚ
int_resulträ   r   r   Úptx_fp16_comparisonÕ  s   "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )rå   r  r   rä   r   Ú_gen_fp16_cmpÔ  s   r	  ÚeqÚneÚgeÚgtÚleÚltc                    s    t | tjtjƒ‡ fdd„ƒ}d S )Nc                    s(   t ˆ ƒ| |||ƒ}| ||d |d ¡S )Nr   r   )r	  Úselect)r$   r   r%   r&   Úchoicerä   r   r   Úptx_fp16_minmaxï  s   z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxrç   )rê   rp   rå   r  r   rä   r   Úlower_fp16_minmaxî  rì   r  ÚmaxÚminÚ
__nv_cbrtfÚ	__nv_cbrtc           
      C   sF   |j }t| }|  |¡}|j}t ||g¡}t |||¡}	| |	|¡S rµ   )	Úreturn_typeÚ
cbrt_funcsrÈ   r@   r   rk   r	   rm   rA   )
r$   r   r%   r&   rÍ   rp   ÚftyrD   rq   rê   r   r   r   Úptx_cbrt  s   
r  c              	   C   ó2   t  |jt t d¡t d¡f¡d¡}| ||¡S )Nr|   Ú	__nv_brev©r	   rm   r@   r   rk   r}   rA   ©r$   r   r%   r&   rê   r   r   r   Úptx_brev_u4  ó   ýr   c              	   C   r  )Nr~   Ú__nv_brevllr  r  r   r   r   Úptx_brev_u8  r!  r#  c                 C   s   |  |d |  tjd¡¡S r6   )Úctlzr>   r   Úbooleanr#   r   r   r   Úptx_clz'  s   þr&  c              	   C   r  )Nr|   Ú__nv_ffsr  r  r   r   r   Ú
ptx_ffs_32.  ó   ýr(  c              	   C   s2   t  |jt t d¡t d¡f¡d¡}| ||¡S )Nr|   r~   Ú
__nv_ffsllr  r  r   r   r   Ú
ptx_ffs_648  r)  r+  c                 C   s   |\}}}|  |||¡S rµ   )r  )r$   r   r%   r&   ÚtestÚaÚbr   r   r   Úptx_selpB  s   
r/  c              	   C   ó4   t  |jt t ¡ t ¡ t ¡ f¡d¡}| ||¡S )NÚ
__nv_fmaxf©r	   rm   r@   r   rk   r„   rA   r  r   r   r   Ú
ptx_max_f4H  ó   þûr3  c              
   C   óh   t  |jt t ¡ t ¡ t ¡ f¡d¡}| ||  ||d |jd t	j
¡|  ||d |jd t	j
¡g¡S )NÚ	__nv_fmaxr   r   ©r	   rm   r@   r   rk   r   rA   Úcastr&   r   Údoubler  r   r   r   Ú
ptx_max_f8S  ó   þûþr:  c              	   C   r0  )NÚ
__nv_fminfr2  r  r   r   r   Ú
ptx_min_f4d  r4  r=  c              
   C   r5  )NÚ	__nv_fminr   r   r7  r  r   r   r   Ú
ptx_min_f8o  r;  r?  c              	   C   sJ   t  |jt t d¡t ¡ f¡d¡}| ||  ||d |j	d t
j¡g¡S )Nr~   Ú__nv_llrintr   )r	   rm   r@   r   rk   r}   r   rA   r8  r&   r   r9  r  r   r   r   Ú	ptx_round€  s   þûÿrA  c                 C   rú   )Nc                 S   sÂ   t  | ¡s
t  | ¡r| S |dkr1|dkrd|d  }d}nd| }d}| | | }t  |¡r0| S n	d|  }| | }t|ƒ}t  || ¡dkrOdt|d ƒ }|dkr[|| | }|S ||9 }|S )Nr   é   g      $@g’ÕMÏð€Dg      ð?g      à?g       @)ÚmathÚisinfÚisnanÚroundÚfabs)r   ÚndigitsÚpow1Úpow2r   r   r   r   r   Úround_ndigits•  s,   
ÿ
þz$round_to_impl.<locals>.round_ndigitsrþ   )r$   r   r%   r&   rK  r   r   r   Úround_to_impl’  s   !rL  c                    r  )Nc                    s$   |j \}|  |ˆ ¡}| ||d ¡S r6   )r&   r>   Úfmul)r$   r   r%   r&   ÚargtyÚfactor©Úconstr   r   Úimplº  s   zgen_deg_rad.<locals>.implr   )rQ  rR  r   rP  r   Úgen_deg_rad¹  s   rS  g     €f@c                    s˜   |t jv rt j|dd}|g}n
tjˆ |t|ƒd}‡ ‡fdd„t||ƒD ƒ}|j}||kr6td||f ƒ‚|j	t|ƒkrHtd|j	t|ƒf ƒ‚||fS )z4
    Convert integer indices into tuple of intp
    r   )rR   Úcount)rT  c                    s"   g | ]\}}ˆ  ˆ ||tj¡‘qS r   )r8  r   Úintp)r_   ÚtÚi©r   r$   r   r   ra   Ò  s    ÿz&_normalize_indices.<locals>.<listcomp>zexpect %s but got %sz#indexing %d-D array with %d-D index)
r   Úinteger_domainÚUniTupler	   Úunpack_tupleÚlenÚziprR   Ú	TypeErrorÚndim)r$   r   ÚindtyÚindsÚarytyÚvaltyÚindicesrR   r   rX  r   Ú_normalize_indicesÉ  s   
ÿÿre  c                    r  )Nc                    sj   |j \}}}|\}}}	|j}
t| |||||ƒ\}}|  |¡| ||ƒ}tj| ||||dd}ˆ | ||
||	ƒS )NT©Ú
wraparound)r&   rR   re  Ú
make_arrayr	   Úget_item_pointer)r$   r   r%   r&   rb  r`  rc  Úaryra  rÌ   rR   rd  ÚlaryÚptr©Údispatch_fnr   r   Úimpá  s   

ÿÿz_atomic_dispatcher.<locals>.impr   )rn  ro  r   rm  r   Ú_atomic_dispatcherà  s   rp  c                 C   ó\   |t jkr|j}| t |¡||f¡S |t jkr&|j}| t |¡||f¡S | d||d¡S )Nrí   Ú	monotonic)	r   rƒ   r@   rA   r   Údeclare_atomic_add_float32rŒ   Údeclare_atomic_add_float64Ú
atomic_rmw©r$   r   rR   rl  rÌ   rD   r   r   r   Úptx_atomic_add_tupleò  ó   
ÿ
ÿrw  c                 C   rq  )Nrî   rr  )	r   rƒ   r@   rA   r   Údeclare_atomic_sub_float32rŒ   Údeclare_atomic_sub_float64ru  rv  r   r   r   Úptx_atomic_sub  rx  r{  c                 C   óL   |t jjv r|j}|j}ttd|› ƒ}| ||ƒ||f¡S td|› dƒ‚)NÚdeclare_atomic_inc_intzUnimplemented atomic inc with ú array©	r   ÚcudadeclÚunsigned_int_numba_typesr   r@   Úgetattrr   rA   r^  ©r$   r   rR   rl  rÌ   ÚbwrD   rê   r   r   r   Úptx_atomic_inc  ó   r…  c                 C   r|  )NÚdeclare_atomic_dec_intzUnimplemented atomic dec with r~  r  rƒ  r   r   r   Úptx_atomic_dec"  r†  rˆ  c                    s@   t ‡ fdd„ƒ}tjtjtjfD ]}t| tj|tjƒ|ƒ qd S )Nc                    s2   |t jjv r| ˆ ||d¡S tdˆ › d|› dƒ‚)Nrr  zUnimplemented atomic z with r~  ©r   r€  Úinteger_numba_typesru  r^  ©r$   r   rR   rl  rÌ   rä   r   r   Úimpl_ptx_atomic1  s   z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)rp  r   rU  rZ  ÚTuplerè   ÚArrayÚAny)Ústubrå   rŒ  rÍ   r   rä   r   Úptx_atomic_bitwise0  s
   ÿr‘  ÚandÚorÚxorc                 C   s,   |t jjv r| d||d¡S td|› dƒ‚)NÚxchgrr  zUnimplemented atomic exch with r~  r‰  r‹  r   r   r   Úptx_atomic_exchA  s   r–  c                 C   ó–   |j }|tjkr| t |¡||f¡S |tjkr#| t |¡||f¡S |tjtj	fv r4|j
d||ddS |tjtjfv rE|j
d||ddS td| ƒ‚©Nr  rr  ©ÚorderingÚumaxz&Unimplemented atomic max with %s array)r@   r   rŒ   rA   r   Údeclare_atomic_max_float64rƒ   Údeclare_atomic_max_float32r?   Úint64ru  Úuint32Úuint64r^  rv  r   r   r   Úptx_atomic_maxL  ó   
ÿ
ÿr¡  c                 C   r—  ©Nr  rr  r™  Úuminz&Unimplemented atomic min with %s array)r@   r   rŒ   rA   r   Údeclare_atomic_min_float64rƒ   Údeclare_atomic_min_float32r?   rž  ru  rŸ  r   r^  rv  r   r   r   Úptx_atomic_min`  r¢  r§  c                 C   r—  r˜  )r@   r   rŒ   rA   r   Údeclare_atomic_nanmax_float64rƒ   Údeclare_atomic_nanmax_float32r?   rž  ru  rŸ  r   r^  rv  r   r   r   Úptx_atomic_nanmaxt  r¢  rª  c                 C   r—  r£  )r@   r   rŒ   rA   r   Údeclare_atomic_nanmin_float64rƒ   Údeclare_atomic_nanmin_float32r?   rž  ru  rŸ  r   r^  rv  r   r   r   Úptx_atomic_nanminˆ  r¢  r­  c                 C   sT   |  |jd tj|jd |jd ¡}|d |  tjd¡|d |d f}t| |||ƒS )Nr   r   r<   )r  r&   r   rU  r>   Úptx_atomic_casr#   r   r   r   Úptx_atomic_compare_and_swapœ  s   $"r¯  c                 C   s–   |j \}}}}|\}}	}
}t| |||	||ƒ\}}|  |¡| ||ƒ}tj| ||||dd}|jtjjv rD|j	}|jj
}t |||||
|¡S td|j ƒ‚)NTrf  z&Unimplemented atomic cas with %s array)r&   re  rh  r	   ri  rR   r   r€  rŠ  r@   r   r   Úatomic_cmpxchgr^  )r$   r   r%   r&   rb  r`  Úoldtyrc  rj  ra  ÚoldrÌ   rd  rk  rl  rD   r   r   r   r   r®  £  s   ÿÿr®  c                 C   s@   t jt  t  ¡ t  d¡g¡dddd}|d }| ||g¡ d S )Nr|   znanosleep.u32 $0;r×   Trª   r   )r   r­   rk   rl   r}   rA   )r$   r   r%   r&   Ú	nanosleepÚnsr   r   r   Úptx_nanosleep»  s
   ÿrµ  Fc               	      sb  t tj|dƒ}|dko|ot|ƒdk}|dkr|stdƒ‚ˆ j| }	t|tjtj	fƒp5t|	t
jƒp5|tjk}
|tjvrC|
sCtd| ƒ‚ˆ  |¡}t ||¡}|tjkr\tj|||d}n4|j}t ||||¡}ˆ  |¡}d|d  ¡ > |_|r{d|_nt |tj¡|_| |t  t !d¡¡d¡}t" #t $¡ j%¡}ˆ  |¡}| &|¡}|}g }t't(|ƒƒD ]\}}| )|¡ ||9 }q¬d	d
„ t(|ƒD ƒ}‡ fdd
„|D ƒ}|rútj*t +t !d¡g ¡dddd}| ,| -|g ¡t !d¡¡}ˆ  .tj/|¡}| 0||¡g}n	‡ fdd
„|D ƒ}t|ƒ}tj1||dd}ˆ  2|¡ˆ |ƒ}ˆ j3|| 4||j5j6¡||ˆ  .tj/|¡d d | 7¡ S )Nr   r   zarray length <= 0zunsupported type: %srL   ÚexternalrÙ   Úgenericc                 S   s   g | ]}|‘qS r   r   r^   r   r   r   ra     s    z"_generic_array.<locals>.<listcomp>c                    ó   g | ]	}ˆ   tj|¡‘qS r   ©r>   r   rU  r^   ©r$   r   r   ra     ó    r|   zmov.u32 $0, %dynamic_smem_size;r©   Trª   r~   c                    r¸  r   r¹  r^   rº  r   r   ra     r»  ÚC)rR   r_  Úlayout)ÚdatarQ   ÚstridesÚitemsizeÚmeminfo)8r   Úoperatorrï   r\  Ú
ValueErrorÚdata_model_managerÚ
isinstancer   ÚRecordÚBooleanr   ÚStructModelré   Únumber_domainr^  Úget_data_typer   Ú	ArrayTyper   rf   r	   Úalloca_oncer@   Úadd_global_variableÚget_abi_sizeofÚ
bit_lengthÚalignÚlinkageÚConstantÚ	UndefinedÚinitializerÚaddrspacecastÚPointerTyper}   ÚllÚcreate_target_dataÚNVVMÚdata_layoutÚget_abi_sizeÚ	enumerateÚreversedÚappendr­   rk   r‰   rA   r>   rU  ÚudivrŽ  rh  Úpopulate_arrayr€   r¾  ÚtypeÚ	_getvalue) r$   r   rQ   rR   rS   rT   rU   Ú	elemcountÚdynamic_smemÚ
data_modelÚother_supported_typeÚlldtypeÚlarytyÚdataptrrD   ÚgvmemrÐ  Ú
targetdatarÀ  Ú
laststrideÚrstridesrW  Úlastsizer¿  ÚkstridesÚget_dynshared_sizeÚdynsmem_sizeÚ	kitemsizeÚkshaper_  rb  rj  r   rº  r   rW   Æ  sx   

ÿý


ÿ
ÿ



þÿûrW   c                 C   s   |   ¡ S rµ   )rn   )r$   r   rÍ   Úpyvalr   r   r   Úcuda_dispatcher_const*  s   rõ  )F)×Ú	functoolsr   rÂ  rC  Úllvmliter   Úllvmlite.bindingÚbindingr×  Únumba.core.imputilsr   r   Únumba.core.typing.npydeclr   Únumba.core.datamodelr   Ú
numba.corer   r	   Únumba.npr
   Únumba.np.npyimplr   Úcudadrvr   Únumbar   Ú
numba.cudar   r   r   Únumba.cuda.typesr   r   r   Úregistryrè   Úlower_getattrÚ
lower_attrÚlower_constantr   ÚModuler'   r+   r.   r1   r3   r9   r;   r=   ÚcgÚ	this_gridrE   rH   rQ  Ú
array_likerŽ  rI   rJ   rN   ÚsharedÚarrayÚIntegerLiteralr  r[   r  rZ  rd   Úlocalrg   rh   Úthreadfence_blockrs   Úthreadfence_systemru   Úthreadfencerv   Úsyncwarpr{   Úi4rx   Úshfl_sync_intrinsicrˆ   Úf4Úf8r¢   Úvote_sync_intrinsicr%  r£   Úmatch_any_syncr¦   Úmatch_all_syncr§   r¯   r°   Úlanemask_ltr±   Úpopcr³   r¶   r·   rÃ   ré   ÚFloatrÐ   rÔ   rÚ   ÚIntegerrß   rà   rë   rû   Úhaddrí   ÚiaddÚhsubrî   ÚisubÚhmulrï   ÚimulÚhnegrò   Únegrõ   Úhabsrö   Úabsr÷   Úhfmarù   ÚtruedivÚitruedivr   r  r	  Úheqr
  Úhner  Úhger  Úhgtr  Úhler  Úhltr  r  ÚhmaxÚhminrƒ   rŒ   r  Úcbrtr  ÚbrevÚu4r   Úu8r#  Úclzr&  Úffsr(  r+  Úselpr/  r  r3  r:  r  r=  r?  rF  rA  rL  rS  ÚpiÚ_deg2radÚ_rad2degÚradiansÚdegreesre  rp  ÚatomicrU  rw  r{  Úincr…  Údecrˆ  r‘  Úand_r‹   r”  Úexchr–  r¡  r§  Únanmaxrª  Únanminr­  Úcompare_and_swapr¯  Úcasr®  r³  rŸ  rµ  rW   rõ  Ú
get_ufuncsr   r   r   r   Ú<module>   s   














		
		







	ÿÿÿÿ.










	





þ










%






ÿd
