o
    Tdf;                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZ d dlmZmZmZ d dlm Z  d dl!m"Z" G dd de	jZ#e $de j%Z&G dd deZ'G dd deZ(dS )    N)cached_property)ir)typingtypes	debuginfoitanium_manglercgutils)
Dispatcher)NumbaInvalidConfigWarning)BaseContext)MinimalCallConv)	cmathdecl)	datamodel   )nvvm)codegen	nvvmutilsufuncs)cuda_data_manager)warnc                       s$   e Zd Zdd Z fddZ  ZS )CUDATypingContextc                 C   s   ddl m}m}m}m} ddlm}m} | |j	 | |j	 | |j	 | t
j	 | |j	 | |j	 | |j d S )Nr   )cudadeclcudamathlibdevicedeclvector_typesr   )enumdecl
cffi_utils) r   r   r   r   numba.core.typingr   r   install_registryregistryr   typing_registry)selfr   r   r   r   r   r    r#   d/home/ncw/WWW/www-new/content/articles/pi-bbp/venv/lib/python3.10/site-packages/numba/cuda/target.pyload_additional_registries   s   z,CUDATypingContext.load_additional_registriesc                    s   ddl m} t|trJt||sJz|j}W n4 tyI   |js#td|j	 }d|d< |
dd|d< |
dd|d< ||j|}||_|}Y nw tt| |S )	Nr   )CUDADispatcherz<using cpu function on device but its compilation is disabledTdevicedebugFopt)numba.cuda.dispatcherr&   
isinstancer	   _CUDATypingContext__dispatcherAttributeError_can_compile
ValueErrortargetoptionscopygetpy_funcsuperr   resolve_value_type)r"   valr&   r0   disp	__class__r#   r$   r5   %   s$   


z$CUDATypingContext.resolve_value_type)__name__
__module____qualname__r%   r5   __classcell__r#   r#   r8   r$   r      s    r   z	[^a-z0-9]c                       s   e Zd ZdZdZd* fdd	Zedd Zedd Zd	d
 Z	dd Z
dd Zdd Zedd Zedd Zedd ZdddddZ	d+ddZdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Z  ZS ),CUDATargetContextTcudac                    s    t  || ttj| _d S N)r4   __init__r   chainr   default_managerdata_model_manager)r"   	typingctxtargetr8   r#   r$   rA   I   s   
zCUDATargetContext.__init__c                 C   s&   t  jrtjS d}tt| tjS )Nz3debuginfo is not generated for CUDA toolkits < 11.2)r   NVVM	is_nvvm70r   	DIBuilderr   r
   DummyDIBuilder)r"   msgr#   r#   r$   rI   O   s
   
zCUDATargetContext.DIBuilderc                 C      dS )NFr#   r"   r#   r#   r$   enable_boundscheckX      z$CUDATargetContext.enable_boundscheckc                 C   s   | j |S r@   )_internal_codegen_create_empty_module)r"   namer#   r#   r$   create_module^   s   zCUDATargetContext.create_modulec                 C   s   t d| _d | _d S )Nznumba.cuda.jit)r   JITCUDACodegenrP   _target_datarM   r#   r#   r$   inita   s   
zCUDATargetContext.initc                 C   s   ddl m}m}m} ddl m}m}m} ddl m}m} ddl m	}	 ddl
m}
 ddlm} ddlm} d	d
lm}m}m}m}m} ddlm} | |j | |
j | |j | |j | |	j | |j | |j d S )Nr   )numberstupleobjslicing)rangeobj	iteratorsenumimpl)unicodecharseq)	cmathimpl)cffiimpl)arrayobj)
npdatetimer   )cudaimpl	printimpllibdeviceimplmathimplr   )ndarray)numba.cpythonrW   rX   rY   rZ   r[   r\   r]   r^   r_   
numba.miscr`   numba.npra   rb   r   rc   rd   re   rf   r   numba.np.unsaferg   r   r    impl_registry)r"   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   r   rg   r#   r#   r$   r%   e   s    z,CUDATargetContext.load_additional_registriesc                 C   s   | j S r@   )rP   rM   r#   r#   r$   r   }   s   zCUDATargetContext.codegenc                 C   s"   | j d u rtt j| _ | j S r@   )rU   llcreate_target_datar   rG   data_layoutrM   r#   r#   r$   target_data   s   
zCUDATargetContext.target_datac                    s*   ddl m  d}t fdd|D }|S )z
        Some CUDA intrinsics are at the module level, but cannot be treated as
        constants, because they are loaded from a special register in the PTX.
        These include threadIdx, blockDim, etc.
        r   r?   )	threadIdxblockDimblockIdxgridDimlaneidwarpsizec                    s   g | ]	}t  |fqS r#   )r   Module).0ncrq   r#   r$   
<listcomp>   s    z;CUDATargetContext.nonconst_module_attrs.<locals>.<listcomp>)numbar?   tuple)r"   	nonconstsnonconsts_with_modr#   rq   r$   nonconst_module_attrs   s   z'CUDATargetContext.nonconst_module_attrsc                 C   s   t | S r@   )CUDACallConvrM   r#   r#   r$   	call_conv   s   zCUDATargetContext.call_convr#   Nabi_tagsuidc                C   s   t j||||dS )Nr   )r   mangle)r"   rR   argtypesr   r   r#   r#   r$   mangler   s   
zCUDATargetContext.manglerc	              	   C   sV   t j|jdd}	|  j|j d|	||d}
|
| | |
||	||||}|
|fS )a  
        Adapt a code library ``codelib`` with the numba compiled CUDA kernel
        with name ``fname`` and arguments ``argtypes`` for NVVM.
        A new library is created with a wrapper function that can be used as
        the kernel entry point for the given kernel.

        Returns the new code library and the wrapper function.

        Parameters:

        codelib:       The CodeLibrary containing the device function to wrap
                       in a kernel call.
        fndesc:        The FunctionDescriptor of the source function.
        debug:         Whether to compile with debug.
        lineinfo:      Whether to emit line info.
        nvvm_options:  Dict of NVVM options used when compiling the new library.
        filename:      The source filename that the function is contained in.
        linenum:       The source line that the function is on.
        max_registers: The max_registers argument for the code library.
        cudapyns_kernel_)
entry_namenvvm_optionsmax_registers)r   prepend_namespacellvm_func_namer   create_libraryrR   add_linking_librarygenerate_kernel_wrapper)r"   codelibfndescr(   lineinfor   filenamelinenumr   kernel_namelibrarywrapperr#   r#   r$   prepare_cuda_kernel   s   

z%CUDATargetContext.prepare_cuda_kernelc           $   	      s  |j }| |}	t|	j}
tt |
}| dttd| j	
tjg|
 }t||j}tj|jdd}t|| t d}|sN|rl|oR| }| j|| |d}| ||j|| |||  fdd}|d	}g }g }d
D ]}||d|  ||d|  q}|	| j}| j	||tj||\}}|rkt||j |  W d   n1 sw   Y  | |!|j" t#|j$j%d}t&' j(r|)|||j*dd}|+|d}n(t|j$|j$|j$|j$g}d}tj||d}|,||||j*g}|-d||}t./|} | |4 t0d
|D ]\}!}"| 1|!}#|2|#|" q$t0d
|D ]\}!}"| 3|!}#|2|#|" q:W d   n	1 sVw   Y  W d   n	1 sfw   Y  |  t&4  |5 |s|r|6  |6  |7 j  S )z
        Generate the kernel wrapper in the given ``library``.
        The function being wrapped is described by ``fndesc``.
        The wrapper function is returned.
        zcuda.kernel.wrapper    r   r   r   )modulefilepathcgctxdirectives_onlyc                    s4    j |  }ttd|}t|jjd |_|S )Nr   )	rR   r   add_global_variabler   IntTypeConstanttypepointeeinitializer)postfixrR   gvwrapfnwrapper_moduler#   r$   define_error_gv   s   
zBCUDATargetContext.generate_kernel_wrapper.<locals>.define_error_gv__errcode__xyzz	__tid%s__z__ctaid%s__N	monotonicr   ___numba_atomic_i32_cas_hack)rR   z==)8r   get_arg_packerlistargument_typesr   FunctionTypeVoidTyperS   r   r   get_return_typer   pyobjectFunctionr   r   r   rR   	IRBuilderappend_basic_blockrI   mark_subprogramargsmark_locationappendfrom_argumentscall_functionvoidr   	if_likelyis_okret_voidif_thennot_is_python_excr   r   r   r   rG   rH   cmpxchgcodeextract_valuecallicmp_unsignedr   SRegBuilderziptidstorectaidset_cuda_kerneladd_ir_modulefinalizeget_function)$r"   r   r   r   r(   r   r   r   r   arginfoargtyswrapfntyfntyfuncprefixedbuilderr   r   r   gv_excgv_tidgv_ctaidicallargsstatus_oldxchgchangedcasfntycas_hackcasfnsregdimptrr6   r#   r   r$   r      s   










z)CUDATargetContext.generate_kernel_wrapperc              	      s   |j } fddt|jddD }ttdt|}t||}tj	}t
j||jd|d}	d|	_d	|	_||	_ |j}
 |
}d
|d   |	_ttd}||	|d} | |} fdd|jD } fdd|jD } j||||jj|||j|jdd | S )i
        Unlike the parent version.  This returns a a pointer in the constant
        addrspace.
        c                       g | ]	}  tj|qS r#   )get_constantr   byte)ry   r   rM   r#   r$   r{   *  s    z9CUDATargetContext.make_constant_array.<locals>.<listcomp>A)order   _cudapy_cmem	addrspaceinternalT   r   genericc                    r   r#   r   r   intpry   srM   r#   r$   r{   C      c                    r   r#   r   r   rM   r#   r$   r{   D  r   N)datashapestridesitemsizeparentmeminfo) r   itertobytesr   	ArrayTyper   lenr   r   ADDRSPACE_CONSTANTr   r   r   linkageglobal_constantr   get_data_typedtypeget_abi_sizeof
bit_lengthalignPointerTypeaddrspacecast
make_arrayr  r  populate_arraybitcastr  r  r  	_getvalue)r"   r   arytyarrlmod	constvals
constarytyconstaryr   r   lldtyper  ptrtygenptrarykshapekstridesr#   rM   r$   make_constant_array"  s8   

z%CUDATargetContext.make_constant_arrayc                 C   s   t |dd }ddt|g}|j|}|du r2t j||j	|t
jd}d|_d|_||_|j	jj}||t
jS )	r   zutf-8    $__conststring__Nr   r   T)r   make_bytearrayencodejoinr   mangle_identifierglobalsr2   r   r   r   r  r  r  r   r   elementr  
as_pointer)r"   modstringtextrR   r   chartyr#   r#   r$   insert_const_stringM  s   
z%CUDATargetContext.insert_const_stringc                 C   s0   |j }| ||}ttd}|||dS )z
        Insert a constant string in the constant addresspace and return a
        generic i8 pointer to the data.

        This function attempts to deduplicate.
        r   r   )r   r4  r   r  r   r  )r"   r   r1  r  r   	charptrtyr#   r#   r$   insert_string_const_addrspacec  s   z/CUDATargetContext.insert_string_const_addrspacec                 C   rL   )zRun O1 function passes
        Nr#   )r"   r   r#   r#   r$   optimize_functiono  rO   z#CUDATargetContext.optimize_functionc                 C   s
   t |S r@   )r   get_ufunc_info)r"   	ufunc_keyr#   r#   r$   r8  |  s   
z CUDATargetContext.get_ufunc_inforq   r@   )r:   r;   r<   implement_powi_as_math_callstrict_alignmentrA   propertyrI   rN   rS   rV   r%   r   rp   r   r   r   r   r   r   r%  r4  r6  r7  r8  r=   r#   r#   r8   r$   r>   E   s6    





$b+r>   c                   @   s   e Zd ZdS )r   N)r:   r;   r<   r#   r#   r#   r$   r     s    r   ))re	functoolsr   llvmlite.bindingbindingrm   llvmliter   
numba.corer   r   r   r   r   numba.core.dispatcherr	   numba.core.errorsr
   numba.core.baser   numba.core.callconvr   r   r   r   cudadrvr   
numba.cudar   r   r   numba.cuda.modelsr   warningsr   r   compileIVALID_CHARSr>   r   r#   r#   r#   r$   <module>   s*    *  =