o
    Td"                     @   s   d dl mZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 G dd deZG dd de
ZG d	d
 d
e	ZG dd deZdZG dd dejZdZG dd dejZdS )    )cuda)array)deviceufunc)UFuncMechanismGeneralizedUFuncGUFuncCallStepsc                   @   s2   e Zd ZdZdd Zdd ZdddZd	d
 ZdS )CUDAUFuncDispatcherzD
    Invoke the CUDA ufunc specialization for the given inputs.
    c                 C   s   || _ |j| _d S N)	functions__name__)selftypes_to_retty_kernelspyfunc r   i/home/ncw/WWW/www-new/content/articles/pi-bbp/venv/lib/python3.10/site-packages/numba/cuda/vectorizers.py__init__   s   zCUDAUFuncDispatcher.__init__c                 O   s   t | j||S )a  
        *args: numpy arrays or DeviceArrayBase (created by cuda.to_device).
               Cannot mix the two types in one call.

        **kws:
            stream -- cuda stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
        )CUDAUFuncMechanismcallr
   )r   argskwsr   r   r   __call__   s   zCUDAUFuncDispatcher.__call__r   c                 C   s   t t| j d dksJ d|jdksJ d|jd }g }|dkr)td|dkr1|d S |p6t }|	 0 tj
j|rF|}nt||}| |||}td|jd}|j||d	 W d    |d S 1 snw   Y  |d S )
Nr      zmust be a binary ufunc   zmust use 1d arrayzReduction on an empty array.)r   )dtypestream)lenlistr
   keysndimshape	TypeErrorr   r   auto_synchronizecudadrvdevicearrayis_cuda_ndarray	to_device_CUDAUFuncDispatcher__reducenp_arrayr   copy_to_host)r   argr   ngpu_memsmemoutbufr   r   r   reduce   s(   "


zCUDAUFuncDispatcher.reducec           
      C   s   |j d }|d dkr2||d \}}|| || | |||}|| | ||||dS ||d \}}	|| ||	 | ||	||d |d dkrZ| |||S |S )Nr   r   r   )r.   r   )r    splitappendr'   )
r   r-   r,   r   r+   fatcutthincutr.   leftrightr   r   r   __reduce;   s   





zCUDAUFuncDispatcher.__reduceNr   )r   
__module____qualname____doc__r   r   r0   r'   r   r   r   r   r      s    
r   c                       sR   e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
  ZS )_CUDAGUFuncCallSteps_streamc                    s$   t  |||| |dd| _d S )Nr   r   )superr   getr=   )r   ninnoutr   kwargs	__class__r   r   r   X   s   z_CUDAGUFuncCallSteps.__init__c                 C   
   t |S r	   r   is_cuda_arrayr   objr   r   r   is_device_array\      
z$_CUDAGUFuncCallSteps.is_device_arrayc                 C      t jj|r	|S t |S r	   r   r#   r$   r%   as_cuda_arrayrH   r   r   r   as_device_array_      
z$_CUDAGUFuncCallSteps.as_device_arrayc                 C   s   t j|| jdS Nr   )r   r&   r=   )r   hostaryr   r   r   r&   i      z_CUDAGUFuncCallSteps.to_devicec                 C   s   |j || jd}|S rQ   )r)   r=   )r   devaryrR   r.   r   r   r   to_hostl   s   z_CUDAGUFuncCallSteps.to_hostc                 C   s   t j||| jdS N)r    r   r   )r   device_arrayr=   )r   r    r   r   r   r   allocate_device_arrayp   s   z*_CUDAGUFuncCallSteps.allocate_device_arrayc                 C   s   |j || jd|  d S rQ   )forallr=   )r   kernelnelemr   r   r   r   launch_kernels   s   z"_CUDAGUFuncCallSteps.launch_kernel)r   r9   r:   	__slots__r   rJ   rO   r&   rU   rX   r\   __classcell__r   r   rC   r   r<   S   s    
r<   c                       s8   e Zd Z fddZedd Zdd Zdd Z  ZS )	CUDAGeneralizedUFuncc                    s   |j | _ t || d S r	   )r   r>   r   )r   	kernelmapenginer   rC   r   r   r   x   s   zCUDAGeneralizedUFunc.__init__c                 C      t S r	   )r<   r   r   r   r   _call_steps|      z CUDAGeneralizedUFunc._call_stepsc                 C   s   t jjj|d|j|jdS Nr8   r    stridesr   gpu_data)r   r#   r$   DeviceNDArrayr   ri   )r   aryr    r   r   r   _broadcast_scalar_input   s
   
z,CUDAGeneralizedUFunc._broadcast_scalar_inputc                 C   s:   t |t |j }d| |j }tjjj|||j|jdS rf   )	r   r    rh   r   r#   r$   rj   r   ri   )r   rk   newshapenewax
newstridesr   r   r   _broadcast_add_axis   s   
z(CUDAGeneralizedUFunc._broadcast_add_axis)	r   r9   r:   r   propertyrd   rl   rp   r^   r   r   rC   r   r_   w   s    
r_   c                   @   sL   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd ZdS )r   z%
    Provide CUDA specialization
    r   c                 C   s   |j ||d|  d S rQ   )rY   )r   funccountr   r   r   r   r   launch   s   zCUDAUFuncMechanism.launchc                 C   rE   r	   rF   rH   r   r   r   rJ      rK   z"CUDAUFuncMechanism.is_device_arrayc                 C   rL   r	   rM   rH   r   r   r   rO      rP   z"CUDAUFuncMechanism.as_device_arrayc                 C   s   t j||dS rQ   )r   r&   )r   rR   r   r   r   r   r&         zCUDAUFuncMechanism.to_devicec                 C   s   |j |dS rQ   )r)   )r   rT   r   r   r   r   rU      s   zCUDAUFuncMechanism.to_hostc                 C   s   t j|||dS rV   )r   rW   )r   r    r   r   r   r   r   rX      rS   z(CUDAUFuncMechanism.allocate_device_arrayc                    sn    fddt tD }tt j }dg| t j }|D ]}d||< q#tjjj| j	 j
dS )Nc                    s,   g | ]}| j ks j| | kr|qS r   )r   r    ).0axrk   r    r   r   
<listcomp>   s
    
z7CUDAUFuncMechanism.broadcast_device.<locals>.<listcomp>r   rg   )ranger   r    r   rh   r   r#   r$   rj   r   ri   )r   rk   r    
ax_differs
missingdimrh   rw   r   rx   r   broadcast_device   s   

z#CUDAUFuncMechanism.broadcast_deviceN)r   r9   r:   r;   DEFAULT_STREAMrt   rJ   rO   r&   rU   rX   r}   r   r   r   r   r      s    
r   z
def __vectorized_{name}({args}, __out__):
    __tid__ = __cuda__.grid(1)
    if __tid__ < __out__.shape[0]:
        __out__[__tid__] = __core__({argitems})
c                   @   s8   e Zd Zdd Zdd Zdd Zdd Zed	d
 ZdS )CUDAVectorizec                 C   s*   t j|ddd| j}||j|j jjfS )NT)deviceinline)r   jitr   	overloadsr   	signaturereturn_type)r   sigcudevfnr   r   r   _compile_core   s   zCUDAVectorize._compile_corec                 C   s    | j j }|t|d |S )N__cuda____core__)r   __globals__copyupdater   )r   corefnglblr   r   r   _get_globals   s
   zCUDAVectorize._get_globalsc                 C   rE   r	   r   r   r   fnobjr   r   r   r   _compile_kernel   rK   zCUDAVectorize._compile_kernelc                 C   s   t | j| jS r	   )r   r`   r   rc   r   r   r   build_ufunc   ru   zCUDAVectorize.build_ufuncc                 C   rb   r	   )vectorizer_stager_sourcerc   r   r   r   _kernel_template   re   zCUDAVectorize._kernel_templateN)	r   r9   r:   r   r   r   r   rq   r   r   r   r   r   r      s    r   zy
def __gufunc_{name}({args}):
    __tid__ = __cuda__.grid(1)
    if __tid__ < {checkedarg}:
        __core__({argitems})
c                   @   s0   e Zd Zdd Zdd Zedd Zdd Zd	S )
CUDAGUFuncVectorizec                 C   s"   t | j| j}t| j|| jdS )N)r`   ra   r   )r   GUFuncEngineinputsig	outputsigr_   r`   r   )r   ra   r   r   r   r      s
   zCUDAGUFuncVectorize.build_ufuncc                 C   s   t ||S r	   r   r   r   r   r   r      ru   z#CUDAGUFuncVectorize._compile_kernelc                 C   rb   r	   )_gufunc_stager_sourcerc   r   r   r   r      re   z$CUDAGUFuncVectorize._kernel_templatec                 C   s4   t j|dd| j}| jj }|t |d |S )NT)r   r   )r   r   r   py_funcr   r   r   )r   r   r   glblsr   r   r   r      s   z CUDAGUFuncVectorize._get_globalsN)r   r9   r:   r   r   rq   r   r   r   r   r   r   r      s    
r   N)numbar   numpyr   r(   numba.np.ufuncr   numba.np.ufunc.deviceufuncr   r   r   objectr   r<   r_   r   r   DeviceVectorizer   r   DeviceGUFuncVectorizer   r   r   r   r   <module>   s    K$0