o
    Tdy                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlZddl	Z	ddl	m
Z
 ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ z	eeddZW n e yt   dd ZY nw dd Z!dd Z"dd Z#G dd de
j$Z%G dd de%Z&edd Z'G dd de%Z(G dd de)Z*G d d! d!e%ej+Z,G d"d# d#e%ej+Z-d5d$d%Z.d5d&d'Z/d(d) Z0d*d+ Z1d,Z2d-d. Z3d6d1d2Z4d3d4 Z5dS )7z
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object.  If it exists and evaluate to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
    N)c_void_p)_devicearray)devices)driver)typesconfig)to_fixed_tuple)
dummyarray)numpy_support)prepare_shape_strides_dtype)NumbaPerformanceWarning)warn	lru_cachec                 C   s   | S N )funcr   r   q/home/ncw/WWW/www-new/content/articles/pi-bbp/venv/lib/python3.10/site-packages/numba/cuda/cudadrv/devicearray.pyr      s   c                 C   s   t | ddS )z$Check if an object is a CUDA ndarray__cuda_ndarray__F)getattrobjr   r   r   is_cuda_ndarray#      r   c                    sB   t    fdd}|dt |dt |dtj |dt dS )z,Verify the CUDA ndarray interface for an objc                    s6   t  | s	t| tt | |std| |f d S )Nz%s must be of type %s)hasattrAttributeError
isinstancer   )attrtypr   r   r   requires_attr,   s
   
z4verify_cuda_ndarray_interface.<locals>.requires_attrshapestridesdtypesizeN)require_cuda_ndarraytuplenpr!   int)r   r   r   r   r   verify_cuda_ndarray_interface(   s   

r'   c                 C   s   t | stddS )z9Raises ValueError is is_cuda_ndarray(obj) evaluates Falsezrequire an cuda ndarray objectN)r   
ValueErrorr   r   r   r   r#   8   s   r#   c                   @   s   e Zd ZdZdZdZd%ddZedd Zd&d	d
Z	edd Z
d'ddZdd Zedd Zedd Zejd&ddZejd(ddZd&ddZdd Zdd Zd(dd Zd!d" Zed#d$ ZdS ))DeviceNDArrayBasez$A on GPU NDArray representation
    Tr   Nc                 C   s"  t |tr|f}t |tr|f}t|}t|| _t|| jkr%tdtj	d|||j
| _t|| _t|| _|| _tttj| jd| _| jdkrn|du rgt| j| j| jj
| _t | j}n"t|| _ntjrxtjd}ntd}tjt |dd}d| _|| _ || _!dS )a5  
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        gpu_data
            user provided device memory for the ndarray data buffer
        zstrides not match ndimr      N)contextpointerr"   )"r   r&   r%   r!   lenndimr(   r	   Array	from_descitemsize_dummyr$   r   r    	functoolsreduceoperatormulr"   _drivermemory_size_from_info
alloc_sizer   get_contextmemallocdevice_memory_sizeUSE_NV_BINDINGbindingCUdeviceptrr   MemoryPointergpu_datastream)selfr   r    r!   rB   rA   nullr   r   r   __init__D   s>   








zDeviceNDArrayBase.__init__c                 C   s   t jr| jd urt| j}nd}n| jjd ur| jj}nd}t| jt| r(d nt| j|df| j	j
| jdkr?t| jddS d ddS )Nr   F   )r   r    datatypestrrB   version)r7   r=   device_ctypes_pointerr&   valuer$   r   is_contiguousr    r!   strrB   )rC   ptrr   r   r   __cuda_array_interface__w   s"   

z*DeviceNDArrayBase.__cuda_array_interface__c                 C   s   t  | }||_|S )zBind a CUDA stream to this object so that all subsequent operation
        on this array defaults to the given stream.
        )copyrB   )rC   rB   cloner   r   r   bind   s   
zDeviceNDArrayBase.bindc                 C      |   S r   	transposerC   r   r   r   T   s   zDeviceNDArrayBase.Tc                 C   st   |rt |t t| jkr| S | jdkrd}t||d ur0t|tt| jkr0td|f ddlm} || S )N   z2transposing a non-2D DeviceNDArray isn't supportedzinvalid axes list %rr   rT   )r$   ranger.   NotImplementedErrorsetr(   numba.cuda.kernels.transposerU   )rC   axesmsgrU   r   r   r   rU      s   
zDeviceNDArrayBase.transposec                 C   s   |s| j S |S r   rB   )rC   rB   r   r   r   _default_stream   s   z!DeviceNDArrayBase._default_streamc                 C   sR   d| j v }| jd r|sd}n| jd r|sd}nd}t| j}t|| j|S )n
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        r   C_CONTIGUOUSCF_CONTIGUOUSFA)r    flagsr
   
from_dtyper!   r   r/   r.   )rC   	broadcastlayoutr!   r   r   r   _numba_type_   s   
zDeviceNDArrayBase._numba_type_c                 C   s,   | j du rtjrtjdS tdS | j jS )z:Returns the ctypes pointer to the GPU data buffer
        Nr   )rA   r7   r=   r>   r?   r   rJ   rV   r   r   r   rJ      s
   
z'DeviceNDArrayBase.device_ctypes_pointerc                 C   s   |j dkrdS t|  | |}t| t|}}t|r3t| t|| tj| || j|d dS t	j
||jd r=dndd|jd  d	}t|| tj| || j|d dS )
zCopy `ary` to `self`.

        If `ary` is a CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a a host-to-device transfer.
        r   Nr_   rb   rc   re   T	WRITEABLE)ordersubokrP   )r"   sentry_contiguousr`   
array_corer7   is_device_memorycheck_array_compatibilitydevice_to_devicer9   r%   arrayrg   host_to_device)rC   aryrB   	self_coreary_corer   r   r   copy_to_device   s&   






z DeviceNDArrayBase.copy_to_devicec                 C   s   t dd | jD rd}t|| j| jdksJ d| |}|du r0tj| jtjd}nt	| | |}| jdkrFt
j|| | j|d |du rg| jdkr[tj| j| j|d	}|S tj| j| j| j|d
}|S )a^  Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part as the given stream.  Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        c                 s   s    | ]}|d k V  qdS r   Nr   ).0sr   r   r   	<genexpr>	  s    z1DeviceNDArrayBase.copy_to_host.<locals>.<genexpr>z2D->H copy not implemented for negative strides: {}r   zNegative memory sizeNr   r!   r_   )r   r!   buffer)r   r!   r    r   )anyr    rZ   formatr9   r`   r%   emptybyterr   r7   device_to_hostr"   ndarrayr   r!   )rC   rv   rB   r^   hostaryr   r   r   copy_to_host   s.   



zDeviceNDArrayBase.copy_to_hostc                 c   s    |  |}| jdkrtd| jd | jjkrtdttt	| j
| }| j}| jj}t|D ])}|| }t|| | j
}|| f}	| j|| || }
t|	|| j||
dV  q3dS )zSplit the array into equal partition of the `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.
        r*   zonly support 1d arrayr   zonly support unit strider!   rB   rA   N)r`   r.   r(   r    r!   r1   r&   mathceilfloatr"   rY   minrA   viewDeviceNDArray)rC   sectionrB   nsectr    r1   ibeginendr   rA   r   r   r   split!  s$   



zDeviceNDArrayBase.splitc                 C   s   | j S )zEReturns a device memory object that is used as the argument.
        )rA   rV   r   r   r   as_cuda_arg6  s   zDeviceNDArrayBase.as_cuda_argc                 C   s0   t  | j}t| j| j| jd}t||dS )z
        Returns a *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        )r   r    r!   )
ipc_handle
array_desc)	r   r:   get_ipc_handlerA   dictr   r    r!   IpcArrayHandle)rC   ipchdescr   r   r   r   ;  s   z DeviceNDArrayBase.get_ipc_handlec                 C   s2   | j j|d\}}t|j|j| j| || jdS )a(  
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an axis
            with size greater than one is selected. If `None`, all axes with
            size one are removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        )axisr   r    r!   rB   rA   )r2   squeezer   r   r    r!   r`   rA   )rC   r   rB   	new_dummy_r   r   r   r   F  s   zDeviceNDArrayBase.squeezec                 C   s   t |}t| j}t| j}| jj|jkr;|  stdt|d | jj |j\|d< }|dkr6td|j|d< t	|||| j
| jdS )zeReturns a new object by reinterpretting the dtype without making a
        copy of the data.
        zHTo change to a dtype of a different size, the array must be C-contiguousr   zuWhen changing to a larger dtype, its size must be a divisor of the total size in bytes of the last axis of the array.r   )r%   r!   listr   r    r1   is_c_contiguousr(   divmodr   rB   rA   )rC   r!   r   r    remr   r   r   r   b  s0   



zDeviceNDArrayBase.viewc                 C   s   | j j| j S r   )r!   r1   r"   rV   r   r   r   nbytes  s   zDeviceNDArrayBase.nbytesrz   r   r   Nr   )__name__
__module____qualname____doc____cuda_memory__r   rE   propertyrO   rR   rW   rU   r`   rk   rJ   r   require_contextry   r   r   r   r   r   r   r   r   r   r   r   r)   >   s6    
3






.
%r)   c                       s   e Zd ZdZd fdd	Zedd Zedd	 Zej	d
d Z
ej	dddZdddZej	dd Zej	dddZdddZ  ZS )DeviceRecordz
    An on-GPU record type
    r   Nc                    s$   d}d}t t| ||||| d S Nr   )superr   rE   )rC   r!   rB   rA   r   r    	__class__r   r   rE     s
   zDeviceRecord.__init__c                 C      t | jjS z
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        r   r2   rg   rV   r   r   r   rg        zDeviceRecord.flagsc                 C   s   t | jS )ra   )r
   rh   r!   rV   r   r   r   rk     s   zDeviceRecord._numba_type_c                 C   
   |  |S r   _do_getitemrC   itemr   r   r   __getitem__     
zDeviceRecord.__getitem__c                 C      |  ||S z0Do `__getitem__(item)` with CUDA stream
        r   rC   r   rB   r   r   r   getitem     zDeviceRecord.getitemc           
      C   s   |  |}| jj| \}}| j|}|jdkr9|jd ur$t|||dS tj	d|d}t
j|||j|d |d S t|jd |jd d\}}}	t|||	||dS )	Nr   r   r*   r!   dstsrcr"   rB   r   rc   r   r    r!   rA   rB   )r`   r!   fieldsrA   r   r   namesr   r%   r   r7   r   r1   r   subdtyper   )
rC   r   rB   r   offsetnewdatar   r   r    r!   r   r   r   r     s.   



zDeviceRecord._do_getitemc                 C   r   r   _do_setitemrC   keyrK   r   r   r   __setitem__  r   zDeviceRecord.__setitem__c                 C      | j |||dS z6Do `__setitem__(key, value)` with CUDA stream
        r_   r   rC   r   rK   rB   r   r   r   setitem     zDeviceRecord.setitemc                 C   s   |  |}| }|rt }| }| jj| \}}| j|}t| |||d}	t	|	j||d\}
}t
|	|
|
jj| |rG|  d S d S )Nr   r_   )r`   r   r:   get_default_streamr!   r   rA   r   typeauto_devicer7   rs   r1   synchronize)rC   r   rK   rB   synchronousctxr   r   r   lhsrhsr   r   r   r   r     s   
zDeviceRecord._do_setitemrz   r   )r   r   r   r   rE   r   rg   rk   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r     s"    
	



r   c                    s>   ddl m  dkr jdd }|S  j fdd}|S )z
    A separate method so we don't need to compile code every assignment (!).

    :param ndim: We need to have static array sizes for cuda.local.array, so
        bake in the number of dimensions into the kernel
    r   )cudac                 S   s   |d | d< d S r   r   )r   r   r   r   r   kernel     z_assign_kernel.<locals>.kernelc                    s     d}d}t| jD ]	}|| j| 9 }q||krd S  jjdftjd}td ddD ]&}|| j|  |d|f< || j|  |j| dk |d|f< || j|  }q/|t|d  | t|d < d S )Nr*   rX   r~   r   r   )	gridrY   r.   r   localrt   r   int64r   )r   r   location
n_elementsr   idxr   r.   r   r   r     s   
$$)numbar   jit)r.   r   r   r   r   _assign_kernel  s   
r   c                   @   s   e Zd ZdZdd Zedd Zdd Zdd	d
Zdd Z	dd Z
d ddZejdd Zejd!ddZd!ddZejdd Zejd!ddZd!ddZdS )"r   z
    An on-GPU array type
    c                 C      | j jS )zA
        Return true if the array is Fortran-contiguous.
        )r2   is_f_contigrV   r   r   r   is_f_contiguous&     zDeviceNDArray.is_f_contiguousc                 C   r   r   r   rV   r   r   r   rg   ,  r   zDeviceNDArray.flagsc                 C   r   )z;
        Return true if the array is C-contiguous.
        )r2   is_c_contigrV   r   r   r   r   6  r   zDeviceNDArray.is_c_contiguousNc                 C   s   |r	|   |S |    S )zE
        :return: an `numpy.ndarray`, so copies to the host.
        )r   	__array__)rC   r!   r   r   r   r   <  s   zDeviceNDArray.__array__c                 C   s
   | j d S r   )r   rV   r   r   r   __len__E  s   
zDeviceNDArray.__len__c                 O   s   t |dkrt|d ttfr|d }t| }|| jkr(|| j| j| j| jdS | j	j
|i |\}}|| j	jgkrF||j|j| j| jdS td)z
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        r*   r   )r   r    r!   rA   operation requires copying)r-   r   r$   r   r   r   r    r!   rA   r2   reshapeextentrZ   )rC   newshapekwsclsnewarrextentsr   r   r   r   H  s   


zDeviceNDArray.reshaperc   r   c                 C   sT   |  |}t| }| jj|d\}}|| jjgkr&||j|j| j| j|dS t	d)z
        Flattens a contiguous array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
        exception.
        )rm   r   r   )
r`   r   r2   ravelr   r   r    r!   rA   rZ   )rC   rm   rB   r   r   r   r   r   r   r   `  s   

zDeviceNDArray.ravelc                 C   r   r   r   r   r   r   r   r   r  r   zDeviceNDArray.__getitem__c                 C   r   r   r   r   r   r   r   r   v  r   zDeviceNDArray.getitemc                 C   s   |  |}| j|}t| }t| }t|dkrW| jj|d  }|j	sK| j
jd ur4t| j
||dS tjd| j
d}tj||| jj|d |d S ||j|j| j
||dS | jj|j }||j|j| j
||dS )Nr*   r   r   r   r   r   )r`   r2   r   r   iter_contiguous_extentr   r-   rA   r   is_arrayr!   r   r   r%   r   r7   r   r1   r   r    r   )rC   r   rB   arrr   r   r   r   r   r   r   r   |  s0   


zDeviceNDArray._do_getitemc                 C   r   r   r   r   r   r   r   r     r   zDeviceNDArray.__setitem__c                 C   r   r   r   r   r   r   r   r     r   zDeviceNDArray.setitemc                 C   s^  |  |}| }|rt }| }| j|}| jj|j }t	|t
jr*d}d}	n|j}|j}	t| ||	| j||d}
t||dd\}}|j|
jkrUtd|j|
jf tj|
jtjd}|j||
j|j d < |j| }tt|
j|jD ]\}\}}|dkr||krtd|||f qwttj|
jd}t|
jj||d	|
| |r|  d S d S )
Nr   r   T)rB   user_explicitz$Can't assign %s-D array to %s-D selfr   r*   zCCan't copy sequence with size %d to array axis %d with dimension %dr_   ) r`   r   r:   r   r2   r   rA   r   r   r   r	   Elementr   r    r   r!   r   r.   r(   r%   onesr   r   	enumeratezipr3   r4   r5   r6   r   forallr   )rC   r   rK   rB   r   r   r  r   r   r    r   r   r   	rhs_shaper   lrr   r   r   r   r     sN   
	
zDeviceNDArray._do_setitemr   )rc   r   r   )r   r   r   r   r   r   rg   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   "  s(    
	
	



r   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )r   a"  
    An IPC array handle that can be serialized and transfer to another process
    in the same machine for share a GPU allocation.

    On the destination process, use the *.open()* method to creates a new
    *DeviceNDArray* object that shares the allocation from the original process.
    To release the resources, call the *.close()* method.  After that, the
    destination can no longer use the shared array object.  (Note: the
    underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    c                 C   s   || _ || _d S r   )_array_desc_ipc_handle)rC   r   r   r   r   r   rE        
zIpcArrayHandle.__init__c                 C   s$   | j t }tdd|i| jS )z
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process.  Must not be used on the original process.
        rA   Nr   )r  openr   r:   r   r  )rC   dptrr   r   r   r    s   zIpcArrayHandle.openc                 C   s   | j   dS )z5
        Closes the IPC handle to the array.
        N)r  closerV   r   r   r   r    s   zIpcArrayHandle.closec                 C   rS   r   )r  rV   r   r   r   	__enter__   s   zIpcArrayHandle.__enter__c                 C   s   |    d S r   )r  )rC   r   rK   	tracebackr   r   r   __exit__  s   zIpcArrayHandle.__exit__N)	r   r   r   r   rE   r  r  r  r  r   r   r   r   r     s    r   c                   @      e Zd ZdZdddZdS )MappedNDArrayz4
    A host array that uses CUDA mapped memory.
    r   c                 C      || _ || _d S r   rA   rB   rC   rA   rB   r   r   r   device_setup  r  zMappedNDArray.device_setupNr   r   r   r   r   r  r   r   r   r   r        r  c                   @   r  )ManagedNDArrayz5
    A host array that uses CUDA managed memory.
    r   c                 C   r  r   r  r  r   r   r   r    r  zManagedNDArray.device_setupNr   r  r   r   r   r   r    r  r  c                 C   s   t | j| j| j||dS )z/Create a DeviceNDArray object that is like ary.rB   rA   )r   r   r    r!   )rv   rB   rA   r   r   r   from_array_like  s   r  c                 C   s   t | j||dS )z.Create a DeviceRecord object that is like rec.r  )r   r!   )recrB   rA   r   r   r   from_record_like!  r   r!  c                 C   sF   | j r| js| S g }| j D ]}||dkrdntd q| t| S )aG  
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    r   N)r    r"   appendslicer$   )rv   
core_indexstrider   r   r   rp   &  s   

rp   c                 C   sR   | j j}tt| jt| jD ]\}}|dkr&|dkr&||kr" dS ||9 }qdS )z
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    r*   r   FT)r!   r1   r  reversedr   r    )rv   r"   r   r%  r   r   r   rL   8  s   rL   zArray contains non-contiguous buffer and cannot be transferred as a single memory region. Please ensure contiguous buffer with numpy .ascontiguousarray()c                 C   s,   t | }|jd s|jd sttd S d S )Nrb   rd   )rp   rg   r(   errmsg_contiguous_buffer)rv   corer   r   r   ro   N  s   ro   TFc                 C   s   t | r	| dfS t| drtj| dfS t| tjr#t	| |d}ntj
| ddd} t|  t| |d}|rVtjrO|sOt| tsOt| tjrOd}tt| |j| |d |dfS )z
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    FrO   r_   T)rP   rn   zGHost array used in CUDA kernel will incur copy overhead to/from device.)r7   rq   r   r   r   as_cuda_arrayr   r%   voidr!  rt   ro   r  r   CUDA_WARN_ON_IMPLICIT_COPYr   r   r   r   ry   )r   rB   rP   r  devobjr^   r   r   r   r   T  s2   


r   c                 C   s   |   |  }}| j|jkrtd| j|jf |j|jkr)td| j|jf | jr<|j|jkr>td| j|jf d S d S )Nzincompatible dtype: %s vs. %szincompatible shape: %s vs. %szincompatible strides: %s vs. %s)r   r!   	TypeErrorr   r(   r"   r    )ary1ary2ary1sqary2sqr   r   r   rr   {  s   


rr   rz   )r   TF)6r   r   r3   r5   rP   ctypesr   numpyr%   r   r   numba.cuda.cudadrvr   r   r7   
numba.corer   r   numba.np.unsafe.ndarrayr   
numba.miscr	   numba.npr
   numba.cuda.api_utilr   numba.core.errorsr   warningsr   r   r   r   r   r'   r#   DeviceArrayr)   r   r   r   objectr   r   r  r  r  r!  rp   rL   r'  ro   r   rr   r   r   r   r   <module>   sZ      Sg
+ :,




'