from numba import cuda
from numba.cuda.cudadrv.driver import driver
import math
from numba.np import numpy_support as nps


def transpose(a, b=None):
    """Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    """

    # Prefer the stream of 'a', if it has one.
    stream = getattr(a, 'stream', 0)

    if not b:
        # Allocate the output with transposed shape and matching strides.
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols),
            strides,
            dtype=a.dtype,
            stream=stream)

    dt = nps.from_dtype(a.dtype)

    # Factor the available threads per block into x and y tile dimensions.
    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    # Pad the shared-memory tile by one column to avoid shared-memory
    # bank conflicts, per the algorithm in the blog post above.
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        x = by + tx
        y = bx + ty

        # Stage a tile of the input in shared memory, then write it back
        # transposed, so both the global read and the global write are
        # coalesced; the transposition happens on the shared-memory read.
        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # One block per tile, rounded up to cover the array edges.
    blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
    # One thread per tile element.
    threads = tile_height, tile_width

    kernel[blocks, threads, stream](a, b)

    return b
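

# A minimal usage sketch (an illustrative addition, not part of the numba
# module): round-trip a small host array through the kernel. It assumes a
# CUDA-capable GPU and a working numba.cuda installation.
if __name__ == '__main__':
    import numpy as np

    a = np.arange(12, dtype=np.float32).reshape(3, 4)
    d_a = cuda.to_device(a)  # copy the input to the device
    d_b = transpose(d_a)     # allocates the (4, 3) output on the device
    assert np.array_equal(d_b.copy_to_host(), a.T)
    print('transpose OK')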