§
    J/Ph  ã                   ó:   — d dl mZ d dlmZ d dlZd dlmZ dd„ZdS )é    )Úcuda)ÚdriverN)Únumpy_supportc           	      ó¾  ‡‡— t          | dd¦  «        }|sT| j        \  }}| j        j        |z  | j        j        f}t          j        j                             ||f|| j        |¬¦  «        }t          j	        | j        ¦  «        Št          j        ¦   «         j        }t          t          j        dt          j        |d¦  «        dz  ¦  «        ¦  «        }t          ||z  ¦  «        }||dz   fŠt          j        ˆˆfd„¦   «         }	t          |j        d         |z  dz   ¦  «        t          |j        d         |z  dz   ¦  «        f}
||f} |	|
||f         | |¦  «         |S )aá  Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    Ústreamr   )Údtyper   é   é   c                 ó8  •— t           j                             ‰
‰	¬¦  «        }t           j        j        }t           j        j        }t           j        j        t           j        j        z  }t           j        j        t           j        j        z  }||z   }||z   }||z   | j        d         k     r)||z   | j        d         k     r| ||z   ||z   f         |||f<   t          j	        ¦   «          ||j        d         k     r"||j        d         k     r|||f         |||f<   d S d S d S )N)Úshaper   r   r
   )
r   ÚsharedÚarrayÚ	threadIdxÚxÚyÚblockIdxÚblockDimr   Úsyncthreads)ÚinputÚoutputÚtileÚtxÚtyÚbxÚbyr   r   ÚdtÚ
tile_shapes            €€ú\/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/numba/cuda/kernels/transpose.pyÚkernelztranspose.<locals>.kernel)   s  ø€ õ Œ{× Ò  z¸Ð Ñ<Ô<ˆåŒ^ÔˆÝŒ^ÔˆÝŒ]Œ_tœ}œÑ.ˆÝŒ]Œ_tœ}œÑ.ˆØ‰GˆØ‰Gˆà‰7U”[ ”^Ò#Ð#¨¨R©°%´+¸a´.Ò(@Ð(@Ø   b¡¨"¨r©'Ð!1Ô2ˆDR‰LÝÔÑÔÐØˆvŒ|˜AŒÒÐ 1 v¤|°A¤Ò#6Ð#6Ø  B œ<ˆF1a4‰LˆLˆLð ÐÐ#6Ð#6ó    )Úgetattrr   r   Úitemsizer   ÚcudadrvÚdevicearrayÚDeviceNDArrayÚnpsÚ
from_dtyper   Ú
get_deviceÚMAX_THREADS_PER_BLOCKÚintÚmathÚpowÚlogÚjit)ÚaÚbr   ÚcolsÚrowsÚstridesÚtpbÚ
tile_widthÚtile_heightr   ÚblocksÚthreadsr   r   s               @@r   Ú	transposer9      su  øø€ õ Q˜ !Ñ$Ô$€Fàð Ø”W‰
ˆˆdØ”'Ô" TÑ)¨1¬7Ô+;Ð;ˆÝŒLÔ$×2Ò2Ø4ˆLØØ”'Øð	 3ñ ô ˆõ 
Œ˜œÑ	 Ô	 €Bå
Ô
Ñ
Ô
Ô
3€Cå•T”X˜a¥¤¨#¨qÑ!1Ô!1°AÑ!5Ñ6Ô6Ñ7Ô7€JÝc˜JÑ&Ñ'Ô'€Kà˜z¨A™~Ð.€Jå	„Xð(ð (ð (ð (ð (ñ „Xð(õ$ ”˜”˜kÑ)¨AÑ-Ñ.Ô.µ°A´G¸A´JÀÑ4KÈaÑ4OÑ0PÔ0PÐP€Fà˜:Ð%€GØ#€Fˆ67˜FÐ"Ô# A qÑ)Ô)Ð)à€Hr    )N)	Únumbar   Únumba.cuda.cudadrv.driverr   r+   Únumba.npr   r&   r9   © r    r   ú<module>r>      sb   ðØ Ð Ð Ð Ð Ð Ø ,Ð ,Ð ,Ð ,Ð ,Ð ,Ø €€€Ø )Ð )Ð )Ð )Ð )Ð )ð:ð :ð :ð :ð :ð :r    