"""
A library written in CUDA Python for generating reduction kernels
    )
from_dtype       c                   	 ddl m                      d          |           	t          dz   t          t          z                      d          	fd                                d          	fd                                d          	fd            	fd	}                    |          S )
Nr   cudaT)device   c                 0   j         j        }|t          z  }|t          z  }| |ddf         }|||<                                    t          dz  }|rC||k     r ||         } 	||||z                      ||<                                    |dz  }|AdS dS )z8
        Compute reduction within a single warp
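
        A sequential sketch of the same halving pattern (illustrative only;
        ``vals`` stands in for the warp's 32 values held in shared memory)::

            width = _WARPSIZE // 2
            while width:                      # 16, 8, 4, 2, 1
                for lane in range(width):     # fold upper half onto lower half
                    vals[lane] = reduce_op(vals[lane], vals[lane + width])
                width //= 2                   # lane 0 ends up with the result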
        """
        tid = cuda.threadIdx.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        sm_this = sm_partials[warpid, :]
        sm_this[laneid] = init
        cuda.syncwarp()

        width = _WARPSIZE // 2
        while width:
            if laneid < width:
                old = sm_this[laneid]
                sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
            cuda.syncwarp()
            width //= 2

    @cuda.jit(device=True)
    def device_reduce_full_block(arr, partials, sm_partials):
        """
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space.  The algorithm works as follows:

            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                        block-0:  |   x |     |     |   x |     |
                        block-1:  |     |   x |     |     |   x |
                        block-2:  |     |     |   x |     |     |

        The array is divided into chunks of 128 (size of a threadblock).
        The threadblocks consume the chunks in round-robin order.
        First, a threadblock loads a chunk into temp memory.  Then, all
        subsequent chunks are combined into the temp memory.

        Once all chunks are processed, an inner-block reduction is performed
        on the temp memory, leaving one scalar result per block.  The result
        from each block is stored to `partials` at the dedicated slot.
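
        A host-side sketch of the block-stride traversal each thread performs
        (illustrative only; every name besides ``arr`` is hypothetical)::

            def block_stride_reduce(arr, tid, blkid, blksz, gridsz, op):
                start = tid + blksz * blkid   # first element owned by thread
                step = blksz * gridsz         # stride over the whole grid
                acc = arr[start]
                for i in range(start + step, arr.size, step):
                    acc = op(acc, arr[i])
                return acc                    # one accumulator per thread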
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        gridsz = cuda.gridDim.x

        # block-stride loop to compute the reduction
        start = tid + blksz * blkid
        stop = arr.size
        step = blksz * gridsz

        # load the first value
        tmp = arr[start]
        # loop over the remaining values in block-stride
        for i in range(start + step, stop, step):
            tmp = reduce_op(tmp, arr[i])

        cuda.syncthreads()
        # inner-warp reduction
        inner_warp_reduction(sm_partials, tmp)

        cuda.syncthreads()
        # at this point, only the first slot for each warp in `sm_partials`
        # is valid

        # finish up the block reduction
        # warning: this assumes exactly 4 warps per block
        if tid < 2:
            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                            sm_partials[tid + 2, 0])
            cuda.syncwarp()
        if tid == 0:
            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])

    @cuda.jit(device=True)
    def device_reduce_partial_block(arr, partials, sm_partials):
        """
        This computes a reduction on `arr`.
        This device function must be used by one threadblock only.
        The blocksize must match `arr.size` and must not be greater than 128.
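
        An illustrative layout for a 70-thread block (hypothetical size)::

            warp 0: lanes  0-31, fully populated -> inner_warp_reduction
            warp 1: lanes 32-63, fully populated -> inner_warp_reduction
            warp 2: lanes 64-69, 6 values only   -> folded serially into slot 0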
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        size = arr.size
        # load the first value
        value = arr[tid]
        sm_partials[warpid, laneid] = value

        cuda.syncthreads()

        if (warpid + 1) * _WARPSIZE < size:
            # fully populated warp
            inner_warp_reduction(sm_partials, value)
        else:
            # partially populated warp: fold its values serially into slot 0
            sm_this = sm_partials[warpid, :]
            base = warpid * _WARPSIZE
            for i in range(base + 1, size):
                sm_this[0] = reduce_op(sm_this[0], sm_this[i - base])

        cuda.syncthreads()
        # finish up: combine the per-warp results
        if tid == 0:
            num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE

            result = sm_partials[0, 0]
            for i in range(1, num_active_warps):
                result = reduce_op(result, sm_partials[i, 0])

            partials[blkid] = result

    def gpu_reduce_block_strided(arr, partials, init, use_init):
        """
        Perform a reduction on *arr*, writing the partial reduction results
        into *partials*.  The length of *partials* is determined by the
        number of threadblocks. The initial value is set with *init*.

        Launch config:

        The blocksize must be a multiple of the warpsize and is limited to 4
        warps.
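
        An illustrative launch of the compiled kernel (hypothetical shapes;
        `arr` and `partials` are assumed to be device arrays)::

            kernel = cuda.jit(gpu_reduce_block_strided)
            # 3 blocks x 128 threads -> one partial result per block
            kernel[3, 128](arr, partials, init, True)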
        """
        tid = cuda.threadIdx.x

        sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
                                        dtype=nbtype)
        if cuda.blockDim.x == max_blocksize:
            device_reduce_full_block(arr, partials, sm_partials)
        else:
            device_reduce_partial_block(arr, partials, sm_partials)
        # deal with the initializer
        if use_init and tid == 0 and cuda.blockIdx.x == 0:
            partials[0] = reduce_op(partials[0], init)

    return cuda.jit(gpu_reduce_block_strided)


class Reduce(object):
    """Create a reduction object that reduces values using a given binary
    function. The binary function is compiled once and cached inside this
    object. Keeping this object alive will prevent re-compilation.
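
    A minimal usage sketch (assumes a CUDA-capable device or the CUDA
    simulator is available)::

        import numpy as np

        sum_reduce = Reduce(lambda a, b: a + b)
        A = np.arange(1234, dtype=np.float64) + 1
        got = sum_reduce(A)   # CUDA reduction
        expect = A.sum()      # NumPy reduction
        assert np.isclose(got, expect)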
    """

    _cache = {}

    def __init__(self, functor):
        """
        :param functor: A function implementing a binary operation for
                        reduction. It will be compiled as a CUDA device
                        function using ``cuda.jit(device=True)``.
        """
        self._functor = functor

    def _compile(self, dtype):
        key = self._functor, dtype
        if key in self._cache:
            kernel = self._cache[key]
        else:
            kernel = _gpu_reduce_factory(self._functor, from_dtype(dtype))
            self._cache[key] = kernel
        return kernel

    def __call__(self, arr, size=None, res=None, init=0, stream=0):
        """Performs a full reduction.

        :param arr: A host or device array.
        :param size: Optional integer specifying the number of elements in
                    ``arr`` to reduce. If this parameter is not specified, the
                    entire array is reduced.
        :param res: Optional device array into which to write the reduction
                    result. The result is written into the first element of
                    this array. If this parameter is specified, then no
                    communication of the reduction output takes place from the
                    device to the host.
        :param init: Optional initial value for the reduction, the type of which
                    must match ``arr.dtype``.
        :param stream: Optional CUDA stream in which to perform the reduction.
                    If no stream is specified, the default stream of 0 is
                    used.
        :return: If ``res`` is specified, ``None`` is returned. Otherwise, the
                result of the reduction is returned.
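
        A sketch of keeping the result on the device (``reducer`` and ``out``
        are hypothetical names)::

            out = cuda.device_array(1, dtype=arr.dtype)
            reducer(arr, res=out)  # result stays on the GPU in out[0]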
        """
        from numba import cuda

        # ensure a 1d array
        if arr.ndim != 1:
            raise TypeError("only support 1D array")

        # adjust the array size
        if size is not None:
            arr = arr[:size]

        init = arr.dtype.type(init)  # ensure the right type

        # return `init` if `arr` is empty
        if arr.size < 1:
            return init

        kernel = self._compile(arr.dtype)

        # perform the reduction on the GPU
        blocksize = _NUMWARPS * _WARPSIZE
        size_full = (arr.size // blocksize) * blocksize
        size_partial = arr.size - size_full
        full_blockct = min(size_full // blocksize, _NUMWARPS * 4)

        # allocate the partials array
        partials_size = full_blockct
        if size_partial:
            # one extra slot for the partial block
            partials_size += 1
        partials = cuda.device_array(shape=partials_size, dtype=arr.dtype)

        if size_full:
            # kernel for the fully populated threadblocks
            kernel[full_blockct, blocksize, stream](arr[:size_full],
                                                    partials[:full_blockct],
                                                    init,
                                                    True)

        if size_partial:
            # kernel for the partially populated threadblock
            kernel[1, size_partial, stream](arr[size_full:],
                                            partials[full_blockct:],
                                            init,
                                            not full_blockct)

        if partials.size > 1:
            # finish up with a reduction over the partial results
            kernel[1, partials_size, stream](partials, partials, init, False)

        # handle the return value
        if res is not None:
            res[:1].copy_to_device(partials[:1], stream=stream)
            return
        else:
            return partials[0]