
    J/Phݜ                        d dl Zd dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m
Z* d dl+m,Z, d dl+m-Z- d dl.m/Z/ g dZ0 G d dej1                  Z2 G d de3          Z4 G d d          Z5 G d de          Z6 G d de          Z7 G d deej1                  Z8dS )     N)config	serializesigutilstypestypingutils)Cache	CacheImpl)global_compiler_lock)
Dispatcher)NumbaPerformanceWarning)Purposetypeof)get_current_device)wrap_arg)compile_cudaCUDACompiler)driver)get_context)cuda_target)missing_launch_config_msgnormalize_kernel_dimensions)r   cuda)_dispatcher)warn)hsinhcoshloghlog10hlog2hexphexp10hexp2hsqrthrsqrthfloorhceilhrcphrinthtrunchdivc                   b    e Zd ZdZe	 	 	 d fd	            Zed             Zed             Zd Z	ed	             Z
ed
             Ze fd            Zd Zd Zed             Zed             Zed             Zed             Zed             Zd Zd Zd Zd ZddZddZddZd Z xZS ) _Kernelz
    CUDA Kernel specialized for a given set of argument types. When called, this
    object launches the kernel on the device.
    NFTc                 z   |rt          d          t                                                       d| _        d | _        || _        || _        || _        || _        |pg | _	        ||
rdndd}t                      j        }t          | j        t          j        | j        | j        |||||	  	        }|j        }| j        j        }|j        }|j        }|                    |j        |j        ||||||	          \  }|sg }d                                v | _        | j        rd_        fd	t2          D             }|rvt4          j                            t4          j                            t<                              }t4          j                            |d
          }|                     |           |D ]}!                    |           |j"        | _#        |j$        | _$        |j%        | _&        | _'        |j(        | _(        || _        |j        | _        |j)        | _)        g | _*        g | _+        g | _,        d S )Nz,Cannot compile a device function as a kernelF   r   )fastmathoptdebuglineinfoinliner1   nvvm_optionscccudaCGGetIntrinsicHandleTc                 F    g | ]}d |                                  v |S )__numba_wrapper_)get_asm_str).0fnlibs     U/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/numba/cuda/dispatcher.py
<listcomp>z$_Kernel.__init__.<locals>.<listcomp>l   sB     B B Bb*b**coo.?.??? ???    zcpp_function_wrappers.cu)-RuntimeErrorsuper__init__
objectmodeentry_pointpy_funcargtypesr4   r5   
extensionsr   compute_capabilityr   r   voidtarget_context__code__co_filenameco_firstlinenoprepare_cuda_kernellibraryfndescr<   cooperativeneeds_cudadevrtcuda_fp16_math_funcsospathdirnameabspath__file__joinappendadd_linking_filename
entry_name	signaturetype_annotation_type_annotation_codelibrarycall_helperenvironment_referenced_environmentsliftedreload_init)selfrH   rI   linkr4   r5   r6   r1   rJ   max_registersr2   devicer7   r8   crestgt_ctxcodefilenamelinenumkernelresbasedirfunctions_cu_pathfilepathr?   	__class__s                           @r@   rE   z_Kernel.__init__.   s}   
  	OMNNN     
 $* !?11
 

  !!4DL%*dm"&*%-#)%-)5!# # # %|$#%11$,27<2:G2?A AV
  	D 69J9JJ 	'"&CB B B B0 B B B  	+goobgooh&?&?@@G "W-G!I !IKK)*** 	+ 	+H  **** !+ $ 4+ &k+(*%rB   c                     | j         S N)rd   rj   s    r@   rR   z_Kernel.library   s      rB   c                     | j         S rz   )rc   r{   s    r@   rb   z_Kernel.type_annotation   s    $$rB   c                     | j         S rz   )rg   r{   s    r@   _find_referenced_environmentsz%_Kernel._find_referenced_environments   s    ,,rB   c                 4    | j                                         S rz   )rM   codegenr{   s    r@   r   z_Kernel.codegen   s    "**,,,rB   c                 4    t          | j        j                  S rz   )tuplera   argsr{   s    r@   argument_typesz_Kernel.argument_types   s    T^()))rB   c	                    |                      |           }	t          | |	                                           d|	_        ||	_        ||	_        ||	_        d|	_        ||	_        ||	_	        ||	_
        ||	_        ||	_        |	S )&
        Rebuild an instance.
        N)__new__rD   rE   rG   rT   r`   ra   rc   rd   r4   r5   re   rJ   )clsrT   r_   ra   codelibraryr4   r5   re   rJ   instancerx   s             r@   _rebuildz_Kernel._rebuild   s     ;;s##c8%%'''#*"&$(! +$*(rB   c           
          t          | j        | j        | j        | j        | j        | j        | j        | j                  S )a  
        Reduce the instance for serialization.
        Compiled definitions are serialized in PTX form.
        Type annotation are discarded.
        Thread, block and shared memory configuration are serialized.
        Stream information is discarded.
        )rT   r_   ra   r   r4   r5   re   rJ   )	dictrT   r`   ra   rd   r4   r5   re   rJ   r{   s    r@   _reduce_statesz_Kernel._reduce_states   sE      0t"n$:K*t} $ 0T_N N N 	NrB   c                 8    | j                                          dS )z7
        Force binding to current CUDA context
        N)rd   
get_cufuncr{   s    r@   bindz_Kernel.bind   s     	$$&&&&&rB   c                 H    | j                                         j        j        S )zN
        The number of registers used by each thread for this kernel.
        )rd   r   attrsregsr{   s    r@   regs_per_threadz_Kernel.regs_per_thread   s    
  ++--388rB   c                 H    | j                                         j        j        S )zD
        The amount of constant memory used by this kernel.
        )rd   r   r   constr{   s    r@   const_mem_sizez_Kernel.const_mem_size       
  ++--399rB   c                 H    | j                                         j        j        S )zM
        The amount of shared memory used per block for this kernel.
        )rd   r   r   sharedr{   s    r@   shared_mem_per_blockz_Kernel.shared_mem_per_block   s    
  ++--3::rB   c                 H    | j                                         j        j        S )z:
        The maximum allowable threads per block.
        )rd   r   r   
maxthreadsr{   s    r@   max_threads_per_blockz_Kernel.max_threads_per_block   s    
  ++--3>>rB   c                 H    | j                                         j        j        S )zM
        The amount of local memory used per thread for this kernel.
        )rd   r   r   localr{   s    r@   local_mem_per_threadz_Kernel.local_mem_per_thread   r   rB   c                 4    | j                                         S )z6
        Returns the LLVM IR for this kernel.
        )rd   get_llvm_strr{   s    r@   inspect_llvmz_Kernel.inspect_llvm   s      --///rB   c                 8    | j                             |          S )z7
        Returns the PTX code for this kernel.
        )r8   )rd   r<   )rj   r8   s     r@   inspect_asmz_Kernel.inspect_asm   s      ,,,333rB   c                 4    | j                                         S )zv
        Returns the CFG of the SASS for this kernel.

        Requires nvdisasm to be available on the PATH.
        )rd   get_sass_cfgr{   s    r@   inspect_sass_cfgz_Kernel.inspect_sass_cfg   s      --///rB   c                 4    | j                                         S )zp
        Returns the SASS code for this kernel.

        Requires nvdisasm to be available on the PATH.
        )rd   get_sassr{   s    r@   inspect_sassz_Kernel.inspect_sass   s      ))+++rB   c                     | j         t          d          |t          j        }t	          | j        d| j        |           t	          d|           t	          | j         |           t	          d|           dS )
        Produce a dump of the Python source of this function annotated with the
        corresponding Numba IR and type information. The dump is written to
        *file*, or *sys.stdout* if *file* is *None*.
        Nz Type annotation is not available filezP--------------------------------------------------------------------------------zP================================================================================)rc   
ValueErrorsysstdoutprintr`   r   )rj   r   s     r@   inspect_typesz_Kernel.inspect_types  s      (?@@@<:D$*=*=>TJJJJhT""""d#$////hT""""""rB   r   c                     t                      }| j                                        }t          |t                    rt          j        d |          }|                    |||          }|j        j	        }||z  S )a  
        Calculates the maximum number of blocks that can be launched for this
        kernel in a cooperative grid in the current context, for the given block
        and dynamic shared memory sizes.

        :param blockdim: Block dimensions, either as a scalar for a 1D block, or
                         a tuple for 2D or 3D blocks.
        :param dynsmemsize: Dynamic shared memory size in bytes.
        :return: The maximum number of blocks in the grid.
        c                     | |z  S rz    )xys     r@   <lambda>z5_Kernel.max_cooperative_grid_blocks.<locals>.<lambda>&  s
    QU rB   )
r   rd   r   
isinstancer   	functoolsreduce$get_active_blocks_per_multiprocessorrm   MULTIPROCESSOR_COUNT)rj   blockdimdynsmemsizectxcufuncactive_per_smsm_counts          r@   max_cooperative_grid_blocksz#_Kernel.max_cooperative_grid_blocks  s     mm"--//h&& 	F '(:(:HEEH@@AIALN N :2x''rB   c                 J   | j                                         | j        ruj        dz   }j                            |          \  }}|t          j        t          j                  k    sJ t          j                    }	|	                    d|           g }
g }t          | j        |          D ]\  }}|                     ||||
|           t          j        r t          j                            d          }nd }|r|j        p|}t          j        j        g|||||R d| j        i | j        rt          j        t          j        |	          ||           |	j        dk    rfdfddD             }fddD             }|	j        }| j                            |          \  }}}|d	}n1|\  }}}t2          j                            |          }d
|d|d|d}|d|d|}|r|d|d         f|dd          z   }n|f} || |
D ]} |             d S )N__errcode__r   )streamrT   c                     j                             j        d| d          \  }}t          j                    }t          j        t          j        |          ||           |j        S )N__)	moduleget_global_symbolr_   ctypesc_intr   device_to_host	addressofvalue)r_   memszvalr   s       r@   load_symbolz#_Kernel.launch.<locals>.load_symbolS  sk    $m==?E{{{?Ctt?E F FGC !,..C)&*:3*?*?bIII9$rB   c                 ,    g | ]} d |z             S )tidr   r=   ir   s     r@   rA   z"_Kernel.launch.<locals>.<listcomp>[  s'    ===!{{519--===rB   zyxc                 ,    g | ]} d |z             S )ctaidr   r   s     r@   rA   z"_Kernel.launch.<locals>.<listcomp>\  s'    AAAaWq[11AAArB    zIn function z, file z, line z, ztid=z ctaid=z:    )rd   r   r4   r_   r   r   r   sizeofr   memsetzipr   _prepare_argsr   USE_NV_BINDINGbindingCUstreamhandlelaunch_kernelrT   r   r   r   re   get_exceptionrW   rX   rZ   )rj   r   griddimr   r   	sharedmemexcnameexcmemexcszexcvalretr
kernelargstvzero_streamstream_handler   r   rp   excclsexc_argsloclocinfosymrw   linenoprefixwbr   r   s                               @@r@   launchz_Kernel.launch-  s   "--//: 	,kM1G"M;;GDDMFEFM&,777777\^^FMM!FM+++ 
+T22 	? 	?DAqq!VT:>>>>  	 .11!44KKK06=?K 	V] 	;%	;&	; '	; +		;
 (	; 	; 	; *.)9	; 	; 	; :  	(!&"26":":FEJJJ|q  % % % % % >===u===AAAA5AAA|(,(8(F(Ft(L(L%#; GG,/)C6!wx88HHFIccFNhhFLffOG 18eeD ',2FFHQKK @B  %HH  &wHfh''  	 	BBDDDD	 	rB   c                 ,   t          | j                  D ]}|                    ||||          \  }}t          |t          j                  rt          |                              ||          }t          j	        }t          j
        d          }	t          j
        d          }
 ||j                  } ||j        j                  }t          j        |          }t          j        rt#          |          }t          j
        |          }|                    |	           |                    |
           |                    |           |                    |           |                    |           t'          |j                  D ]+}|                     ||j        |                              ,t'          |j                  D ]+}|                     ||j        |                              ,dS t          |t          j                  r8 t1          t          d|z            |          }|                    |           dS |t          j        k    rZt          j        t7          j        |                              t6          j                            }|                    |           dS |t          j        k    r+t          j        |          }|                    |           dS |t          j         k    r+t          j!        |          }|                    |           dS |t          j"        k    r8t          j#        t#          |                    }|                    |           dS |t          j$        k    rZ|                    t          j!        |j%                             |                    t          j!        |j&                             dS |t          j'        k    rZ|                    t          j        |j%                             |                    t          j        |j&                             dS t          |t          j(        t          j)        f          rF|                    t          j*        |                    t6          j+                                       dS t          |t          j,                  rnt          |                              ||          }|j-        }t          j        r!t          j
        t#          |                    }|                    |           dS t          |t          j.                  rSt_          |          t_          |          k    sJ ta          ||          D ]\  }}| 1                    |||||           dS t          |t          j2                  rD	 | 1                    |j        |j3        |||           dS # th          $ r ti          ||          w xY wti          ||          )zF
        Convert arguments to ctypes and append to kernelargs
        )r   r   r   zc_%sN)5reversedrJ   prepare_argsr   r   Arrayr   	to_devicer   	c_ssize_tc_void_psizedtypeitemsizer   device_pointerr   intr]   rangendimshapestridesIntegergetattrfloat16c_uint16npviewuint16float64c_doublefloat32c_floatbooleanc_uint8	complex64realimag
complex128
NPDatetimeNPTimedeltac_int64int64Recorddevice_ctypes_pointer	BaseTuplelenr   r   
EnumMemberr   NotImplementedError)rj   tyr   r   r   r   	extensiondevaryc_intpmeminfoparentnitemsr  ptrdataaxcvaldevrecr   r   s                       r@   r   z_Kernel._prepare_argsu  s    "$/22 	 	I,,	 -  GB b%+&& O	/c]],,T6::F%Foa((G_Q''FVFK((Fvfl344H'//C$ #hh?3''Dg&&&f%%%f%%%h'''d###FK(( < <!!&&b)9":":;;;;FK(( > >!!&&);"<"<====> > EM** 4	//766B;//44Dd#####5=  ?2:c??#7#7	#B#BCCDd#####5=  ?3''Dd#####5=  >#&&Dd#####5=  >#c((++Dd#####5?""fnSX66777fnSX66777775###foch77888foch7788888U-u/@ABB 	/fnSXXbh-?-?@@AAAAAEL)) 	/c]],,T6::F.C$ 0oc#hh//c"""""EO,, 	/r77c#hh&&&&B C C1""1azBBBBC C E,-- 		/3""Hciz     ' 3 3 3)"c2223 &b#...s   #W( (X)	NFFFFNNTFrz   )r   r   r   )__name__
__module____qualname____doc__r   rE   propertyrR   rb   r~   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   __classcell__rx   s   @r@   r.   r.   (   s?        
 ;@JN6;Z Z Z Z Z Zx ! ! X! % % X%- - - - - X- * * X*     [*N N N' ' ' 9 9 X9 : : X: ; ; X; ? ? X? : : X:0 0 04 4 40 0 0, , ,# # # #"( ( ( (,F F F FP\/ \/ \/ \/ \/ \/ \/rB   r.   c                        e Zd Zd Zd Zd ZdS )ForAllc                 |    |dk     rt          d|z            || _        || _        || _        || _        || _        d S )Nr   z0Can't create ForAll with negative task count: %s)r   
dispatcherntasksthread_per_blockr   r   )rj   rE  rF  tpbr   r   s         r@   rE   zForAll.__init__  sP    A::O%& ' ' '$ #"rB   c                     | j         dk    rd S | j        j        r| j        }n | j        j        | }|                     |          }| j         |z   dz
  |z  } |||| j        | j        f         | S )Nr   r   )rF  rE  specialized
specialize_compute_thread_per_blockr   r   )rj   r   rJ  r   r   s        r@   __call__zForAll.__call__  s    ;!F?& 	</KK4$/4d;K11+>>;)A-(:+{7Hdk>* +,02 	2rB   c                 $   | j         }|dk    r|S t                      }t          t          |j                                                            }t          |j                                        d| j	        d          } |j
        di |\  }}|S )Nr   i   )funcb2d_funcmemsizeblocksizelimitr   )rG  r   nextiter	overloadsvaluesr   rd   r   r   get_max_potential_block_size)rj   rE  rH  r   rs   kwargs_s          r@   rL  z ForAll._compute_thread_per_block  s    #!88J --C $z3::<<==>>F(3355#	  F 6S5????FAsJrB   N)r:  r;  r<  rE   rM  rL  r   rB   r@   rC  rC    sA        # # #2 2 2    rB   rC  c                       e Zd Zd Zd ZdS )_LaunchConfigurationc                     || _         || _        || _        || _        || _        t
          j        rFd}|d         |d         z  |d         z  }||k     r&d| d}t          t          |                     d S d S d S )N   r   r      z
Grid size zB will likely result in GPU under-utilization due to low occupancy.)	rE  r   r   r   r   r   CUDA_LOW_OCCUPANCY_WARNINGSr   r   )	rj   rE  r   r   r   r   min_grid_size	grid_sizemsgs	            r@   rE   z_LaunchConfiguration.__init__  s    $ "- 	3  M
WQZ/'!*<I=((AI A A A,S1122222	3 	3 )(rB   c                 f    | j                             || j        | j        | j        | j                  S rz   )rE  callr   r   r   r   rj   r   s     r@   rM  z_LaunchConfiguration.__call__  s2    ##D$,$(KA A 	ArB   N)r:  r;  r<  rE   rM  r   rB   r@   r[  r[    s7        3 3 3.A A A A ArB   r[  c                        e Zd Zd Zd Zd ZdS )CUDACacheImplc                 *    |                                 S rz   )r   )rj   rs   s     r@   r   zCUDACacheImpl.reduce   s    $$&&&rB   c                 $    t          j        di |S )Nr   )r.   r   )rj   rM   payloads      r@   rebuildzCUDACacheImpl.rebuild#  s    **'***rB   c                     dS )NTr   )rj   rn   s     r@   check_cachablezCUDACacheImpl.check_cachable&  s	     trB   N)r:  r;  r<  r   rk  rm  r   rB   r@   rg  rg    sA        ' ' '+ + +    rB   rg  c                   &     e Zd ZdZeZ fdZ xZS )	CUDACachezS
    Implements a cache that saves and loads CUDA kernels and compile results.
    c                     ddl m}  |d          5  t                                          ||          cd d d            S # 1 swxY w Y   d S )Nr   )target_overrider   )numba.core.target_extensionrq  rD   load_overload)rj   sigrM   rq  rx   s       r@   rs  zCUDACache.load_overload7  s     	@?????_V$$ 	> 	>77((n==	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   "AA	A)r:  r;  r<  r=  rg  _impl_classrs  r@  rA  s   @r@   ro  ro  1  sI           K> > > > > > > > >rB   ro  c                   ^    e Zd ZdZdZeZef fd	Ze	d             Z
d Z ej        d          d$d	            Zd
 Zd%dZe	d             Zd Zd Zd Zd Zd Ze	d             Zd&dZd&dZd&dZd&dZd&dZd Zd&dZd Zd Z d&dZ!d&dZ"d&dZ#d&d Z$d&d!Z%e&d"             Z'd# Z( xZ)S )'CUDADispatchera  
    CUDA Dispatcher object. When configured and called, the dispatcher will
    specialize itself for the given arguments (if no suitable specialized
    version already exists) & compute capability, and launch on the device
    associated with the current context.

    Dispatcher objects are not to be constructed by the user, but instead are
    created using the :func:`numba.cuda.jit` decorator.
    Fc                 l    t                                          |||           d| _        i | _        d S )N)targetoptionspipeline_classF)rD   rE   _specializedspecializations)rj   rH   ry  rz  rx   s       r@   rE   zCUDADispatcher.__init__R  sE    (6 	 	8 	8 	8 "  "rB   c                 *    t          j        |           S rz   )
cuda_typesrw  r{   s    r@   _numba_type_zCUDADispatcher._numba_type_b  s    (...rB   c                 8    t          | j                  | _        d S rz   )ro  rH   _cacher{   s    r@   enable_cachingzCUDADispatcher.enable_cachingf  s    --rB   r]  )maxsizer   c                 N    t          ||          \  }}t          | ||||          S rz   )r   r[  )rj   r   r   r   r   s        r@   	configurezCUDADispatcher.configurei  s,    7JJ#D'8VYOOOrB   c                 V    t          |          dvrt          d           | j        | S )N)r^  r0      z.must specify at least the griddim and blockdim)r*  r   r  re  s     r@   __getitem__zCUDADispatcher.__getitem__n  s1    t99I%%MNNNt~t$$rB   c                 *    t          | ||||          S )a3  Returns a 1D-configured dispatcher for a given number of tasks.

        This assumes that:

        - the kernel maps the Global Thread ID ``cuda.grid(1)`` to tasks on a
          1-1 basis.
        - the kernel checks that the Global Thread ID is upper-bounded by
          ``ntasks``, and does nothing if it is not.

        :param ntasks: The number of tasks.
        :param tpb: The size of a block. An appropriate value is chosen if this
                    parameter is not supplied.
        :param stream: The stream on which the configured dispatcher will be
                       launched.
        :param sharedmem: The number of bytes of dynamic shared memory required
                          by the kernel.
        :return: A configured dispatcher, ready to launch on a set of
                 arguments.)rH  r   r   )rC  )rj   rF  rH  r   r   s        r@   forallzCUDADispatcher.foralls  s    ( dFFiPPPPrB   c                 6    | j                             d          S )aS  
        A list of objects that must have a `prepare_args` function. When a
        specialized kernel is called, each argument will be passed through
        to the `prepare_args` (from the last object in this list to the
        first). The arguments to `prepare_args` are:

        - `ty` the numba type of the argument
        - `val` the argument value itself
        - `stream` the CUDA stream used for the current call to the kernel
        - `retr` a list of zero-arg functions that you may want to append
          post-call cleanup work to.

        The `prepare_args` function must return a tuple `(ty, val)`, which
        will be passed in turn to the next right-most `extension`. After all
        the extensions have been called, the resulting `(ty, val)` will be
        passed into Numba's default argument marshalling logic.
        rJ   )ry  getr{   s    r@   rJ   zCUDADispatcher.extensions  s    & !%%l333rB   c                 *    t          t                    rz   )r   r   )rj   r   rX  s      r@   rM  zCUDADispatcher.__call__  s    2333rB   c                     | j         r4t          t          | j                                                            }nt          j        j        | g|R  }|                    |||||           dS )zJ
        Compile if necessary and invoke this kernel with *args*.
        N)	rJ  rS  rT  rU  rV  r   r   
_cuda_callr  )rj   r   r   r   r   r   rs   s          r@   rd  zCUDADispatcher.call  sp      	D$t~44667788FF +6tCdCCCFdGXvyAAAAArB   c                 l     |rJ  fd|D             }                      t          |                    S )Nc                 :    g | ]}                     |          S r   typeof_pyvalr=   arj   s     r@   rA   z4CUDADispatcher._compile_for_args.<locals>.<listcomp>  s'    777QD%%a((777rB   )compiler   )rj   r   kwsrI   s   `   r@   _compile_for_argsz CUDADispatcher._compile_for_args  s=    7777$777||E(OO,,,rB   c                     	 t          |t          j                  S # t          $ rF t	          j        |          r0t          t	          j        |d          t          j                  cY S  w xY w)NF)sync)r   r   argumentr   r   is_cuda_arrayas_cuda_array)rj   r   s     r@   r  zCUDADispatcher.typeof_pyval  s    		#w/000 	 	 	!#&&  d05AAA%.0 0 0 0 0 	s    AA,*A,c                      j         rt          d          t                      j        }t	           fd|D                       } j                            ||f          }|r|S  j        }t           j	        |          }|
                    |           |                                 d|_        | j        ||f<   |S )zd
        Create a new instance of this dispatcher specialized for the given
        *args*.
        zDispatcher already specializedc              3   B   K   | ]}                     |          V  d S rz   r  r  s     r@   	<genexpr>z,CUDADispatcher.specialize.<locals>.<genexpr>  s1      <<!**1--<<<<<<rB   )ry  T)rJ  rC   r   rK   r   r|  r  ry  rw  rH   r  disable_compiler{  )rj   r   r8   rI   specializationry  s   `     r@   rK  zCUDADispatcher.specialize  s    
  	A?@@@!!4<<<<t<<<<<-112x.AA 	"!!*'6CE E Ex(((&&(((&*#-;R\*rB   c                     | j         S )z>
        True if the Dispatcher has been specialized.
        )r{  r{   s    r@   rJ  zCUDADispatcher.specialized  s    
   rB   Nc                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the number of registers used by each thread in this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get register
                          usage for. This may be omitted for a specialized
                          kernel.
        :return: The number of registers used by the compiled variant of the
                 kernel for the given signature and current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r=   rt  overloads      r@   
<dictcomp>z6CUDADispatcher.get_regs_per_thread.<locals>.<dictcomp>  s7     A A A%X 1 A A ArB   )rU  r   r   rJ  rS  rT  rV  itemsrj   ra   s     r@   get_regs_per_threadz"CUDADispatcher.get_regs_per_thread  s      >).1AA 	AT^22445566FFA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the size in bytes of constant memory used by this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get constant
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The size in bytes of constant memory allocated by the
                 compiled variant of the kernel for the given signature and
                 current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z5CUDADispatcher.get_const_mem_size.<locals>.<dictcomp>  s7     A A A%X 0 A A ArB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_const_mem_sizez!CUDADispatcher.get_const_mem_size  s      >).1@@ 	AT^22445566EEA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the size in bytes of statically allocated shared memory
        for this kernel.

        :param signature: The signature of the compiled kernel to get shared
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of shared memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z;CUDADispatcher.get_shared_mem_per_block.<locals>.<dictcomp>  7     A A A%X 6 A A ArB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_shared_mem_per_blockz'CUDADispatcher.get_shared_mem_per_block        >).1FF 	AT^22445566KKA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a(  
        Returns the maximum allowable number of threads per block
        for this kernel. Exceeding this threshold will result in
        the kernel failing to launch.

        :param signature: The signature of the compiled kernel to get the max
                          threads per block for. This may be omitted for a
                          specialized kernel.
        :return: The maximum allowable threads per block for the compiled
                 variant of the kernel for the given signature and current
                 device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z<CUDADispatcher.get_max_threads_per_block.<locals>.<dictcomp>,  s7     A A A%X 7 A A ArB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_max_threads_per_blockz(CUDADispatcher.get_max_threads_per_block  s      >).1GG 	AT^22445566LLA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the size in bytes of local memory per thread
        for this kernel.

        :param signature: The signature of the compiled kernel to get local
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of local memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z;CUDADispatcher.get_local_mem_per_thread.<locals>.<dictcomp>?  r  rB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_local_mem_per_threadz'CUDADispatcher.get_local_mem_per_thread/  r  rB   c                    | j         r"|                     t          |                     | j        j        }d                    |          }t          j        ||| j                  }t          j
        | j                  }||||fS )z
        Get a typing.ConcreteTemplate for this dispatcher and the given
        *args* and *kws* types.  This allows resolution of the return type.

        A (template, pysig, args, kws) tuple is returned.
        zCallTemplate({0}))key
signatures)_can_compilecompile_devicer   rH   r:  formatr   make_concrete_templatenopython_signaturesr   pysignature)rj   r   r  	func_namer_   call_templatepysigs          r@   get_call_templatez CUDADispatcher.get_call_templateB  s      	-d,,, L)	")))445iD,DF F F!$,//eT3..rB   c                 R   || j         vr| j        5  | j                            d          }| j                            d          }| j                            d          }| j                            d          }| j                            d          rdnd|d}t	                      j        }t          | j        ||||||||		  	        }	|	| j         |<   |	j        	                    |	j
        |	j        |	j        g           d
d
d
           n# 1 swxY w Y   n| j         |         }	|	S )zCompile the device function for the given argument types.

        Each signature is compiled once by caching the compiled function inside
        this object.

        Returns the `CompileResult`.
        r4   r5   r6   r1   r2   r0   r   )r2   r1   r3   N)rU  _compiling_counterry  r  r   rK   r   rH   rM   insert_user_functionrG   rS   rR   )
rj   r   return_typer4   r5   r6   r1   r7   r8   rn   s
             r@   r  zCUDADispatcher.compile_device]  s    t~%%( I I*..w77-11*==+//99-11*== !% 2 6 6u = =D111 (   
 ())<#DL+t*/-5+1-51=')+ + + (,t$#889I9=:>,I I I-I I I I I I I I I I I I I I I4 >$'Ds   C0DDDc                 b    d |D             }|                      ||d           || j        |<   d S )Nc                     g | ]	}|j         
S r   )_code)r=   r  s     r@   rA   z/CUDADispatcher.add_overload.<locals>.<listcomp>  s    +++Q+++rB   Tr   )_insertrU  )rj   rs   rI   c_sigs       r@   add_overloadzCUDADispatcher.add_overload  s?    ++(+++UF...#)x   rB   c                    t          j        |          \  }}||t          j        k    sJ | j        r3t          t          | j                                                            S | j        	                    |          }||S | j
                            || j                  }|| j        |xx         dz  cc<   nr| j        |xx         dz  cc<   | j        st!          d          t#          | j        |fi | j        }|                                 | j
                            ||           |                     ||           |S )z
        Compile and bind to the current context a version of this kernel
        specialized for the given signature.
        Nr   zCompilation disabled)r   normalize_signaturer   nonerJ  rS  rT  rU  rV  r  r  rs  	targetctx_cache_hits_cache_missesr  rC   r.   rH   ry  r   save_overloadr  )rj   rt  rI   r  rs   s        r@   r  zCUDADispatcher.compile  s\   
 !) <S A A+"kUZ&?&?&?&?  	T^224455666^''11F! **3??S!!!Q&!!!! s###q(###$ ;"#9:::T\8JJt7IJJFKKMMMK%%c6222&(+++rB   c                 T   | j                             d          }|E|r$| j        |         j                                        S | j        |                                         S |r#d | j                                        D             S d | j                                        D             S )z
        Return the LLVM IR for this kernel.

        :param signature: A tuple of argument types.
        :return: The LLVM IR for the given signature, or a dict of LLVM IR
                 for all previously-encountered signatures.

        rm   Nc                 H    i | ]\  }}||j                                          S r   )rR   r   r  s      r@   r  z/CUDADispatcher.inspect_llvm.<locals>.<dictcomp>  sC     E E E)C X-::<< E E ErB   c                 >    i | ]\  }}||                                 S r   )r   r  s      r@   r  z/CUDADispatcher.inspect_llvm.<locals>.<dictcomp>  s@     E E E)C X2244 E E ErB   )ry  r  rU  rR   r   r   r  )rj   ra   rm   s      r@   r   zCUDADispatcher.inspect_llvm  s     #''11  @~i08EEGGG~i0==??? EE E-1^-A-A-C-CE E E EE E-1^-A-A-C-CE E E ErB   c                    t                      j        | j                            d          }|G|r%| j        |         j                                      S | j        |                                       S |r%fd| j                                        D             S fd| j                                        D             S )a+  
        Return this kernel's PTX assembly code for for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The PTX code for the given signature, or a dict of PTX codes
                 for all previously-encountered signatures.
        rm   Nc                 L    i | ] \  }}||j                                       !S r   )rR   r<   r=   rt  r  r8   s      r@   r  z.CUDADispatcher.inspect_asm.<locals>.<dictcomp>  sF     E E E)C X-99"== E E ErB   c                 B    i | ]\  }}||                               S r   )r   r  s      r@   r  z.CUDADispatcher.inspect_asm.<locals>.<dictcomp>  sC     E E E)C X11"55 E E ErB   )	r   rK   ry  r  rU  rR   r<   r   r  )rj   ra   rm   r8   s      @r@   r   zCUDADispatcher.inspect_asm  s      !!4#''11  A~i08DDRHHH~i0<<R@@@ EE E E E-1^-A-A-C-CE E E EE E E E-1^-A-A-C-CE E E ErB   c                     | j                             d          rt          d          || j        |                                         S d | j                                        D             S )a  
        Return this kernel's CFG for the device in the current context.

        :param signature: A tuple of argument types.
        :return: The CFG for the given signature, or a dict of CFGs
                 for all previously-encountered signatures.

        The CFG for the device in the current context is returned.

        Requires nvdisasm to be available on the PATH.
        rm   z'Cannot get the CFG of a device functionNc                 >    i | ]\  }}||                                 S r   )r   r=   rt  defns      r@   r  z3CUDADispatcher.inspect_sass_cfg.<locals>.<dictcomp>  s:     = = =!T ..00 = = =rB   )ry  r  rC   rU  r   r  r  s     r@   r   zCUDADispatcher.inspect_sass_cfg  s{     !!(++ 	JHIII >),==???= =%)^%9%9%;%;= = = =rB   c                     | j                             d          rt          d          || j        |                                         S d | j                                        D             S )a  
        Return this kernel's SASS assembly code for for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The SASS code for the given signature, or a dict of SASS codes
                 for all previously-encountered signatures.

        SASS for the device in the current context is returned.

        Requires nvdisasm to be available on the PATH.
        rm   z(Cannot inspect SASS of a device functionNc                 >    i | ]\  }}||                                 S r   )r   r  s      r@   r  z/CUDADispatcher.inspect_sass.<locals>.<dictcomp>  s:     = = =!T **,, = = =rB   )ry  r  rC   rU  r   r  r  s     r@   r   zCUDADispatcher.inspect_sass  s{     !!(++ 	KIJJJ >),99;;;= =%)^%9%9%;%;= = = =rB   c                     |t           j        }| j                                        D ]\  }}|                    |           dS )r   Nr   )r   r   rU  r  r   )rj   r   rY  r  s       r@   r   zCUDADispatcher.inspect_types  sU     <:D~++-- 	* 	*GAtD))))	* 	*rB   c                      | ||          }|S )r   r   )r   rH   ry  r   s       r@   r   zCUDADispatcher._rebuild  s    
 3w..rB   c                 8    t          | j        | j                  S )zd
        Reduce the instance for serialization.
        Compiled definitions are discarded.
        )rH   ry  )r   rH   ry  r{   s    r@   r   zCUDADispatcher._reduce_states  s%    
 DL"&"46 6 6 	6rB   r9  )r   r   r   rz   )*r:  r;  r<  r=  
_fold_argsr   targetdescrr   rE   r>  r  r  r   	lru_cacher  r  r  rJ   rM  rd  r  r  rK  rJ  r  r  r  r  r  r  r  r  r  r   r   r   r   r   r?  r   r   r@  rA  s   @r@   rw  rw  @  s         JK>J " " " " " "  / / X/. . . Y%%%P P P &%P% % %
Q Q Q Q, 4 4 X4(4 4 4	B 	B 	B- - -    0 ! ! X!A A A A&A A A A(A A A A&A A A A*A A A A&/ / /6% % % %N* * *
" " "HE E E E.E E E E0= = = =*= = = =,
* 
* 
* 
*   [6 6 6 6 6 6 6rB   rw  )9numpyr  rW   r   r   r   
numba.corer   r   r   r   r   r   numba.core.cachingr	   r
   numba.core.compiler_lockr   numba.core.dispatcherr   numba.core.errorsr   numba.core.typing.typeofr   r   numba.cuda.apir   numba.cuda.argsr   numba.cuda.compilerr   r   numba.cuda.cudadrvr   numba.cuda.cudadrv.devicesr   numba.cuda.descriptorr   numba.cuda.errorsr   r   
numba.cudar~  numbar   r   warningsr   rV   ReduceMixinr.   objectrC  r[  rg  ro  rw  r   rB   r@   <module>r     s       				 



      H H H H H H H H H H H H H H H H / / / / / / / / 9 9 9 9 9 9 , , , , , , 5 5 5 5 5 5 4 4 4 4 4 4 4 4 - - - - - - $ $ $ $ $ $ : : : : : : : : % % % % % % 2 2 2 2 2 2 - - - - - -< < < < < < < < * * * * * *                  * * * i/ i/ i/ i/ i/i# i/ i/ i/X+ + + + +V + + +\A A A A A A A A:    I   $> > > > > > > >a6 a6 a6 a6 a6Z!6 a6 a6 a6 a6 a6rB   