
    0Ph6                    $   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZ d dlmZ d Zej                            d	g d
          d             Zej                            d	g d
          d             Zej                            dej        ej        ej        g          ej                            dej        ej        ej        g          d                         Zej                            dej        ej        ej        g          d             Zd Zd Zd Z d Z!d Z"ej                            dg dg dg ej#        g dg dg           ej#        g dg dge$           ej#        g ddej%        d gge$           ej#        g dd e&d!          d gge$           ej#        g d"g d#ge$           ej#        g d$dej%        dgge$           ej#        g d$d e&d!          dgge$          gg d%&          d'             Z'ej                            d	g d
          ej                            d(d)d*g          ej                            d+dd,g          d-                                     Z(ej                            d(d)d*g          ej                            d.d/d0gd1d0gd/d0ggg d2g d3g d2gfd4dgd5dgd6d7gd5dggg d8g d9g d:gfg          d;                         Z)d< Z*ej                            d+g d=          ej                            d>g d=          d?                         Z+ej                            d@dAdBg          ej                            dd1d/g ej#        dCdDg          g          dE                         Z,ej                            d@dAdBg          dF             Z-ej                            dGdHd0gdId0ggdHdIgd0ggej.        f ej#        d1d/gdJd/gg          d1dJgd/ggej/        f ej#        dKd gdLd gge$          dKdLgd ggej.        f ej#        dKd gdLd gg          dKdLgd ggej0        f ej#        d1d/gej%        d/gg          d1ej%        gd/ggej        f ej#        dKej%        gdej%        gge$          dKdgej%        ggej.        f ej#        dK e&d!          gd e&d!          gge$          dKdg e&d!          ggej.        fgg dM&          dN             Z1ej                            d	g d
          ej                            dO ej#        dd7gge$          j2         ej#        ddPgge$          j2        g dQgej.        f ej#        d1d/ggdR          j2         ej#        d1dSggdR          j2        g dTgej3        f ej#        dd7gge$          j2         ej#        ddPgge$          j2         ej#        g dQ          gej.        f ej#        ddgge$          j2         ej#        dd7gge$          j2        g dUge$f ej#        dd7gge$          j2         ej#        dej%        gge$          j2        g dVge$f ej#        ddgge$          j2         ej#        dej%        gge$          j2        g dWge$fgg dX&          dY                         Z4dZ Z5ej                            d[e
eg          d\             Z6d] Z7d^ Z8ej                            d_d,d`dagfdbg dcfg dddedfgfgg dg&          dh             Z9di Z:ej                            dg dg dg ej#        g djg dkg           ej#        g dg dge$          gg dl&          dm             Z;ej                            dO ej#        dd7gge$          j2         ej#        ddPgge$          j2        g dQgej.        f ej#        d1d/ggdR          j2         ej#        d1dSggdR          j2        g dTgej3        f ej#        dd7gge$          j2         ej#        ddPgge$          j2         ej#        g dQ          gej.        fgg dn&          do             Z<dp Z=dq Z>ej                            dre&e?g          ds             Z@dt ZAdu ZBdv ZCdw ZDdx ZEdy ZFej                            d+dbd,g          dz             ZGej                            d{ej%        d e&d!          g          d|             ZHej                            d+dHdJgg d}g          d~             ZIej                            dd*d)gddg&          ej                            d+d,g dgd,dg&          d                         ZJej                            d[e
eg          d             ZKej                            ddd/iddiddid/dddSddg          ej                            ddg dgg          d                         ZLej                            d+dbd,d7gg          d             ZMej                            d+dgdPgg          d             ZNej                            dddJiddiddiddiddidJdddSddg          d             ZOej                            d+d,d7gg          d             ZPej                            d+dgdPgg          d             ZQd ZRej                            ddJd1dddSig          d             ZSd ZTd ZUd ZVd ZWd ZXej                            ddd1dg          d             ZYej                            dd/dJdg          d             ZZej                            dg d          ej                            dg d          d                         Z[d Z\ej                            d{ej%        dg          d             Z]d Z^ej                            d	g d
          ej                            dddg          d                         Z_ej                            d	g d
          d             Z`ej                            d	g d
          d             Zaej                            d	g d
          d             Zbd Zcd Zdej                            dej%        dg          d             Zeej                            dddg          ej                            dej%        dg          d                         Zfej                            dO ej#        dej%        gge$          j2         ej#        dd7gge$          j2         ej#        ddPej%        ge$          gej.        f ej#        dej%        gge$          j2         ej#        dd7gge$          j2         ej#        ddPej%        ge$          gej.        f ej#        dej%        ggej                  j2         ej#        dCggej                  j2         ej#        ddDej%        g          gej        fgg d&          d             Zgej                            d[e
eg          d             Zhej                            d ej#        dej%        dCgg          j2         ej#        dej%        dgg          j2         ej#        dDgg          f ej#        g d¢g          j2         ej#        g dâg          j2         ej#        ej%        gg          f ej#        dej%        d7gge$          j2         ej#        dej%        dgg          j2         ej#        dPgge$          f ej#        g dŢge$          j2         ej#        g dƢg          j2         ej#        ej%        gge$          fg          dǄ             Ziej                            de          dɄ             Zjdʄ Zkej                            dddLgg ej#        ddLggdͬ           ej#        ddLggdά          g          ej                            ddKdLgg ej#        dKdLggdͬ           ej#        dKdLggdά          g          dЄ                         Zldф Zmd҄ Zndӄ Zoej                            dd*d)g          dՄ             Zpej                            d ej#        dgdgge$          d gej%        gej%        gg ejq        dgdgdgge$          f ej#        ej%        gdgdgge$          d gej%        gej%        gg ejq        dgej%        gej%        gge$          fg          d؄             Zrdل Zsdڄ Ztdۄ Zud܄ Zvd݄ Zwej                            dddJiddiddiddiddidJdddSddg          dބ             Zxd߄ Zyd Zzd Z{d Z|ej                            dddidd/ig          d             Z}ej                            ddd1iddig          d             Z~d Zd Zej                            d[e
eg          d             ZdS )    N)sparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equal)CSR_CONTAINERSc                     t          j        g dg dg          } t                      }t          d          }|                    |           }|                    |           }|j        dk    sJ |j        dk    sJ t          j        |          sJ t          j        |          rJ t          |                                g dg dg           t          |                                |           d S )N         r   r   r   Fsparse_outputr      )              ?r   r   r   )r   r   r   r   r   )	nparrayr   fit_transformshaper   issparser
   toarray)X
enc_sparse	enc_denseX_trans_sparseX_trans_denses        i/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_denser$      s    	)))YYY'((AJE222I--a00N++A..M6))))&((((?>*****}-----   #<#<#<>W>W>W"X   ~--//?????    handle_unknown)ignoreinfrequent_if_existwarnc                 n   t          j        g dg dg dg          }t          j        g dg          }t          d          }|                    |           t	          j        t          d          5  |                    |           d d d            n# 1 swxY w Y   t          |           }|                    |           |                                }t          |                    |          
                                t          j        g d	g                     t          ||           d S )
N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr&   Found unknown categoriesmatch)r   r   r   r   r   r   r   )r   r   r   fitpytestraises
ValueError	transformcopyr
   r   r	   r&   r   X2oh	X2_passeds        r#   #test_one_hot_encoder_handle_unknownr;   *   sa   
)))YYY			233A	999+		B 
g	.	.	.BFF1III	z)C	D	D	D  
R               
n	5	5	5BFF1III		I
Y''))
555677  
 B	"""""s   5BBBc                    t          j        g d                              d          }t          j        ddg                              d          }t          |           }|                    |           |                                }t          |                    |                                          t          j        g dg dg                     t          ||           d S )N)11111111223334444)r   55555r>   r-   )r   r   r   r   r   r   r   r   )	r   r   reshaper   r1   r6   r
   r5   r   r7   s        r#   +test_one_hot_encoder_handle_unknown_stringsrE   B   s    
22233;;GDDA	7D/	"	"	*	*7	3	3B
 
n	5	5	5BFF1III		I
Y''))
&&&(<(<(<=>>  
 r9%%%%%r%   output_dtypeinput_dtypec                 l   t          j        ddgg|           j        }t          j        ddgddgg|          }t          d|          }t	          |                    |                                          |           t	          |                    |                              |                                          |           t          d|d          }t	          |                    |          |           t	          |                    |                              |          |           d S )Nr   r   dtypeauto)
categoriesrJ   F)rL   rJ   r   )	r   asarrayTr   r
   r   r   r1   r5   )rG   rF   r   
X_expectedr9   s        r#   test_one_hot_encoder_dtyperP   U   s    	
QF8;///1AaVaV,LAAAJ	&	=	=	=Br''**2244jAAArvvayy**1--5577DDD	&E	R	R	RBr''**J777rvvayy**1--z:::::r%   c                    t          j        d          }|                    ddgddgd          }t          j        g dg dg| 	          }t          | 	          }t          |                    |                                          |           t          |	                    |          
                    |                                          |           t          | d
          }t          |                    |          |           t          |	                    |          
                    |          |           d S )Npandasabr   r   ABr   r   r   r   r   r   r   r   rI   F)rJ   r   )r2   importorskip	DataFramer   r   r   r
   r   r   r1   r5   )rF   pdX_dfrO   r9   s        r#   !test_one_hot_encoder_dtype_pandasr^   d   s    		X	&	&B<<sCj1v6677D<<<6lKKKJ	\	*	*	*Br''--5577DDDrvvd||--d33;;==zJJJ	\	?	?	?Br''--z:::rvvd||--d33Z@@@@@r%   c                     t                      } g dg dg dg dg}|                     |           |                                 }t          g d|           |                     g d          }t          g d|           t	          j        t          d	          5  |                     d
dg           d d d            d S # 1 swxY w Y   d S )N)Maler   girlr   r   )Female)   ra   r   
   )r`   3   boy   r   )r`   [   ra         )	x0_Femalex0_Malex1_1x1_41x1_51x1_91x2_boyx2_girlx3_1x3_2x3_12x3_21x4_3x4_10x4_30)onetwothreefourfive)
one_Femaleone_Maletwo_1two_41two_51two_91	three_boy
three_girlfour_1four_2four_12four_21five_3five_10five_30z!input_features should have lengthr/   rz   r{   )r   r1   get_feature_names_outr
   r2   r3   r4   )encr   feature_namesfeature_names2s       r#   "test_one_hot_encoder_feature_namesr   t   sT   
//C!!!%%%"""$$$		A GGAJJJ--//M	
 	
 	
" 	%  * ../V/V/VWWN	
 	
 	
" 	%  * 
z)L	M	M	M 2 2!!5%.1112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2s   CCCc                  0   t                      } t          j        ddggt                    j        }|                     |           |                                 }t          ddg|           |                     dg          }t          dd	g|           d S )
Nu   c❤t1dat2rI   u	   x0_c❤t1x0_dat2u   n👍meinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrN   r1   r   r
   )r   r   r   s      r#   *test_one_hot_encoder_feature_names_unicoder      s    
//C
8V$%V4446AGGAJJJ--//MY/???--i[-IIM(.9=IIIIIr%   c                     d } t          |           }t          j        ddggt                    j        }|                    |           |                                }t          ddg|           |                    dg	          }t          d
dg|           d }t          |                              |          }d}t          j	        t          |          5  |                                 ddd           dS # 1 swxY w Y   dS )z=Check the behaviour of `feature_name_combiner` as a callable.c                 ,    | dz   t          |          z   S )N_)reprfeaturecategorys     r#   name_combinerzHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner   s    }tH~~--r%   )feature_name_combinerNoneNrI   z	x0_'None'x0_NonerS   r   za_'None'a_Nonec                     dS )Nr    r   s     r#   wrong_combinerzItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combiner   s    qr%   zMWhen `feature_name_combiner` is a callable, it should return a Python string.r/   )r   r   r   r   rN   r1   r   r
   r2   r3   	TypeError)r   r   r   r   r   err_msgs         r#   1test_one_hot_encoder_custom_feature_name_combinerr      sg   . . . m
<
<
<C
64.!0002AGGAJJJ--//MY/???--cU-CCM
H-}===   n
=
=
=
A
A!
D
DCW  
y	0	0	0 $ $!!###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   D  DDc                     t          j        ddgg          j        } t                      }|                    g dg           |                                d         g dgk    sJ |                    |                                           j        dk    sJ |                    g dg           |                    |                                           j        dk    sJ d S )	Nr   r   )r   r   r   r   rL   rL   )r   r+   )r   r   r   r   r+   r   )	r   r   rN   r   
set_params
get_paramsr   r   r   )r   r9   s     r#   test_one_hot_encoder_set_paramsr      s    
1a&A	BMMlll^M,,,==??<(\\\N::::A&&((.&8888MMooo.M///A&&((.&888888r%   c                 F   t          d          }|                    |           }t          dd          }|                    |           }t          |                                |           t	          j        |          r|j        dk    sJ |                                S )NrK   r   FrL   r   csr)r   r   r	   r   r   r   format)r   r   Xtr1Xtr2s       r#   check_categorical_onehotr      s    
6
*
*
*CQD
6
?
?
?CQDDLLNND)))?4  9T[E%9%9%9%9<<>>r%   r   defr   7   abcr   r   )rd   r   r   )r   r   r   )rT   rV   cat)rS   rW   r   rI   )rT   r   r   rS   r   nan)Nr   r   )rS   r   r   )Nr   N)mixednumericr   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)idsc                    t          t          j        |           d d dgf                   }t          |ddgddgg           t          t          j        |           d d ddgf                   }t          |g dg dg           t	          d                              |           }t          |                                g dg dg           d S )	Nr   r   )r   r   r   r   r   r   r   r   rK   r   )r   r   r   r   r   )r   r   r   r   r   )r   r   r   r	   r   r   r   )r   Xtrs     r#   test_one_hot_encoderr      s    0 #28A;;qqq1#v#6
7
7CC1a&1a&)***
"28A;;qqq1a&y#9
:
:CC,,,5666
6
*
*
*
8
8
;
;CCKKMMOOO___#EFFFFFr%   sparse_FTdropfirstc                    g dg dg dg}t          ||          }|                    |          }t          j        |t                    }t          |                    |          |           ddgddgd	dgg}t          |d
|          }|                    |          }t          j        |          }t          |                    |          |           |g dg dg dg}t          || ddgddgg dg          }|                    |          }t          j        |t                    }d |d<   t          |                    |          |           ddgddgd	dgg}t          |ddgddgg|           }|                    |          }t          j        |t                    }d |d<   d |d d df<   t          |                    |          |           t          j        g dg dg          }t          j        d          }t          j
        t          |          5  |                    |           d d d            d S # 1 swxY w Y   d S )Nr   r   )r   r   r   r   r   rI   r   r   r   r   rK   )r   rL   r   r   r   )6   r   8   )r   r&   rL   )r   r   r   r   )r   rL   r&   r   r   r   r   r   r   )Shape of the passed X data is not correctr/   )r   r   r   r   r   r
   inverse_transformreescaper2   r3   r4   )r&   r   r   r   r   X_trexpmsgs           r#   test_one_hot_encoder_inverser     s    
8A
gD
9
9
9CQD
(1F
#
#
#Cs,,T22C888
R1b'Ar7#A
g&t
L
L
LCQD
(1++Cs,,T22C888| ^^^^^^^^<!)A=
 
 

   ##hq'''D	30066<<< Wq"g2w'!AR))
 
 

   ##hq'''D	AAAqD	30066<<< 8YYY			*++D
)?
@
@C	z	-	-	- $ $d###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   'I

IIz
X, X_transr   r   r   r   r   r   r   rz   r{   r|   rT   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                    t          |                              |           }d}|rt          |d          }t          j        t
          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r/   N)r   r1   r   r2   r3   r4   r   )r   X_transr   r   r   s        r#   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownr   A  s    & g
.
.
.
2
21
5
5C	A 
  8$Wh77	z	-	-	- ' 'g&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 's   A66A:=A:c                      t          j        ddgddgddggt                    } t          dd	          }|                    |           }t          |                    |          |            d S )
Nr`   r   rb   r   r   rI   	if_binaryFr   r   )r   r   r   r   r   r
   r   )r   oher   s      r#   &test_one_hot_encoder_inverse_if_binaryr   a  sr    
61+!}xm<FKKKA
[
>
>
>CQDs,,T22A66666r%   )r   r   N
reset_dropc                    t          j        ddgddgddggt                    }t          | d          }|                    |           |                    |          }|                                }|                    |	           t          |	                    |          |           t          |                    |          |           t          |                                |           d S )
Nr`   r   rb   r   r   rI   Fr   r   )r   r   r   r   r1   r5   r   r   r
   r   r	   )r   r   r   r   r   r   s         r#   test_one_hot_encoder_drop_resetr   h  s     	61+!}xm<FKKKA
T
7
7
7CGGAJJJ==D--//MNN
N###s,,T22A666CMM!$$d+++s0022MBBBBBr%   methodr1   r         @      @c                     t                      }d}t          j        t          |          5   t	          ||          |            d d d            d S # 1 swxY w Y   d S )Nz'Expected 2D array, got 1D array insteadr/   )r   r2   r3   r4   getattr)r   r   r9   r   s       r#   test_X_is_not_1Dr   w  s     
B
3C	z	-	-	-  FA                 s   AAAc                 8   t          j        d          }|                    g d          }t                      }dt	          |           d}t          j        t          |          5   t          ||           |           d d d            d S # 1 swxY w Y   d S )NrR   )   r   r+   r   z+Expected a 2-dimensional container but got z	 instead.r/   )r2   rZ   Seriesr   typer3   r4   r   )r   r\   r   r9   r   s        r#   test_X_is_not_1D_pandasr     s    		X	&	&B
		,,,A	B
JQ
J
J
JC	z	-	-	-  FA                 s   (BBBzX, cat_exp, cat_dtyper   r   r   rV   rW   )r   r   r   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                    | | d d d         fD ]}t          d          }|                    |           t          |j        t                    sJ t          |j        |          D ]\  }}|                                }t          |d                   r0t          |d                   sJ |d d         |d d         k    sJ n|                                |k    sJ t          j	        |j
        |          sJ d S )NrA   rK   r   )r   r1   
isinstancecategories_listziptolistr   r   
issubdtyperJ   )r   cat_exp	cat_dtypeXir   resr   res_lists           r#   test_one_hot_encoder_categoriesr     s   F !DDbD'l 7 7v...#/400000COW55 	7 	7HCzz||HSW%% +$Xb\22222}CRC00000zz||s****=I666666	77 7r%   zX, X2, cats, cat_dtypedrS   rT   cint64r+   r   r   r   )NrS   z)rS   rT   r  )rS   Nr  )r   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanc                 `   t          |          }t          j        g dg dg          }t          |                    |                                           |           t          |j        d                   t          |d                   k    sJ |j        d         	                                t          |d                   k    sJ |j        d         j
        |k    sJ t          |          }t          j        t          d          5  |                    |           d d d            n# 1 swxY w Y   t          ||          }t          j        g dg dg          }t          |                    |                              |                                          |           d S )	Nr   r   r   r   r   r   r   r   r.   r/   rL   r&   )r   r   r   )r   r   r   r
   r   r   r   rL   r   r   rJ   r2   r3   r4   r1   r5   )r   r8   catsr   r&   r   r   s          r#   )test_one_hot_encoder_specified_categoriesr	    s   f 4
(
(
(C
(OOO___5
6
6Cs((++3355s;;;q!""d47mm3333?1$$&&$tAw--7777 ?1#y0000 4
(
(
(C	z)C	D	D	D                
4
G
G
GC
(OOO___5
6
6Cswwr{{,,R0088::C@@@@@s   D00D47D4c                  D   t          j        ddggt                    j        } t	          g dg          }t          j        g dg dg          }t          |                    |                               |                                           |           t          |	                    |                                           |           |j
        d                                         g dk    sJ t          j        |j
        d         j        t           j                  sJ t          j        d	d
gg          j        } t	          g dg          }d}t          j        t"          |          5  |	                    |            d d d            d S # 1 swxY w Y   d S )NrS   rT   rI   )rT   rS   r   r   r  r  r   r   r   )r   r   r   z%Unsorted categories are not supportedr/   )r   r   r   rN   r   r
   r1   r5   r   r   r   r   r   rJ   object_r2   r3   r4   )r   r   r   r   s       r#   (test_one_hot_encoder_unsorted_categoriesr    s   
3*V,,,.A
OOO#4
5
5
5C
(OOO___5
6
6Cswwqzz++A..6688#>>>s((++3355s;;;?1$$&&///9999=+12:>>>>> 	1a&A
III;
/
/
/C
1C	z	-	-	-  !                 s   2FFFEncoderc                 :   t          j        dt           j        dg          g} | |          }t          j        ddggt                    j        }t          j        t          d          5  |                    |           ddd           dS # 1 swxY w Y   dS )zTest encoder for specified categories that nan is at the end.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    r   r   r   rI   zNan should be the last elementr/   N)	r   r   r   r   rN   r2   r3   r4   r1   r  r  r   r   s       r#   ,test_encoder_nan_ending_specified_categoriesr    s     Ha^$$%D
'T
"
"
"C
1a&(((*A	z)I	J	J	J  


                    -BBBc                  |   t          j        ddgddggt                    j        } t	          g dg dg          }t          j        g d	g d
g          }t          |                    |                                           |           |j        d         	                                g dk    sJ t          j
        |j        d         j        t           j                  sJ |j        d         	                                g dk    sJ t          j
        |j        d         j        t           j                  sJ d S )NrS   rT   r   r   rI   r   )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   rN   r   r
   r   r   r   r   r   rJ   r  r   r   r   s      r#   7test_one_hot_encoder_specified_categories_mixed_columnsr  $  s-   
3*q!f%V4446A
OOOYYY#?
@
@
@C
(2224R4R4RS
T
TCs((++3355s;;;?1$$&&///9999=+12:>>>>>?1$$&&)))3333=+12:>>>>>>>r%   c                      t          j        d          } |                     ddgddgd          }t          |          }t	          |g dg dg           d S )	NrR   rS   rT   r   r   rU   rX   rY   )r2   rZ   r[   r   r	   )r\   r]   r   s      r#   test_one_hot_encoder_pandasr  1  sc    		X	&	&B<<sCj1v6677D
"4
(
(CC,,,566666r%   zdrop, expected_namesx0_cx2_br   )r  x1_2r  )r   r   rT   x0_bx2_a)r   binarymanualc                     g dg dg}t          |           }|                    |           |                                }t          ||           d S )N)r   r   rS   )rT   r   rT   r   )r   r1   r   r
   )r   expected_namesr   r   r   s        r#   'test_one_hot_encoder_feature_names_dropr   :  s\     
&A
T
"
"
"CGGAJJJ--//M~}55555r%   c                  *   ddgddgddgg} t          j        g dg dg dg          }t          j        d d	g          }t          d
d          }|                    |           }t	          |j        |           t          ||           ddgddgddgg} t          j        ddgddgddgg          }t          j        d	d g          }t          d
d          }|                    |           }t	          |j        |           t          ||           d S )Nrd   yes   norj   )r   r   r   r   rC   )r   r   r   r   r   r   Fr   truerS   falser   r   )r   r   r   r   r
   	drop_idx_r	   )r   expectedexpected_drop_idxr   results        r#   *test_one_hot_encoder_drop_equals_if_binaryr+  L  sE   
er4j2u+.Ax			3335I5I5IJ H $++
[
>
>
>Cq!!Fs}&7888FH%%% ###7Ax#sc3Z#s<==H!T++
[
>
>
>Cq!!Fs}&7888FH%%%%%r%   )rd   r   r   )r#  r   r   )r   r   r   c                 ,   t                      }t          j        g dg dgd          }t          |                    |           |                    d                     t          d          }t          |                    |           |           d S )Nr   r   r   r   r   r   r  rI   float64)r   r   r   r
   r   astyper  s      r#   test_ordinal_encoderr1  d  s     

C
(IIIyyy)
9
9
9Cs((++SZZ	-B-BCCC
w
'
'
'Cs((++S11111r%   )r   r   zobject-string-catc                 T   t          |          }t          j        dgdgg          }t          |                    |           |           t          |j        d                   t          |d                   k    sJ |j        d                                         t          |d                   k    sJ |j        d         j	        |k    sJ t          |          }t          j        t          d          5  |                    |           d d d            d S # 1 swxY w Y   d S )Nr   r   r   r   r.   r/   )r   r   r   r
   r   r   rL   r   r   rJ   r2   r3   r4   r1   )r   r8   r  r   r   r   s         r#   )test_ordinal_encoder_specified_categoriesr3  u  sY   2 D
)
)
)C
(SEC5>
"
"Cs((++S111q!""d47mm3333?1$$&&$tAw--7777 ?1#y0000 D
)
)
)C	z)C	D	D	D                   s   :DD!$D!c                     g dg dg} t                      }|                    |           }t          j        | t                    }t          |                    |          |           t          j        g dg dg          }t          j        d          }t          j
        t          |          5  |                    |           d d d            d S # 1 swxY w Y   d S )Nr   r   rI   )r   r   r   r   rX   r   r/   )r   r   r   r   r   r
   r   r   r   r2   r3   r4   )r   r   r   r   r   s        r#   test_ordinal_encoder_inverser5    s   	(A


CQD
(1F
#
#
#Cs,,T22C888 8\\\<<<011D
)?
@
@C	z	-	-	- $ $d###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   3CCCc                     t          dd          } t          j        ddgddgdd	ggt          
          }t          j        ddgddgddggt          
          }|                     |           |                     |          }t          j        ddgddgddggd
          }t          ||           |                     |          }t          j        dd gd dgddggt          
          }t          ||           d S )Nuse_encoded_valuer&   unknown_valuerS   xrT   yr   r  rI   xyblar   r   r   r  )r   r   r   r   r1   r5   r
   r   )r   X_fitr   X_trans_encr   X_trans_invinv_exps          r#   +test_ordinal_encoder_handle_unknowns_stringrC    s
   
(;2
N
N
NCHsCj3*sCj9HHHEhdeS\C:>fMMMGGGENNN--((K
(QGb!Wq!f-W
=
=
=C{C(((''44KhddC[3*=VLLLG{G,,,,,r%   rJ   c                    t          dd          }t          j        ddgddgdd	gg| 
          }t          j        ddgddgddgg| 
          }|                    |           |                    |          }t          j        ddgddgddggd
          }t          ||           |                    |          }t          j        dd gd dgddggt          
          }t          ||           d S )Nr7  r9  r      r      r   	   rI   rg      r   r  )r   r   r   r1   r5   r
   r   r   )rJ   r   r?  r   r@  r   rA  rB  s           r#   ,test_ordinal_encoder_handle_unknowns_numericrJ    s
   
(;4
P
P
PCHq!fq!fq!f-U;;;EhB"a1a&1???GGGENNN--((K
(QIay1a&1
A
A
AC{C(((''44KhD	D!9q!f5VDDDG{G,,,,,r%   c                     t          dt          j                  } t          j        dgdgdgg          }|                     |           |                     dgdgdgg          }t          |dgdgt          j        gg           d S )Nr7  r9  r   r   r   r+   r   )r   r   r   r   r1   r5   r
   )r   r?  r   s      r#   (test_ordinal_encoder_handle_unknowns_nanrL    s     (;26
R
R
RCHqcA3_%%EGGENNNmmaS1#sO,,Gw!qcBF8 455555r%   c                     t          dt          j        t                    } t          j        dgdgdgg          }t          j        t          d          5  |                     |           d d d            d S # 1 swxY w Y   d S )Nr7  )r&   r:  rJ   r   r   r   z'dtype parameter should be a float dtyper/   )	r   r   r   intr   r2   r3   r4   r1   )r   r?  s     r#   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtyperO    s     *"&  C HqcA3_%%E	z)R	S	S	S                   s   A::A>A>c                  
   t          j        g dgt                    j        } g d}t	          |          }d}t          j        t          |          5  |                    |            d d d            d S # 1 swxY w Y   d S )N)LowMediumHighrR  rQ  rI   )rQ  rR  rS  r   z*Shape mismatch: if categories is an array,r/   )	r   r   r   rN   r   r2   r3   r4   r1   )r   r  r   r   s       r#   +test_ordinal_encoder_raise_categories_shaperT    s    
<<<=VLLLNA$$$D
D
)
)
)C
6C	z	-	-	-  


                 s   A88A<?A<c            	      L   t          d          t          j        g dg dgd          } t          j        ddgd	d
ggd          t          j        ddgd	d
ggd          t          j        ddgddgg          t          j        ddgddgg          t          j        ddgd	dggd          fD ]w                               t	          fdt          d          D                       sJ t                                                                        |            xddgd	d
gg                               t	          fdt          d          D                       sJ t                                                                        |            ddgd	dgg                               t	          fdt          d          D                       sJ t                                                                        |            d S )NrK   r   )r   r   r   r   )r   r   r   r   r/  rI   r   r   r   r+   r  rS   rT   r   r      a   b   c   dr   c                 F    g | ]}j         |         j        j        k    S r   r   rJ   ).0ir   r   s     r#   
<listcomp>z'test_encoder_dtypes.<locals>.<listcomp>  s*    JJJACOA&,7JJJr%   c                 n    g | ]1}t          j        j        |         j        t           j                  2S r   )r   r   r   rJ   integerr\  r]  r   s     r#   r^  z'test_encoder_dtypes.<locals>.<listcomp>  s1    VVVcoa06
CCVVVr%   c                 <    g | ]}j         |         j        d k    S )r   r[  ra  s     r#   r^  z'test_encoder_dtypes.<locals>.<listcomp>  s(    GGG"(H4GGGr%   )	r   r   r   r1   allranger
   r5   r   )r   r   r   s    @@r#   test_encoder_dtypesre    sD   
6
*
*
*C
((((*>*>*>?y
Q
Q
QC 	1a&1a&!111
1a&1a&!333
3*sCj)**
4,t-..
1c(QH%X666 	< 	< 	


JJJJJqJJJKKKKK3==++3355s;;;;
Q!QAGGAJJJVVVVUSTXXVVVWWWWWs}}Q''//113777
SAs8AGGAJJJGGGGeAhhGGGHHHHHs}}Q''//11377777r%   c                  (   t          j        d          } t          d          t          j        g dg dgd          }|                     dd	gd
dgddgdd          }                    |           t          fdt          d	          D                       sJ t          
                    |                                          |           |                     dd	gddgddgd          }|d         j        |d         j        |d         j        g                    |           t          fdt          d
          D                       sJ t          
                    |                                          |           d S )NrR   rK   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r/  rI   r   r   r   r+   r   r   rV   rW   Cr  c                 <    g | ]}j         |         j        d k    S )r  r[  ra  s     r#   r^  z.test_encoder_dtypes_pandas.<locals>.<listcomp>  s(    FFF"(G3FFFr%   rS   rT   r   r   rV   rW   rh  c                 H    g | ]}j         |         j        |         k    S r   r[  )r\  r]  X_typer   s     r#   r^  z.test_encoder_dtypes_pandas.<locals>.<listcomp>  s,    HHH!"(F1I5HHHr%   )r2   rZ   r   r   r   r[   r1   rc  rd  r
   r5   r   rJ   )r\   r   r   rk  r   s      @@r#   test_encoder_dtypes_pandasrl    s   		X	&	&B
6
*
*
*C
(	'	'	')G)G)GH  C
 	Aq6AaV<<GLLAGGAJJJFFFFU1XXFFFGGGGGs}}Q''//113777
Aq6c
#sDDEEAflAcFL!C&,7FGGAJJJHHHHHuQxxHHHIIIIIs}}Q''//11377777r%   c                      t                      } ddgddgg}t          j                    5  t          j        d           |                     |           d d d            d S # 1 swxY w Y   d S )Nr`   r   rb   r   r,   )r   warningscatch_warningssimplefilterr   )r   r   s     r#   test_one_hot_encoder_warningrq    s    
//C
!xm$A		 	"	"  g&&&!                 s   *A!!A%(A%c                 d   ddgddgddgg}t          | ddddgddgg          }|                    |           d	dgg}t          j        ddgg          }d
}t	          j        t          |          5  |                    |          }ddd           n# 1 swxY w Y   t          ||           dS )z,Check handle_unknown='warn' works correctly.rS   r   rT   r   r   Fr)   r   r   r&   rL   r   qFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr/   N	r   r1   r   r   r2   warnsUserWarningr5   r	   )r   r   r   X_testrO   warn_msgr   s          r#   test_ohe_handle_unknown_warnrz  %  s    qC8c1X&A
#JA'	  C GGAJJJAhZFAq6(##J	A  
k	2	2	2 ( (--''( ( ( ( ( ( ( ( ( ( ( ( ( ( (GZ(((((   3BBBmissing_valuec                    dddd| g}t          |          }g dg ddddd| gg}|                    |                                          }g dg d	g d
g}t          ||           |j        |u sJ d t          |j        |j                  D             }|                    |          }t          j
        |t                    }t          |d                   rt          |d d         |d d                    t          |d                   sJ t          |d                   sJ t          |d d d df         |d d d df                    t          |dd df         |dd df                    t          |d                   sJ t          |d                   sJ d S t          ||           t          ||           d S )Nr   rg   r   r   r   )r   rg   r   r   rS   )r   rg   r   r   rS   )r   r   r   r   r   )r   r   r   r   r   r   c                 $    g | ]\  }}||         S r   r   )r\  r   r   s      r#   r^  z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>M  s-       %gG  r%   rI   rA   )rA   rA   )r   r   r   r
   r   r   r   r'  r   r   r   r   r   )	r|  cats_to_dropr   r   transr   dropped_catsX_inv_transX_arrays	            r#    test_one_hot_encoder_drop_manualr  ?  s   2q"m4L
\
*
*
*C	Ar=)	A
 a  ((**E??OOO___
=Cuc"""8|#### ),S_cm)L)L  L ''..Khq'''G \"%&& 1<,l3B3.?@@@\"-.....\"-.....7111crc6?K3B3,?@@@ 	72ss7+[SbS-ABBBWV_-----[01111111<6667K00000r%   )r   r   rc   rS   c                     t          |           }d}t          j        t          |          5  |                    g dg dg dg           d d d            d S # 1 swxY w Y   d S )Nr   z-`drop` should have length equal to the numberr/   r   r   )r   r   ;   )r   r2   r3   r4   r1   )r   r   r   s      r#   test_invalid_drop_lengthr  d  s    
T
"
"
"C=G	z	1	1	1 B B@AAAB B B B B B B B B B B B B B B B B Bs   AA!Adensityr   denserS   r   rT   r  c                    t          |           }t          | |          }g dg dg}|                    |           |                    |           t          |j        |j                   |dk    rt          |j        d           n=t          ||j        |j                  D ]!\  }}}|t          |                   |k    sJ "t          |j        t          j	                  sJ |j        j
        t          k    sJ d S )Nr   r   )r   r   rS   r  r   r   )r   r1   r
   r   r'  r   rN  r   r   ndarrayrJ   r   )r  r   ohe_baseohe_testr   drop_catdrop_idxcat_lists           r#   test_categoriesr  l  s    7333H7>>>H	&ALLOOOLLOOOx+X-ABBBw8-q1111,/($h&:-
 -
 	7 	7(Hh CMM*h66666h("*55555#v------r%   c                 V     |                                              j        j        sJ d S )N)__sklearn_tags__
input_tagscategorical)r  s    r#   "test_encoders_has_categorical_tagsr    s,    799%%''2>>>>>>r%   kwargsmax_categoriesmin_frequency   g(\?r   )r  r  rg   rL   rK   rS   rT   r   r   c                 H   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d|d	d
d|                     |          }t          |j        g dg           dgdgdgdgdgg}t          j        ddgddgddgddgddgg          }|                    |          }t          ||           d dgdgdz  z   D             }|	                    |          }t          ||           |
                                }	t          ddg|	           dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.rS   r   rT   r#  r   rd   r   r   r(   F)rL   r&   r   rS   r   r   er   r   c                     g | ]}|gS r   r   r\  cols     r#   r^  z2test_ohe_infrequent_two_levels.<locals>.<listcomp>      HHHcSEHHHr%   infrequent_sklearnr+   r  x0_infrequent_sklearnNr   r   r   rN   r   r1   r
   infrequent_categories_r5   r	   r   r   )
r  rL   X_trainr   rx  r(  r   expected_invX_invr   s
             r#   test_ohe_infrequent_two_levelsr    st    h	SEBJ.#;seaiGHIIKG
 ,  	 
 
c'll  s1OOO3DEEEecUSEC53%0Fx!Q!Q!Q!Q!Q@AAHmmF##GHg&&&HHcU.B-Ca-G%GHHHL!!'**E|U+++--//M 78-HHHHHr%   c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|                               |          }|j        d         |j        d                  dk    sJ t          j        dgdgg          }|                    |          }t          dgdgg|           |	                                }t          dg|           |                    |          }t          dgdgg|           dS )z3Test two levels and dropping the frequent category.rS   r   rT   r#  r   rd   r   r   r(   Fr   r&   r   r  r   r   r   r  r  N)r   r   rN   r   r1   r   r'  r5   r	   r   r
   r   )r   r  r   rx  r   r   	X_inverses          r#   ,test_ohe_infrequent_two_levels_drop_frequentr    s/    h	SEBJ.#;seaiGHIIKG
,	  
 
c'll  ?1cmA./36666Xusen%%FmmF##GaS1#J(((--//M/0-@@@%%g..I 456	BBBBBr%   c                 <   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|           }d| d         d}t	          j        t          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.rS   r   rT   r#  r   rd   r   r   r(   Fr   r  Unable to drop category r   ( from feature 0 because it is infrequentr/   Nr   r   rN   r   r2   r3   r4   r1   r   r  r   r   s       r#   5test_ohe_infrequent_two_levels_drop_infrequent_errorsr    s   
 h	SEBJ.#;seaiGHIIKG
,	  C YT!W
X
X
XC	z	-	-	-                      .BBBrH  gQ?g{Gz?rG  c                 6   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          dd	d
d|                     |          }t          |j        ddgg           dgdgdgdgdgg}t          j        g dg dg dg dg dg          }|                    |          }t          ||           dgdgdgdgdgg}|	                    |          }t          ||           |
                                }t          g d|           dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.rS   r   rT   r#  r   rd   r   r   r(   Fr&   r   r  r.  r   r   r   r-  r  )r  r  r  Nr   r  )	r  r  r   rx  r(  r   r  r  r   s	            r#    test_ohe_infrequent_three_levelsr    sp     h	SEBJ.#;seaiGHIIKG
 ,E EK 	c'll  s1S#J<@@@ecUSEC53%0FxIIIyyy)))YYYOPPHmmF##GHg&&& 
				L !!'**E|U+++--//M@@@-PPPPPr%   c                 ^   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|                               |          }t          j        dgdgdgg          }t          ddgddgddgg|                    |                     |                    d                              |           d}t          j	        t          |          5  |                    dgdgg          }ddd           n# 1 swxY w Y   t          ddgddgg|           dS )z5Test three levels and dropping the frequent category.rS   r   rT   r#  r   rd   r   r   r(   Fr  r   r   r'   r-   r.   r/   r  N)r   r   rN   r   r1   r	   r5   r   r2   rv  rw  )r   r  r   rx  r   r   s         r#   .test_ohe_infrequent_three_levels_drop_frequentr    s    h	SEBJ.#;seaiGHIIKG
,	  
 
c'll  XusecU+,,FaVaVaV,cmmF.C.CDDD NN(N++//888
$C	k	-	-	- 0 0--#//0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 aVaV$g.....s   &DDDc                 <   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|           }d| d         d}t	          j        t          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )z7Test three levels and dropping the infrequent category.rS   r   rT   r#  r   rd   r   r   r(   Fr  r  r   r  r/   Nr  r  s       r#   7test_ohe_infrequent_three_levels_drop_infrequent_errorsr    s    h	SEBJ.#;seaiGHIIKG
,	  C YT!W
X
X
XC	z	-	-	-                   r  c                  "   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        } t          d	d
d                              |           }t          |j        ddgg           dgdgdgdgg}t          j        g dg dg dg dg          }|                    |          }t          ||           dgg}d}t          j
        t          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.rS   r   rT   r#  r   rd   r   r   r,   F)r&   r   r  r.  r  r-  badz.Found unknown categories \['bad'\] in column 0r/   N)r   r   rN   r   r1   r
   r  r5   r	   r2   r3   r4   )r  r   rx  r(  r   r   s         r#   (test_ohe_infrequent_handle_unknown_errorr  '  s    h	SEBJ.#;seaiGHIIKG
eA  	c'll  s1S#J<@@@ ecUSEC5)FxIIIyyy)))DEEHmmF##GHg&&& gYF
;C	z	-	-	-  f                 s   !DDDc                    t          j        dgdz  dgdz  z   gt                    j        }t	          dg dgddd	|                     |          }dgd
gdgdgdgg}t          j        ddgddgddgddgddgg          }|                    |          }t          ||           dddgg}dgdgg}|D ]R}|                    |                              |           t          dgdgg|                    |                     SdS )zG'a' is the only frequent category, all other categories are infrequent.rS   r   r  rj   rI   r   r   rS   rT   Fr(   rL   r   r&   rT   r   r   r   r   r   r   r   Nr   )	r   r   r   rN   r   r1   r5   r	   r   )r  r  r   rx  r(  r   dropsr   s           r#   5test_ohe_infrequent_two_levels_user_cats_one_frequentr  ?  s^    h	SEBJ./v>>>@G
 (((),  	 
 
c'll  ecUSEC53%0Fx!Q!Q!Q!Q!Q@AAHmmF##GHg&&& kC5)EecU^F ; ;D!!%%g...!qc
CMM&$9$9::::; ;r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        } t	          g d
gddd                              |           }t          |j        g dg           dgdgdgdgdgg}t          j        ddgddgddgddgddgg          }|                    |          }t          ||           d dgdgdz  z   D             }|
                    |          }t          ||           dS )zFTest that the order of the categories provided by a user is respected.rS   r   rT   r#  r   rd   r   r   rI   r  Fr(   r   rL   r   r&   r  )r   r   rS   r  r   r   c                     g | ]}|gS r   r   r  s     r#   r^  z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>q  r  r%   r  r+   Nr   r   r   rN   r   r1   r
   r  r5   r	   r   r  r   rx  r(  r   r  r  s          r#   (test_ohe_infrequent_two_levels_user_catsr  [  s\   h
cURZ	3%"*	,uqy	89    (((),	  
 
c'll  s1OOO3DEEEecUSEC53%0Fx!Q!Q!Q!Q!Q@AAHmmF##GHg&&& IHcU.B-Ca-G%GHHHL!!'**E|U+++++r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        } t	          g d
gddd                              |           }t          |j        ddgg           dgdgdgdgdgg}t          j        g dg dg dg dg dg          }|                    |          }t          ||           dgdgdgdgdgg}|
                    |          }t          ||           dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.rS   r   rT   r#  r   rd   r   r   rI   r   r   rT   rS   Fr(   r  r  r-  r  r.  r  Nr  r  s          r#   *test_ohe_infrequent_three_levels_user_catsr  v  s^   
 h
cURZ	3%"*	,uqy	89    (((),	  
 
c'll  s1S#J<@@@ecUSEC53%0FxIIIyyy)))YYYOPPHmmF##GHg&&&
 
				L !!'**E|U+++++r%   c                      t           j        g dg df         } t          ddd          }|                    |            ddgddgg}|                    |          }t          |g d	g d
g           dS )zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   	r   r   r   r   r   r   r   r   r   r   r   F)r  r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r   r1   r5   r	   )r   r   rx  r   s       r#   test_ohe_infrequent_mixedr    s     	)))+F+F+FFGA
q{%
P
P
PCGGAJJJ!fq!fFmmF##G GlllLLL9:::::r%   c            
         t           j        g dg dg df         } t          ddd          }|                    |                                           }t          |j        d         d	d
g           t          |j        d	         d	dg           t          |j        d
         d           |                                }t          g d|           g dg dg dg dg dg dg dg dg dg	}t          ||           g dg dg}|	                    |          }g dg dg}t          ||                                           |
                    |          }t          j        g dg dgt                    }t          ||           t          ddd                              |           }t          j        t           d          5  |	                    |           ddd           n# 1 swxY w Y   g d g d!g}|	                    |          }g d"g dg}t          ||                                           |
                    |          }t          j        g d#g d$gt                    }t          ||           dS )%z?Test infrequent categories with feature matrix with 3 features.r  )	r   r   r   r   r   rd   r   r   r   )	r   r   r   r   r   r   r   r   r   rK   r   r(   rL   r  r&   r   r   r   rd   N)x0_0x0_3r  x1_0x1_5x1_infrequent_sklearnx2_0x2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r+   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r  N)r  r   NrI   r,   r.   r/   )r   r   r   )r   rd   r   )r   r   r   r   r   r   r   r   )r  r  r   )r   r  r   )r   r  r   r   r   r
   r  r   r	   r5   r   r   r   r1   r2   r3   r4   )	r   r   r   r   r(  rx  X_test_transr  r  s	            r#   'test_ohe_infrequent_multiple_categoriesr    sj    	###$$$###	%	A !<Q  C ""**,,Gs1!4q!f===s1!4q"g>>>s1!4d;;;
 --//M		
 		
 		
 	   	!                          
H Hg&&&ii#F==((L )((*B*B*BCHHl2244555!!,//E8	(	(	(*I*I*IJRX  L |U+++ !G  	c!ff  
z)C	D	D	D  f               ii$F==((L(((*B*B*BCHHl2244555!!,//E8	8	8	8:V:V:VW  L |U+++++s   
G,,G03G0c            
      B   t          j        d          } |                     g dg ddddg          }t          dd	d
          }|                    |                                          }t          |j        d         ddg           t          |j        d         g d           g dg dg dg dg dg dg dg dg dg	}t          ||           |                     ddgddgdddg          }g dg dg}|	                    |          }t          ||                                           |
                    |          }t          j        ddgddggt                    }t          ||           |                     ddgddgdddg          }|	                    |                                          }g dg dg}t          ||           |
                    |          }t          j        ddgddggt                    }t          ||           dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rR   	rS   fr   r  r  rS   r   rT   rT   	r   r   r   rd   rd   rg   r   r   r   )strrN  r  rN  columnsrK   r   r(   r  r   rS   rT   r   r   r   rg   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r     rg   r  rI   r   r   N)r2   rZ   r[   r   r   r   r
   r  r	   r5   r   r   r   r   )	r\   r   r   r   r(  rx  r  r  r  s	            r#   .test_ohe_infrequent_multiple_categories_dtypesr    s    
	X	&	&B
@@@111	
 	
  	 	 	A !<Q  C ""**,,Gs1!4sCjAAAs1!4jjjAAA 	
H Hg&&&\\3*b"X>>PU\WWF"""$6$6$67H==((LHl2244555!!,//E8
 4	5=Q7RS  L |U+++ \\3*b!W==u~\VVF==((0022L"""$6$6$67HHl+++!!,//E8
#	$';Q&?@  L |U+++++r%   ri   )r  r  c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          dd	d
d| }|                    |           |                    dgg          }t          |dgg           dS ),All user provided categories are infrequent.rS   r   rT   r#  r   rd   r   r   r(   Fr  r   Nr   )r   r   rN   r   r1   r5   r	   r  r  r   r   s       r#   $test_ohe_infrequent_one_level_errorsr  H  s     h	SEBJ.#;seaiGHIIKG
 ,E EK C GGGmmcUG$$GGqcU#####r%   c                     t          j        dgdz  gt                    j        }t	          dg dgddd|                     |          }|                    dgdgg          }t          |d	gd	gg           d
S )r  r  r   rI   r  Fr(   r  rS   r   Nr   )r   r   r   rN   r   r1   r5   r	   r  s       r#   5test_ohe_infrequent_user_cats_unknown_training_errorsr  V  s     h	{&1113G
 (((),  	 
 
c'll  mmcUSEN++GGqcA3Z(((((r%   zinput_dtype, category_dtype)OOOUUOUUSOSUSS
array_type)r   r   	dataframec                 :   t          j        dgdgg|           }t          j        ddg|          g}t          |d                              |          }t	          dgdgdgdgg||           }|                    |          }t          j        ddgddgddgddgg          }t          ||           t          |                              |          }	|	                    |          }t          j        dgdgdgdgg          }t          ||           d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    rT   rS   rI   Fr   r   r   r   N)	r   r   r   r1   r   r5   r	   r   r
   )
rG   category_dtyper  r   rL   r   rx  r   r(  oes
             r#   test_encoders_string_categoriesr  g  s7    	3%#{333A(C:^<<<=J
:U
C
C
C
G
G
J
JC
use$j  F mmF##Gx!Q!Q!Q!Q899HGX&&&	:	.	.	.	2	21	5	5Bll6""Gx!qcA3,--Hw)))))r%   c                  H   t          j        dgdggd          } t          j        ddgd          g}t          |d          }t          j        d          }t          j        t          |	          5  |                    |            d
d
d
           d
S # 1 swxY w Y   d
S )zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    rT   rS   UrI   SFr   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r/   N)	r   r   r   r   r   r2   r3   r4   r1   )r   rL   r   r   s       r#   $test_mixed_string_bytes_categoricalsr    s     	3%#s+++A(C:S1112J
:U
C
C
CC
)	' C
 
z	-	-	-  


                 s   4BBBc                     t          j        dd| d| ggt                    j        }t	          dd                              |          }|                                }t          |ddd	|  g           d S )
NrS   rT   rI   Fr'   r   r&   x0_ar  x0_)r   r   r   rN   r   r1   r   r
   )r|  r   r   namess       r#   )test_ohe_missing_values_get_feature_namesr    s     	3]C?@OOOQA
eH
E
E
E
I
I!
L
LC%%''Euvv/D]/D/DEFFFFFr%   c            	      6   t          j        d          } |                     g dt          j        dddt          j        gt                    ddd	g
          }t          j        g dg dg dg dg          }t          |          }t          ||           d S )NrR   )dogr   Nr   r   r   r+   rI   )col1col2r
  r  r  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r2   rZ   r[   r   r   r   floatr   r	   )r\   dfexpected_df_transr   s       r#   %test_ohe_missing_value_support_pandasr    s    		X	&	&B	///HaArv.e<<<	
 	
   
 
 
B !!!!!!!!!!!!		
  #2
&
&CC*+++++r%   pd_nan_typepd.NAznp.nanc           
      @   t          j        d          }| dk    r|j        nt          j        }|                    d|                    dd|ddgd          i          }t          j        g d	g d
g dg dg d
g          }t          d|          }|	                    |          }t          ||           t          |j                  dk    sJ t          |j        d         d d         g d           t          j        |j        d         d                   sJ d S )NrR   r  r
  r   rS   rT   r   rI   )r   r   r   r   )r   r   r   r   )r   r   r   r   r  Fr  r   r   rA   r   )r2   rZ   NAr   r   r[   r   r   r   r   r	   lenr   r
   isnan)r  r&   r\   pd_missing_valuer  r  r   df_transs           r#   1test_ohe_missing_value_support_pandas_categoricalr    s@    
	X	&	&B +w 6 6ruuBF	BIIsC)93DJIWW	

 
B
 LLLLLLLLLL	
  eN
K
K
KC  $$H%x000s1$$$$sq)#2#.@@@8COA&r*+++++++r%   c                 2   ddgddgddgg}t          dd|           }|                    |          }t          j        g d	g d
g dg          }t	          ||           ddgg}t          j        g d	g          }d}t          j        t          |          5  |                    |          }ddd           n# 1 swxY w Y   t	          ||           |	                    |          }t          |t          j        ddggt                               dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.rS   r   rT   r   r   r   Fr   r   r&   r   r   )r   r   r   r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr/   NrI   r   r   r   r   r	   r2   rv  rw  r5   r   r
   r   r&   r   r   r   rO   rx  ry  r  s           r#   /test_ohe_drop_first_handle_unknown_ignore_warnsr    s    qC8c1X&A
E.  C ""GIIIIII	
 J GZ((( AhZF999+&&J	 
 
k	2	2	2 ( (--''( ( ( ( ( ( ( ( ( ( ( ( ( ( (GZ((( !!*--Eubhaz@@@AAAAA   B;;B?B?c                 2   ddgddgddgg}t          dd|           }|                    |          }t          j        g d	g d
g dg          }t	          ||           ddgg}t          j        g dg          }d}t          j        t          |          5  |                    |          }ddd           n# 1 swxY w Y   t	          ||           |	                    |          }t          |t          j        ddggt                               dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.rS   r   rT   r   r   r   Fr  r  r   rX   r   r   )r   r   r   r   r  r/   NrI   r  r  s           r#   3test_ohe_drop_if_binary_handle_unknown_ignore_warnsr!    s    qC8c1X&A
n  C ""GLLLLLL	
 J GZ((( AhZF<<<.))J	 
 
k	2	2	2 ( (--''( ( ( ( ( ( ( ( ( ( ( ( ( ( (GZ((( !!*--Eubhd}FCCCDDDDDr  c                 d   ddgddgddgg}t          dd| ddgddgg          }|                    |           d	dgg}t          j        ddgg          }d
}t	          j        t          |          5  |                    |          }ddd           n# 1 swxY w Y   t          ||           dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.rS   r   rT   r   r   r   Frs  r   rt  r/   Nru  )r&   r   r   rx  rO   ry  r   s          r#   'test_ohe_drop_first_explicit_categoriesr#  &  s   
 qC8c1X&A
%#JA'	  C GGAJJJAhZFAq6(##J	A  
k	2	2	2 ( (--''( ( ( ( ( ( ( ( ( ( ( ( ( ( (GZ(((((r{  c                  
   t          j        d          } |                     g dg ddddg          }t          d	          }|                    d
           d}t          j        t          |          5  |                    |           ddd           n# 1 swxY w Y   |                    |           t          j        t          |          5  |	                    |           ddd           dS # 1 swxY w Y   dS )zJRaise informative error message when pandas output and sparse_output=True.rR   r  )r  rT   rT   )rS   rT   rS   rT   r  Tr   r5   zxPandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas outputr/   N)
r2   rZ   r[   r   
set_outputr3   r4   r   r1   r5   )r\   r  r   r   s       r#   'test_ohe_more_informative_error_messager'  A  s   		X	&	&B	IIIOOO<<sCj	Q	QB
d
+
+
+CNNXN&&&	S  
z	-	-	-  "               GGBKKK	z	-	-	-  b                 s$   8BB!BC88C<?C<c                  :   t          j        t           j        dddgg          j        } t	          t           j                  }dt           j         }t          j        t          |          5  |	                    |            ddd           dS # 1 swxY w Y   dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   rI   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r/   N)
r   r   r   rN   r   int32r2   r3   r4   r1   )r   r  r   s      r#   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtyper*  U  s     	263S)*++-A	bh	'	'	'B	;02	; 	;  
z	-	-	-  
q			                 r  encoded_missing_valuer8  c                    t          j        t           j        dddggt           j                  j        }t          |                               |          }t          |j                  dk    sJ t          |j        d         ddt           j        g           |
                    |          }t          || gdgdgdgg           |                    |          }t          ||           dS )	z.Test ordinal encoder with nan on float dtypes.r   r   rI   r+  r   r   r   N)r   r   r   r/  rN   r   r1   r  r   r	   r5   r   )r+  r   r  r   r  s        r#   5test_ordinal_encoder_passthrough_missing_values_floatr.  c  s     	263S)*"*===?A	.C	D	D	D	H	H	K	KBr~!####BN1%S"&'9:::ll1ooGG45usecUKLLL$$W--IIq!!!!!r%   c           
         t          j        d          }| dk    r|j        nt          j        }|                    d|                    dd|ddgd          i          }t          |	                              |          }t          |j
                  d
k    sJ t          |j
        d         dd         g d           t          j        |j
        d         d                   sJ |                    |          }t          |dgdg|gdgdgg           |                    |          }|j        dk    sJ t          |dddf         ddg           t          |dddf         ddg           t          j        |d                   sJ dS )z0Check ordinal encoder is compatible with pandas.rR   r  r
  r   rS   rT   r   rI   r-  r   r   Nr   r   rA          @r   r   )r   r   r   r   )r2   rZ   r  r   r   r[   r   r   r1   r  r   r
   r  r5   r	   r   r   )r  r+  r\   r  r  r  r  r  s           r#   =test_ordinal_encoder_missing_value_support_pandas_categoricalr1  u  s    
	X	&	&B +w 6 6ruuBF	BIIsC)93DJIWW	

 
B 
.C	D	D	D	H	H	L	LBr~!####r~a(!,ooo>>>8BN1%b)*****||BHHuse.C-DsecUSTTT$$X..I?f$$$$y!Q'#s444yQ'#s4448IdO$$$$$$$r%   r0  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                    t          |          }t          j        dgt          j        gg          }t	          |                    |           |           |j        d         j        |k    sJ t          |          }t          j	        t          d          5  |                    |           ddd           dS # 1 swxY w Y   dS )z.Test ordinal encoder for specified categories.r   r   r   r.   r/   N)r   r   r   r   r
   r   r   rJ   r2   r3   r4   r1   )r   r8   r  r   r  r   s         r#   =test_ordinal_encoder_specified_categories_missing_passthroughr3    s   L 
4	(	(	(B
(SEBF8$
%
%Cr''**C000 >!"i//// 
4	(	(	(B	z)C	D	D	D  
r


                 s   B<<C C c                 2   t          j        g dt                    g} | |          }t          j        ddggt                    j        }t	          j        t          d          5  |                    |           ddd           dS # 1 swxY w Y   dS )	zTest encoder for specified categories have duplicate values.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    )rS   rT   rS   rI   r   rS   rT   z5the predefined categories contain duplicate elements.r/   N)r   r   r   rN   r2   r3   r4   r1   r  s       r#   +test_encoder_duplicate_specified_categoriesr5    s     H___F3334D
'T
"
"
"C
3*V,,,.A	Q
 
 
   	


                 s   )BBBzX, expected_X_trans, X_testr   r   )r   r   r   )r   r0  r   r   )r   rS   rT   )r0  r   r   c                     t          dd          }|                    |           }t          ||           t          |                    |          dgg           dS )z>Test the interaction between missing values and handle_unknownr7  rA   r9  g      N)r   r   r	   r5   )r   expected_X_transrx  r  r   s        r#   /test_ordinal_encoder_handle_missing_and_unknownr8    sa    8 
':"	M	M	MBq!!GG-...BLL((D6(33333r%   csr_containerc                 b   t          j        g dg dg          } | |          }t                      }d}t          j        t
          |          5  |                    |           ddd           n# 1 swxY w Y   t          j        t
          |          5  |                    |           ddd           n# 1 swxY w Y   |                    |          } | |          }t          j        t
          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z2Sparse data was passed, but dense data is requiredr/   N)	r   r   r   r2   r3   r   r1   r   r   )r9  r   X_sparseencoderr   r   r!   s          r#   test_ordinal_encoder_sparser=    s    	)))YYY'((A}QHGBG	y	0	0	0  H              	y	0	0	0 ( (h'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ##A&&G"]7++N	y	0	0	0 2 2!!.1112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2s6   A33A7:A7B;;B?B?D$$D(+D(c                  h   t          j        g d          ddt           j        f         } t          g dgdd          }|                    |            t          g dgd          }t          j        t          d	
          5  |                    |            ddd           dS # 1 swxY w Y   dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)rA   r   r   r7  rE  )rL   r&   r:  r,   r  r.   r/   )r   r   newaxisr   r1   r2   r3   r4   )r   r  s     r#   -test_ordinal_encoder_fit_with_unseen_categoryr@    s     	###$$QQQ
]3A	JJ<0CSW
 
 
B FF1III	JJJ<	H	H	HB	z)C	D	D	D  
q			                 s   B''B+.B+r  AAOr  rx  c                     t          dd          }|                    |            |                    |          }t          |ddgg           dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r7  ir9  r   N)r   r1   r5   r	   )r  rx  r   r   s       r#   1test_ordinal_encoder_handle_unknown_string_dtypesrD  &  sV    * (;2
N
N
NCGGGmmF##GGr1gY'''''r%   c                  R   t          j        g d                              dd          } t                                          |           }t          |j        t          j        | d          j                   |	                    |           }t          |dgdgdgdgg           dS )	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6krA   r   r   )axisr   r   N)
r   r   rD   r   r1   r
   r   sortrN   r5   )r   r<  r   s      r#   #test_ordinal_encoder_python_integerrH  B  s     		
 	
 	
	 	 gb!nn  ""1%%Gw*BGAA,>,>,>,@AAA""Gw!qcA3 455555r%   c                      t          j        d          } g d}|                     g dg|          }t                                          |          }|                                }t          ||           dS )z-Check feature names out is same as the input.rR   )rT   r   rS   r  r  N)r2   rZ   r[   r   r1   r   r
   )r\   r  r   r   feature_names_outs        r#   .test_ordinal_encoder_features_names_out_pandasrK  V  sx    		X	&	&BOOE
iii[%00A




q
!
!C1133u/00000r%   c                  V   t          j        dgdgt           j        ggt                    } t	          dt           j        d                              |           }|                    |           }t          |dgdgdgg           t          j        d	gt           j        ggt                    }|                    |          }t          |t           j        gdgg           |                    |          }|d         d         J t          j	        |d         d                   sJ d
S )zECheck interactions between encode_unknown and missing value encoding.rS   rT   rI   r7  r&   r:  r+  r   r   r   N)
r   r   r   r   r   r1   r5   r	   r   r  )r   r  r   rx  r  X_roundtrips         r#   0test_ordinal_encoder_unknown_missing_interactionrP  b  s$    	3%#)888A	*f 
 
 
 
c!ff	  ll1ooGGqcA3-... Xurvh'v666F<<''LLBF8bT"2333 &&|44K q>!$$$ 8KN1%&&&&&&&r%   with_pandasc                    t          j        ddgddgdt           j        ggt                    }d}| r3t	          j        d          }|                    |d	d
g          }|dz   }n|dz   }t          d          }t	          j        t          |          5  |
                    |           ddd           dS # 1 swxY w Y   dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.rS   r	  rT   r   r   rI   zTencoded_missing_value \(1\) is already used to encode a known category in features: rR   letterpetr  z	\['pet'\]z\[1\]r   r-  r/   N)r   r   r   r   r2   rZ   r[   r   r3   r4   r1   )rQ  r   	error_msgr\   r  s        r#   0test_ordinal_encoder_encoded_missing_value_errorrV    s    	3,esBFm<FKKKA
	 
  ) **LLXu$5L66,		(		a	0	0	0B	z	3	3	3  
q			                 s   B99B= B=z4X_train, X_test_trans_expected, X_roundtrip_expected1c                    t          dt          j        t          j                                      |           }t          j        dgt          j        gdgg          }|                    |          }t          ||           |                    |          }|j        d         }t          |          D ]K}||df         }	||df         }
|	|
J t          |	          rt          j        |
          sJ C|
|	k    sJ LdS )znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r7  rN  rW  rT   r   N)r   r   r   r1   r   r5   r	   r   r   rd  r   r  )r  X_test_trans_expectedX_roundtrip_expectedr  rx  r  rO  	n_samplesr]  expected_valvals              r#   9test_ordinal_encoder_unknown_missing_interaction_both_nanr^    s"   4 
*f f
 
 
 
c'll	  Xurvh.//F<<''L L"7888&&|44K$*1-I9 	' 	'+AqD1!Q$;;;;<(( 	'8C==    ,&&&&&	' 	'r%   c                     t          j        d          } |                     ddgddgd          }t                      }|                    d           d}t          j        t          |	          5  |                    |           d
d
d
           n# 1 swxY w Y   t          d                              d          }t          d                              d          }|                    |          }|                    |          }t          |	                                |           t          |                                |j                   d
S )z*Check OneHotEncoder works with set_output.rR   rS   rT   r   r   rU   r%  zCPandas output does not support sparse data. Set sparse_output=Falser/   NFr   default)r2   rZ   r[   r   r&  r3   r4   r   r	   to_numpyr
   r   r  )r\   r]   r   r0   ohe_default
ohe_pandas	X_defaultX_pandass           r#   test_one_hot_encoder_set_outputrf    s~   		X	&	&B<<sCj1v6677D
//CNNXN&&&QE	z	/	/	/    $                               e444??)?TTKU333>>>RRJ))$//I''--HH%%''333z77998;KLLLLLs   2BBBc                     t          j        d          } |                     ddgddgd          }t                                          d          }t                                          d          }|                    |          }|                    |          }t          |                                |           t          |	                                |j
                   d	S )
z+Check OrdinalEncoder works with set_output.rR   rS   rT   r   r   rU   r`  r%  N)r2   rZ   r[   r   r&  r   r	   ra  r
   r   r  )r\   r]   ord_default
ord_pandasrd  re  s         r#   test_ordinal_set_outputrj    s    		X	&	&B<<sCj1v6677D ""--	-BBK!!,,x,@@J))$//I''--HH%%''333z77998;KLLLLLr%   c                  8   g dddgg} t          |           }|                    ddgg           t          |           t          |j                  k    sJ t	          |j                  D ]-\  }}|j        t          k    sJ t          | |         |           .dS )zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    )asmmaseasrasacsrW  2r   rl  N)r   r1   r  r   	enumeraterJ   r   r
   )rL   r   nr   s       r#    test_predefined_categories_dtypert    s    
 655SzBJ
:
.
.
.CGGdC[Mz??c#/222222CO,, / /3yF"""":a=#..../ /r%   c                     t          j        dgdgt           j        ggt                    } t	          d                              |           }t          |dgdgdgg           t	          dd	                              |           }t          j        d
gg          }|                    |          }t          |dgg           dS )zBCheck missing value or unknown encoding can equal the cardinality.r	  r   rI   r   r-  r   r   r7  r9  snakeN)	r   r   r   r   r   r   r	   r1   r5   )r   r   r   rx  s       r#   1test_ordinal_encoder_missing_unknown_encoding_maxrw    s    
5'E7RVH-V<<<A1555CCAFFGGqcA3_---
(;1
M
M
M
Q
QRS
T
TCXyk""FmmF##GGqcU#####r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt                    j        } t	          dd	d
                              |           }t          |                                g d           |j        d         |j	        d                  dk    sJ t          j        dgdz  dgdz  z   dgdz  z   gt                    j        } t	          dd	d                              |           }t          |                                dg           |j        d         |j	        d                  dk    sJ t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt                    j        } t	          dd	dg                              |           }t          |                                g d           |j        d         |j	        d                  dk    sJ t	          dd	d                              |           }t          |                                g d           |j	        J dS )zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    rS   r   rT   r+   r   r   r  rI   Fr   )r  r   r   )r  x0_dx0_er  r   rd   r   r  )r  r  rz  r  N)r  r  ry  rz  r  )
r   r   r   rN   r   r1   r
   r   r   r'  )r   r   s     r#   #test_drop_idx_infrequent_categoriesr{    s   
 	
cUQY	#	*cUQY	6#	BC6	 	 	  au7
K
K
K
O
OPQ
R
RC!!##%V%V%V   ?1cmA./36666
3%!)seai'3%"*45VDDDFA
au;
O
O
O
S
STU
V
VCs00225L4MNNN?1cmA./36666

cUQY	#	*cUQY	6#	BC6	 	 	  auC5
I
I
I
M
Ma
P
PC!!##%V%V%V   ?1cmA./36666
au4
H
H
H
L
LQ
O
OC!!##AAA   =     r%   c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          dd	d
d|                     |          }t          |j        g dg           t          |j        ddgg           dgdgdgdgdgg}dgdgdgdgd
gg}|                    |          }t          ||           |
                    |          }dgdgdgdgdgg}t          ||           dS )zGTest parameters for grouping 'a', and 'd' into the infrequent category.rS   r   rT   r#  r   rd   r   r   r7  rA   r9  r  r  r   r   r   r  Nr   )r   r   rN   r   r1   r
   r   r  r5   r	   r   )r  r  ordinalrx  expected_transr   r  expected_inverses           r#   ,test_ordinal_encoder_infrequent_three_levelsr  6  sR    h	SEBJ.#;seaiGHIIKG *" @F 	c'll  w*-A-A-A,BCCCw5c
|DDDecUSEC53%0FcA3aS2$/N''GG^,,,))'22I					 y"233333r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        } t	          g d
gddd                              |           }t          |j        g d
g           t          |j        ddgg           dgdgdgdgdgg}dgdgdgdgdgg}|	                    |          }t          ||           |                    |          }dgdgdgdgdgg}t          ||           dS )zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    rS   r   rT   r#  r   rd   r   r   rI   r  r7  rA   )rL   r  r&   r:  r  r   r   r   r  N)r   r   r   rN   r   r1   r
   r   r  r5   r	   r   )r  r}  rx  r~  r   r  r  s          r#   6test_ordinal_encoder_infrequent_three_levels_user_catsr  ]  sh    h
cURZ	3%"*	,uqy	89    ((()*	  
 
c'll  w*-A-A-A,BCCCw5c
|DDDecUSEC53%0FcA3aS2$/N''GG^,,,))'22I					 y"233333r%   c                     t          j        g dg df          } t          d                              |           }t	          |j        d         ddg           |j        d         J ddgddgg}ddgddgg}|                    |          }t          ||           |                    |          }t          j	        ddgd	dggt          
          }t	          ||           dS )zETest when feature 0 has infrequent categories and feature 1 does not.r  r  r   r  r   r   r   Nr  rI   )r   column_stackr   r1   r
   r  r5   r	   r   r   r   )r   r}  rx  r~  r   r  r  s          r#   %test_ordinal_encoder_infrequent_mixedr    s	    	4446Q6Q6QRSSAA...22155Gw5a81a&AAA)!,444!fq!fF!fq!f%N''GG^,,,))'22Ix!Q*>)B C6RRRy"233333r%   c            	         t          j        d          } |                     g d          }|                     g dg d|                     dgdz  dgdz  z   d	gz   d
gz   |          dg d          }t          d                              |          }t          |j        d         ddg           t          |j        d         g d           t          |j        d         d
d	g           |                     g dg d|                     dgd	gz   d
gz   dgz   |          dg d          }g dg dg dg dg}|	                    |          }t          ||           dS )zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rR   )birdr   r	  rv  r  r  r	  r+   r   r   rv  r  rI   )r  rN  r  r  r  r   rS   rT   r   r  r   )rS   rT   r  r   )rg   r   rd   r   )r   r   r   )r   r   r   )r   r   r   r  N)r2   rZ   CategoricalDtyper[   r   r   r1   r
   r  r5   r	   )r\   categorical_dtyper   r}  rx  r~  r   s          r#   :test_ordinal_encoder_infrequent_multiple_categories_dtypesr    s    
	X	&	&B++,K,K,KLL
@@@11199!ugk)WI5@' %  	
 	
 .-- 	 
	 
	A A...22155G w5a83*EEEw5a8***EEEw5a867:KLLL\\'''!>>997)#vh.%8' %  	
 	
 .--  
 
F  iiIIIyyyAN''GG^,,,,,r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   t           j        gz   gt          	          j        } t          d
ddd                              |           }t          |j        g dg           t          j        dgdgdgdgdgt           j        ggt          	          }dgdgdgdgdgdgg}|	                    |          }t          ||           dS )zJCheck behavior of unknown_value and encoded_missing_value with infrequent.rS   r   rT   r#  r   rd   r   r   rI   r7  r   )r&   r:  r  r+  r  r  r   r   N)r   r   r   r   rN   r   r1   r
   r  r5   r	   )r  r}  rx  r~  r   s        r#   .test_ordinal_encoder_infrequent_custom_mappingr    s   h
cURZ	3%"*	,uqy	8BF8	CDF    *	  
 
c'll  w57HIIIXusecUSEC526(C6RRRFcA3aS1#s3N''GG^,,,,,r%   c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        }t	          di | d
dd                    |          }t	          d
d                              |          }dgdgdgdgdgg}t          |                    |          |                    |                     dS )zMAll categories are considered frequent have same encoding as default encoder.rS   r   rT   r#  r   rd   r   r   rI   r7  rA   r9  r  Nr   r   r   r   rN   r   r1   r	   r5   )r  r  adjusted_encoderdefault_encoderrx  s        r#   !test_ordinal_encoder_all_frequentr    s    h
cURZ	3%"*	,uqy	89    &  
!4B   	c'll  %*"  	c'll  ecUSEC53%0F""6**O,E,Ef,M,M    r%   d   c                 4   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        }t	          di | d
dd                    |          }dgdgdgdgdgg}t          |                    |          dgdgdgdgdgg           dS )zAWhen all categories are infrequent, they are all encoded as zero.rS   r   rT   r#  r   rd   r   r   rI   r7  rA   r9  r  r   Nr   r  )r  r  r<  rx  s       r#   #test_ordinal_encoder_all_infrequentr    s     h
cURZ	3%"*	,uqy	89      
!4B   	c'll  ecUSEC53%0FG%%f--aS1#sRD/IJJJJJr%   c                     t          j        t           j        gdz  dgdz  z   dgdz  z   dgz   dgz   gt                    j        } t          d	
                              |           }t          j        dddt           j        ggt                    j        }|                    |          }t          |dgdgdgt           j        gg           dS )z5Check behavior when missing value appears frequently.r#  r	  rd   r   r   rv  deerrI   r   r  r   r   r   N	r   r   r   r   rN   r   r1   r5   r	   r   r}  rx  r   s       r#   -test_ordinal_encoder_missing_appears_frequentr  
	  s    

&B%2	%!	3wi	?6(	JK	 	 	   A...22155GXrv67vFFFHF''GGqcA3bfX677777r%   c            	         t          j        t           j        gdgdz  z   dgdz  z   dgz   dgz   dgdz  d	gdz  z   gt          
          j        } t          d                              |           }t          j        ddgdd	gt           j        d	gdd	gddggt          
          }|                    |          }t          |ddgddgt           j        dgddgddgg           dS )z7Check behavior when missing value appears infrequently.r	  rd   r   r   rv  r  redrH  greenrI   r+   )r  r   r   r   Nr  r  s       r#   /test_ordinal_encoder_missing_appears_infrequentr  	  s    
 	VHw|#ugk1WI=HGaK7)a-'	
 	 	 	   1---11!44GXeWVWGEN	
 	 	 	F ''GGq!fq!frvqkAq6Aq6JKKKKKr%   c                     t          j        dgdgdggt                    } | g dg          }t          j        t
                    5  |                    |           ddd           dS # 1 swxY w Y   dS )a!  Check that we raise a `NotFittedError` by calling transform before fit with
    the encoders.

    One could expect that the passing the `categories` argument to the encoder
    would make it stateless. However, `fit` is making a couple of check, such as the
    position of `np.nan`.
    rV   rW   rh  rI   rg  r   N)r   r   r   r2   r3   r   r5   )r  r   r<  s      r#   test_encoder_not_fittedr  3	  s     	3%#&f555Ag///!2333G	~	&	&  !                 s   
A--A14A1)r   rn  numpyr   r2   scipyr   sklearn.exceptionsr   sklearn.preprocessingr   r   sklearn.utils._missingr   sklearn.utils._testingr   r	   r
   sklearn.utils.fixesr   r$   markparametrizer;   rE   r)  float32r/  rP   r^   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r  r`  str_r   rN   r  r	  r  r  r  r  r   r+  r1  r3  r5  rC  rN  rJ  rL  rO  rT  re  rl  rq  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r#  r'  r*  r.  r1  r3  r5  r8  r=  r@  rD  rH  rK  rP  rV  rM   r^  rf  rj  rt  rw  r{  r  r  r  r  r  r  r  r  r  r  r   r%   r#   <module>r     s   				             - - - - - - ? ? ? ? ? ? ? ? 0 0 0 0 0 0         
 / . . . . .@ @ @. )+T+T+TUU# # VU#. )+T+T+TUU& & VU&$ "(BJ
)KLL2:rz(JKK
; 
; LK ML
; "(BJ
)KLLA A MLA92 92 92xJ J J$ $ $4	9 	9 	9
 
 
 	(+++zzz*++###%6%6%67vFFF///C#78GGG///Cuu#=>fMMM"""OOO4FCCC///C#67vFFF///Cut#<=VLLL			 		 		   .G G/ .G )+T+T+TUUUDM22$11,$ ,$ 21 32 VU,$^ UDM22b'Ar7QG	$yyy)))YYY&GHS\E3<'3%F__ooo?	
	 	' '	 	 32'*7 7 7 !=!=!=>>'C'C'CDD
C 
C ED ?>
C E?#;<<1vxrxc
';';<==  >= =< E?#;<<  =< "+r{	#uenrd%;RZH	Aq6Aq6"	#	#q!fqc]BJ?BHsElS%L1@@@3Z%!J	

 
C<#u.	/	/3*ug1FP	Aq6BFA;'	(	(Arv;*<bjIBHsBFmdBF^4FCCC4[26(#J	
 BHsEE%LL)D%%,,+?@OOO4[55<<.)J	
*	 	 	/     B7 7C   B7" )+T+T+TUU BHsCj\0002BHsCj\0002__J		
 BHq!fXW---/BHq!fXW---/YYKH		
 BHsCj\0002BHsCj\0002RXooo&&'J		
 BHtSk]&1113BHtSk]&1113		
 BHsCj\0002BHsBFm_F3335__		
 BHsDk]&1113BHsBFm_F3335		
?%L	 	 	Q  0 0bA Ac0 0 VUdA(  $ ]N$CDD
 
 ED

? 
? 
?7 7 7 	66"#	.../	()
 	&%%   6 6 6& & &0 	(+++{{{+,,###%6%6%67vFFF
 	'&&   2 2 2  BHsCj\0002BHsCj\0002__J		
 BHq!fXW---/BHq!fXW---/YYKH		
 BHsCj\0002BHsCj\0002RXooo&&'J		
( 	322-   0 1 0"$ $ $- - - 5#,//- - 0/-6 6 6	 	 	  8 8 868 8 8,   +w!788) ) 98)2 264u*FGG!1 !1 HG!1H 5!*.A.A.A!BCCB B DCB T5M'7JKK'===!9?RSS. . TS LK.$ ]N$CDD? ? ED? 	1	"	$q11r22	 	 1E1E1E0F'GHHI I IH	 	I6 +w!>??C C @?C. 3%#00  10" 	1	!	!	$	$q11q11 Q Q Q< 'C5!122/ / 32/. 3%#00  10  0 !a88?A:NO ; ; ;2, , ,6!, !, !,H; ; ;$X, X, X,v>, >, >,B bA$N$N#OPP
$ 
$ QP
$ a1$M$M#NOO) ) PO)  !#M#M#M  'E'E'EFF* * GF *6  * 264.99G G :9G, , ,. )+T+T+TUU((;<<, , =< VU,< )+T+T+TUU"B "B VU"BJ )+T+T+TUU!E !E VU!EH )+T+T+TUU) ) VU)4  (   0262,??" " @?"" ((;<<0262,??% % @? =<%>  3-77793*V44463RV,F;;;<
	 3-77793*V44463RV,F;;;<
	 3-
;;;=3%
33353RV,--.
	%4	 	 	9  ! !D E! !D$ ]N$CDD  ED ! BHsBFC()**,BHsBFC()**,BHseW	
 BHooo&'')BHooo&'')BHrvhZ  	
 BHsBFC()888:BHsBFC()**,BHseWF+++	
 BHooo&f5557BHooo&'')BHrvhZv...	
! 24 43 24 .992 2 :92,  " 
4+c***4+c***  
s3*S)))3*S))) 	( 	(   	(6 6 6(	1 	1 	1' ' '< u66  762 :
 BHsecU^6222S26(RVH%BJvv.f===	
 BHrvhu-V<<<S26(RVH%BJx"&2&AAA	
 &' '' &'BM M M.M M M / / /"	$ 	$ 	$!! !! !!H 	1	!	!	$	$q11q11 4 4 46!4 !4 !4H4 4 4*-- -- --`- - -* 	1	!   ( 	1	# 
K 
K 
K
8 
8 
8L L L8 ]N$CDD  ED  r%   