
    0Ph                     @   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 dZ1dZ2e1e2z   Z3d Z4d Z5d Z6d Z7d Z8d Z9ej:        ;                    deef          d             Z<d Z=d Z>d Z?d Z@d ZAd ZBd  ZCd! ZDd" ZEd# ZFd$ ZGd% ZHd& ZId' ZJd( ZKd) ZLd* ZMej:        N                    e.d+,          d-             ZOd. ZPd/ ZQd0 ZRd1 ZSd2 ZTej:        ;                    deef          d3             ZUd4 ZVd5 ZWd6 ZXd7 ZYd8 ZZej:        ;                    deef          d9             Z[d: Z\d; Z]d< Z^d= Z_d> Z`d? Zaej:        ;                    d@ejb        ejc        ejd        g          dA             ZedB ZfdC ZgdD ZhdE ZidF ZjdG ZkdH ZldI ZmdJ ZndK ZodL Zpej:        ;                    deeef          dM             Zqej:        ;                    dNejr        ejs        g          dO             Ztej:        ;                    dP ee/e0                    dQ             Zuej:        ;                    dRejv        ejs        dSfejw        ejs        dSfejr        ejr        dTfejs        ejs        dTfg          dU             Zxej:        ;                    dV edWX           edWX           edWX          g          dY             ZydZ Zzd[ Z{e,ej:        ;                    d\e0          d]                         Z|ej:        ;                    d^eeeg          d_             Z}ej:        ;                    d^eeeg          ej:        ;                    d`dae~dbfdceddfg          de                         Zej:        ;                    d^ee ej        e          g          ej:        ;                    dfdg dh g          ej:        ;                    didcdag          dj                                     Zej:        ;                    d^eeeg          dk             Zej:        ;                    deeeg          ej:        ;                    dldmdngdddoddpdqdrdsf	ddt ddoddpdudrdsf	ddv ddodwdxdydudzf	ddd{ dodwd| d}drd~f	dddddd ddrd~f	dg          d                         Zej:        ;                    deddddddgfee1ff          d             Zd Zd Zej:        ;                    d^eeeeg          d             Zej:        ;                    d\e0          d             Zej:        ;                    dejr        ejs        g          d             ZdS )    N)defaultdict)Mapping)partial)StringIO)product)assert_array_almost_equalassert_array_equal)sparse)clone)ENGLISH_STOP_WORDSCountVectorizerHashingVectorizerTfidfTransformerTfidfVectorizerstrip_accents_asciistrip_accents_unicode
strip_tags)GridSearchCVcross_val_scoretrain_test_split)Pipeline)	LinearSVC)assert_allclose_dense_sparseassert_almost_equalskip_if_32bit)_IS_WASMCSC_CONTAINERSCSR_CONTAINERS)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 D    t          |                                           S N)r   upperss    j/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercaser%   9   s     ##))+++    c                 .    |                      dd          S )N   ée)replacer"   s    r$   strip_eacuter+   =   s    99T3r&   c                 *    |                                  S r    splitr"   s    r$   split_tokenizer/   A   s    7799r&   c                     dgS )Nthe_ultimate_feature r"   s    r$   lazy_analyzer3   E   s    "##r&   c                  d   d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d	} d
}t          |           |k    sJ d} d}t          |           |k    sJ d} d
}t          |           |k    sJ d S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫ)r   aexpecteds     r$   test_strip_accentsrA   I   s    AH ##x////(A H ##x//// 	AH ##x//// 	AH ##x//// 	AH ##x//// 	#AH ##x//// 	AH ##x//////r&   c                      d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d S )	Nr5   r6   r7   r8   r9   r=   r:   r;   )r   r>   s     r$   test_to_asciirC   m   s     AHq!!X----(A Hq!!X---- 	AHq!!X---- 	AHq!!X------r&   
Vectorizerc                     | d                                           }d}g d} ||          |k    sJ d}g d} ||          |k    sJ  | d                                           }t          d	          }g d
} ||          |k    sJ  | t                                                     }d}g d} ||          |k    sJ  | t          d                                           }d}g d} ||          |k    sJ d S )Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.)
aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.)thisistestreallymetharry	yesterdayfile)input'This is a test with a file-like object!)rT   rU   rV   withr[   likeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.)
AIMANGEDU	KANGOUROUCEMIDIETAITPASTRESBON)	tokenizerrH   )
zj'airK   rL   rM   rN   zmidi,zc'etaitrQ   rR   zbon.)build_analyzerr   r%   r/   )rD   watextr@   s       r$   test_word_analyzer_unigramsrq      st   	'	*	*	*	9	9	;	;BGD  H 2d88x?DLLLH2d88x	&	!	!	!	0	0	2	2B=>>DGGGH2d88x 
	+	+	+	:	:	<	<BHD  H 2d88x 
nG	D	D	D	S	S	U	UBGD  H 2d88xr&   c                  |    t          ddd                                          } d}g d} | |          |k    sJ d S )Nwordunicode      analyzerrH   ngram_rangerI   )rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rn   )ro   rp   r@   s      r$   'test_word_analyzer_unigrams_and_bigramsr{      se    	yf
 
 
n  HD  H* 2d88xr&   c                     d} |                      d          }t          dd                                          }t          j        t
                    5   ||           d d d            n# 1 swxY w Y   t          ddd                                          }t          j        t
                    5   ||           d d d            d S # 1 swxY w Y   d S )	NrI   zutf-8ru   rF   )rz   encodingchar      )ry   rz   r}   )encoder   rn   pytestraisesUnicodeDecodeError)rp   
text_bytesro   cas       r$   test_unicode_decode_errorr      s]    HDW%%J 
Vg	>	>	>	M	M	O	OB	)	*	*  
:               
Vg
 
 
n  
)	*	*  
:                 s$   A,,A03A04CCCc                     t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ d
}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          ddd                                          } t          d          }g d} | |          d d         |k    sJ d S )Nr~   rt   r   rx   u9   J'ai mangé du kangourou  ce midi, c'était pas très bon)zj'az'aizai zi mz ma   )zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterday)thihisis zs iz is)z yesteyesteresterdsterdaterdayr[   r\   ry   rz   r]   r   rn   r   cngarp   r@   s      r$   test_char_ngram_analyzerr      sS   yf  n 	 GD222H4::bqb>X%%%%AAAH4::bcc?h&&&&BD222H4::bqb>X%%%%AAAH4::bcc?h&&&&v6  n 	 =>>D222H4::bqb>X%%%%%%r&   c                  f   t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          d
dd                                          } t          d          }g d} | |          d d         |k    sJ d S )Nchar_wbrt   r   rx   r   )z thr   r   r   z thir   )r   r   r   r   zerday r   r[   r   zA test with a file-like object!)z a z tetesestzst z tesr   r   r   s      r$   test_char_wb_ngram_analyzerr     s    )  n 	 CD333H4::bqb>X%%%%AAAH4::bcc?h&&&&yf  n 	 566D:::H4::bqb>X%%%%%%r&   c                  `   t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          d
dd                                          }t          |          } ||           | |          k    sJ d S )Nrs   rt   r   rx   r   )zthis is testzis test reallyztest really metr   )ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayr[   r   r   )r   rp   r@   	cnga_filer[   s        r$   test_word_ngram_analyzerr     s    yf  n 	 CDDDDH4::bqb>X%%%%  H
 4::bcc?h&&&&v6  n  D>>D9T??dd4jj((((((r&   c                     ddd} t          |                                           }t          t          t          t          t          t                    fD ]} ||           }t          |          }|	                    t                     t          |t                    r|j        | k    sJ nt          |j                  |k    sJ |                    t                    }|j        d         t!          |          k    sJ  ||           }t          |          }|                    |          }t!          |          |j        d         k    sJ d S )Nr   rv   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_	transformshapeleninverse_transform)vocabtermstypvvectXinvs          r$   &test_countvectorizer_custom_vocabularyr   6  sD   ##E

E dD'+s";";< & &CJJ!,,,   a!! 	2#u,,,,,t'((E1111NN>**wqzSZZ''''CJJ!,,,$$Q''3xx171:%%%%%& &r&   c                  D   ddg} t          dt          |           fdt                      fg          }|                    t                    }t          |j        d         j                  t          |           k    sJ |j        d         t          |           k    sJ d S )Nr   r   countr   tfidfrv   )
r   r   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r   r   )what_we_likepiper   s      r$   /test_countvectorizer_custom_vocabulary_pipeliner   K  s    V$Lo>>>?&(()	
 D 	=))At(455\9J9JJJJJ71:\********r&   c                      ddd} d}t          j        t          |          5  t          |           }|                    dg           d d d            d S # 1 swxY w Y   d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   pasta_sizilianar   r   
ValueErrorr   r   )r   msgr   s      r$   7test_countvectorizer_custom_vocabulary_repeated_indicesr   X  s    ##E
0C	z	-	-	- & &%000#$%%%& & & & & & & & & & & & & & & & & &s   'AAAc                      ddd} t          j        t          d          5  t          |           }|                    dg           d d d            d S # 1 swxY w Y   d S )Nrv   rw   r   zdoesn't contain indexr   r   pasta_verdurar   r   r   s     r$   0test_countvectorizer_custom_vocabulary_gap_indexr   `  s    ##E	z)@	A	A	A $ $%000/"###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   'AAAc                  z   t                      } |                     d           |                                 t          k    sJ |                     d           t	          j        t                    5  |                                  d d d            n# 1 swxY w Y   |                     d           t	          j        t                    5  |                                  d d d            n# 1 swxY w Y   g d}|                     |           |                                 t          |          k    sJ d S )Nenglish
stop_words_bad_str_stop__bad_unicode_stop_)someotherwords)r   
set_paramsget_stop_wordsr   r   r   r   r   )cvstoplists     r$   test_countvectorizer_stop_wordsr   g  s   			BMMYM'''"44444MM-M...	z	"	"  
              MM1M222	z	"	"  
              )))HMMXM&&&#h--//////s$   3BBBC//C36C3c                  p   t          j        t          d          5  t          g           } |                     dg           d d d            n# 1 swxY w Y   t          j        t          d          5  t          dd          }|                    g d           d d d            d S # 1 swxY w Y   d S )	Nzempty vocabularyr   r   foo      ?r   )max_dfr   )zto be or not to bez
and me toozand so do your   )r   r   s     r$   %test_countvectorizer_empty_vocabularyr   v  sL   	z);	<	<	<  "---%               
z);	<	<	< E E39===	CCCDDDE E E E E E E E E E E E E E E E E Es#   'AAA5)B++B/2B/c                      t                      } |                     t          d d                   }|                     t          dd                    }|j        d         |j        d         k    sJ d S )Nr   rv   )r   r   r   r   )r   X1X2s      r$   test_fit_countvectorizer_twicer     sh    			B			-+	,	,B			-+	,	,B8A;"(1+%%%%%%r&   c                      g d} d}t          |          }|                    |            g d}|                                }t          ||           dS )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    z&This is the 1st document in my corpus.z This document is the 2nd sample.zAnd this is the 3rd one.zIs this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_pattern)documentonesampleN)r   r   get_feature_names_outr	   )corpusr   
vectorizerr@   feature_names_outs        r$   )test_countvectorizer_custom_token_patternr     sr    
  F ?M }===JV$$$,,,H"88::((33333r&   c                      g d} d}d}t          |          }t          j        t          |          5  |                    |            ddd           dS # 1 swxY w Y   dS )zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   r   r   r   r   )r   r   err_msgr   s       r$   <test_countvectorizer_custom_token_pattern_with_several_groupr     s    
  F AM<G }===J	z	1	1	1  v                 s   AAAc                  z   g d} d}t          d|           }t          j        t          |          5  |                    |            d d d            n# 1 swxY w Y   t          j                    5  t          j        dt                     |                    |            d d d            d S # 1 swxY w Y   d S )N)SampleUpperCase
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   r   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   s      r$   'test_countvectorizer_uppercase_in_vocabr    sE    ;::J	)  !4JGGGJ	k	1	1	1 # #z"""# # # # # # # # # # # # # # # 
	 	"	" ) )g{333Z((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )s#   AAA30B00B47B4c                      g dg dg dg} t          dd                              |           }g d}|                    |          }t          ||           dS )	z0Check get_feature_names_out for TfidfTransformerrv   rv   rv   rv   rv   r   rv   r   r   Tl2
smooth_idfnorm)r?   cbN)r   r   r   r	   )r   trfeature_names_inr   s       r$   %test_tf_transformer_feature_names_outr    sq    	IIIyyy)A	T	5	5	5	9	9!	<	<B&001ABB'):;;;;;r&   c                     g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ t	          |dz                      d	
          g d           g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ d S )Nr	  r
  r  Tr  r  r   rw   rv   axisr   r   r   )r   r   toarrayallr   sumr   r  r   s      r$   test_tf_idf_smoothingr    s    	IIIyyy)A	T	5	5	5BQ''))EQJ uaxnn!n44oooFFF 
IIIyyy)A	T	5	5	5BQ''))EQJr&   zcno floating point exceptions, see https://github.com/numpy/numpy/pull/21895#issuecomment-1311525881)reasonc                     g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ t	          |dz                      d	
          g d           g dg dg dg} t          dd          }d}t          j        t          |          5  |                    |                                            d d d            d S # 1 swxY w Y   d S )Nr	  r
  r  Fr  r  r   rw   rv   r  r  zdivide by zeror   )	r   r   r  r  r   r  r   r  RuntimeWarning)r   r  r   in_warning_messages       r$   test_tfidf_no_smoothingr"    s]    
IIIyyy)A	U	6	6	6BQ''))EQJ uaxnn!n44oooFFF 
IIIyyy)A	U	6	6	6B)	n,>	?	?	? & &
##%%%& & & & & & & & & & & & & & & & & &s    (C55C9<C9c                  ,   dgdgdgg} t          ddd           }|                    |                                           }|d         dk    sJ |d         |d         k    sJ |d         |d         k    sJ |d         dk     sJ |d         dk     sJ d S )Nrv   rw   r   TF)sublinear_tfuse_idfr  r   )r   r   r  r  s      r$   test_sublinear_tfr&    s    
qcA3A	tU	F	F	FBQ''))E8q====8eAh8eAh8a<<<<8a<<<<<<r&   c                  
   t          t          d d                   } t          d         g}t          t                    dz
  }t          d          }|                    |           }t          |d          r|                                }|d|j        d         f         dk    sJ t          |j        	          }||fD ]}|                    |          }t          |d          r|                                }|j        }|d|d
         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ d|vsJ d|vsJ |d|d         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ t          d          }	|	
                    |                              |                                          }
t          |	j                  t          |j                  k    sJ |
j        |t          |j                  fk    sJ |	                    |                                          }|j        t          |          t          |j                  fk    sJ t          dd          }|
                    |                              |                                          }t          |d          rJ t          d          }t          j        t                     5  |                    |           d d d            n# 1 swxY w Y   t#          t%          j        |d          dg|z             t          t          d d                   } t)          d          }|j        |_        |                    |                                           }|j        rJ t#          |
|           |                    |                                          }t#          ||           t          d 	          }t          j        t                     5  |                    |            d d d            n# 1 swxY w Y   |                    dd           |                                }d}t3          |          } ||          }||k    sJ |                    dd            t          j        t                     5  |                                 d d d            n# 1 swxY w Y   d |_        t          j        t                     5  |                                 d d d            d S # 1 swxY w Y   d S )!Nrv         ?r   tocsrr   r   rw   r   saladtomatowaterthe	copyrightcokeburgerr   l1r  F)r  r%  idf_Tr%  r  r   rF   )rH   r   rI   _gabbledegook_)rH   rb   _invalid_analyzer_type_)r   r   r   r   r   hasattrr+  r   r   r   r   r  r5  r   r   r   r   r   npr  r   r   fixed_vocabulary_r   build_preprocessorr   rn   )
train_data	test_datan_trainv1counts_trainv2r   counts_testr   t1r   
tfidf_testt2tft3tvtfidf2tfidf_test2v3	processorrp   r@   results                          r$   test_vectorizerrO    s~   mCRC())Jr"#I-  1$G 
	$	$	$B##J//L|W%% ,#))++2>'223q8888 
BN	3	3	3B "X 8 8kk),,;(( 	.%++--K]
1j112a77771j223q88881j112a7777 J&&&& *,,,, 1j001Q66661j223q88881j001Q66661j112a77777 
t	$	$	$BFF<  **<88@@BBErw<<3r~......;7C$7$788888 k**2244JIBN0C0CDDDDD 
tU	3	3	3B				'	'	5	5	=	=	?	?Br6""""" 
$	'	'	'B	z	"	" # #
\"""# # # # # # # # # # # # # # # bfRa0003%'/BBB mCRC())J	d	#	#	#B	BIj))1133F####eV,,, ,,y))1133Kj+666 
D	)	)	)B	z	"	" ! !
Z   ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! MM5M999%%''IGD"4((HYt__Fv MM 0tMDDD	z	"	"    
                              .BM	z	"	"  
                 sH   L<<M M Q))Q-0Q-3TTT?U!!U%(U%c                     d\  } }}}t          | |||          }|                    t                     |j        j        | k    sJ |j        j        |k    sJ |j        j        |k    sJ |j        j        |k    sJ d|_        d|_        d|_        d|_        |j        j        | k    sJ |j        j        |k    sJ |j        j        |k    sJ |j        j        |k    sJ |                    t                     |j        j        |j        k    sJ |j        j        |j        k    sJ |j        j        |j        k    sJ |j        j        |j        k    sJ d S )N)r  FFF)r  r%  r  r$  r3  T)r   r   r   _tfidfr  r%  r  r$  )r  r%  r  r$  rI  s        r$   test_tfidf_vectorizer_settersrR  i  s   .G+D':|	7z
 
 
B FF>9>T!!!!9''''9:----9!\1111 BGBJBMBO9>T!!!!9''''9:----9!\1111FF>9>RW$$$$9
****92=00009!R_444444r&   c                     t                      } |                     t                    }|j        }|j        t          t                    | j        fk    sJ |j        | j        k    sJ t          j	        |j
                  dk    sJ t          j	        |j
                  dk     sJ t          j        |j
                  dk    sJ t          j        |j
                  dk     sJ t          |j        d                   D ];}t          t          j                            |d         j
        d          d           <t          dd          } |                     t                    }|j        t          t                    | j        fk    sJ |j        | j        k    sJ |j        }||k    sJ |d|z  k     sJ t          j	        |j
                  dk    sJ t          j        |j
                  dk     sJ t          |j        d                   D ];}t          t          j                            |d         j
        d          d           <d S )	Nr(  r   rv   rw   r   ru   r3  )rz   r  )r   r   r   nnzr   r   
n_featuresdtyper:  mindatamaxranger   linalgr  )r   r   	token_nnzi
ngrams_nnzs        r$   test_hashing_vectorizerr_    s'   A	M""AI7s=))1<888887ag 6!&>>B6!&>>A6!&>>A6!&>>A 171: ? ?BINN1Q49a88#>>>> 	f4888A	M""A7s=))1<888887ag J	!!!!I%%%% 6!&>>B6!&>>A 171: ? ?BINN1Q49a88#>>>>? ?r&   c                  j   t          d          } t          j        t                    5  |                                  d d d            n# 1 swxY w Y   | j        rJ |                     t                    }|j        \  }}t          | j
                  |k    sJ |                                 }t          |t          j                  sJ |j        t          k    sJ t          |          |k    sJ t!          g d|           t#          |          D ]%\  }}|| j
                            |          k    sJ &g d}t          |          } |                                 }t!          g d|           | j        sJ t#          |          D ]%\  }}|| j
                            |          k    sJ &d S )Nr)  r*  	r   r2  celerir1  r   r,  	sparklingr-  r.  r   )r   r   r   r   r   r;  r   r   r   r   r   r   r:  ndarrayrV  r`   r	   	enumerateget)r   r   	n_samplesrU  feature_namesidxnamer   s           r$   test_feature_namesrk    sG   		$	$	$B 
z	"	" # #
  """# # # # # # # # # # # # # # ##### 	''AGIzr~*,,,,,,..MmRZ00000&((((}++++
	
 
	
 
	
 	   }-- / /	Tbn((.......
 
 
E 
E	*	*	*B,,..M
	
 
	
 
	
 	   }-- / /	Tbn((......./ /s   AAAc                     h d} | dd          }|                     t                     t          |j                  |k    sJ d S )N>   r   r   r,  r2  g333333?   )r   max_features)r   r   r   r   )rD   expected_vocabularyr   s      r$   test_vectorizer_max_featuresrp    sY    >>> 3Q777JNN=!!!z%&&*=======r&   c                  N   t          d          } t          d          }t          d           }|                     t                                        d          }|                    t                                        d          }|                    t                                        d          }|                                 }|                                }|                                }d|                                k    sJ d|                                k    sJ d|                                k    sJ d|t          j        |                   k    sJ d|t          j        |                   k    sJ d|t          j        |                   k    sJ d S )Nrv   rn  r   r   r     r/  )r   r   r   r  r   rY  r:  argmax)	cv_1cv_3cv_Nonecounts_1counts_3counts_None
features_1
features_3features_Nones	            r$   "test_count_vectorizer_max_featuresr~    s    ***D***D4000G!!.115515==H!!.115515==H''77;;;CCK++--J++--J1133M !!!!!! Jry2233333Jry2233333M")K"8"89999999r&   c                     g d} t          dd          }|                    |            d|j                                        v sJ t	          |j                                                  dk    sJ d|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d	|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d S )
Nabcdeaeatr~   r   ry   r   r?   r   r)  rm  rv   )r   r   r   r   r   r   r>  r   s     r$   test_vectorizer_max_dfr    sA   %%%IF3777DHHY$"''))))))t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,,,r&   c                     g d} t          dd          }|                    |            d|j                                        v sJ t	          |j                                                  dk    sJ d|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d	|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d S )
Nr  r~   rv   )ry   min_dfr?   r   rw   r  g?)r   r   r   r   r   r  r  s     r$   test_vectorizer_min_dfr  )  sA   %%%IF1555DHHY$"''))))))t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,,,r&   c                  "   ddg} t          dd          }|                    |                                           }t          g d|                                           t          g dg dg|           t          ddd	
          }|                    |                                           }t          g dg dg|           t          ddd	t
          j                  }|                    |           }|j        t
          j        k    sJ d S )Naaabcabbder~   r   r  )r?   r  r  dr)   )r   rv   rv   r   r   )rv   rw   r   rv   rv   T)ry   r   binary)rv   rv   rv   r   r   )rv   rv   r   rv   rv   )ry   r   r  rV  )r   r   r  r	   r   r:  float32rV  )r>  r   r   X_sparses       r$   test_count_binary_occurrencesr  ;  s   '"IF3777D9%%--//A000$2L2L2N2NOOO91=== F3tDDDD9%%--//A91=== F3t2:VVVD!!),,H>RZ''''''r&   c                  v   ddg} t          ddd           }|                    |           }t          j        |dd         j                  dk    sJ t          j        |dd	         j                  d	k    sJ |j        t          j        k    sJ t          ddd
d           }|                    |           }t          j        |j                  dk    sJ |j        t          j        k    sJ t          ddd
d t          j                  }|                    |           }|j        t          j        k    sJ d S )Nr  r  Fr~   )alternate_signry   r  r   rv   r   rw   T)ry   r  r  r  )ry   r  r  r  rV  )r   r   r:  rY  rX  rV  float64)r>  r   r   s      r$   test_hashed_binary_occurrencesr  O  sC   '"IEFNNNDy!!A6!AaC&+!####6!AaC&+!####7bj     d  D 	y!!A6!&>>Q7bj     dRZ  D 	y!!A7bj      r&   c                    t           } |             }|                    |          }|                    |          }t          |t                    sJ |                                }t          ||          D ]j\  }}t          j        t          j	         ||                              }t          j        t          j	        |                    }t          ||           kt          j        |          sJ |j        dk    sJ |                                }	|                    |	          }
t          ||
          D ]9\  }}t          t          j        |          t          j        |                     :|                                }|                    |          }t          ||          D ]9\  }}t          t          j        |          t          j        |                     :d S )Ncsr)r   r   r   r   r   rn   zipr:  sortuniquer	   r
   issparseformatr  tocsc)rD   rX  r   transformed_datainversed_dataanalyzedocinversed_termsr   transformed_data2inversed_data2terms2transformed_data3inversed_data3terms3s                  r$   !test_vectorizer_inverse_transformr  i  s    DJ!//55001ABBMmT*****''))G"477 2 2^	''#,,//00>!:!:;;5.1111?+,,,,,"e++++ )0022112CDDN]N;; < <v275>>276??;;;; )..00112CDDN]N;; < <v275>>276??;;;;< <r&   c                     t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          | |dd          \  }}}}t	          dt                      fdt                      fg          }dd	gd
d}t          ||dd          }|                    ||          	                    |          }	t          |	|           |j        dk    sJ |j        j        d         }
|
j        dk    sJ d S )Nr(  rv   g?r   	test_sizerandom_stater   svcrv   rv   ru   hingesquared_hinge)vect__ngram_range	svc__lossr   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr	   best_score_best_estimator_r   rz   rX  targetr=  r>  target_traintarget_testpipeline
parametersgrid_searchpredbest_vectorizers              r$   -test_count_vectorizer_pipeline_grid_selectionr    s-   --D TC'''1#4E0F0F*FFF 8Hf!8 8 84J	< &/"3"34uikk6JKLLH %f-/ J xA!DDDK ??:|44<<YGGDt[)))
 "c))))!1=fEO&&000000r&   c                  :   t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          | |dd          \  }}}}t	          dt                      fdt                      fg          }dd	gd
dd}t          ||d          }|                    ||          	                    |          }	t          |	|           |j        dk    sJ |j        j        d         }
|
j        dk    sJ |
j        dk    sJ |
j        rJ d S )Nr(  rv   g?r   r  r   r  r  ru   )r3  r  r  )r  
vect__normr  )r  r   r  )r   r  r   r   r   r   r   r   r   r  r	   r  r  r   rz   r  r;  r  s              r$   'test_vectorizer_pipeline_grid_selectionr    sQ   --D TC'''1#4E0F0F*FFF 8Hf!8 8 84J	< &/"3"34uikk6JKLLH %f-"/ J xA>>>K ??:|44<<YGGDt[)))
 "c))))!1=fEO&&00004''''000000r&   c                  *   t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          dt	                      fdt                      fg          }t          || |d          }t          |g d           d S )Nr(  rv   r   r  r   )r   r  )r   r  r   r   r   r   r   r	   )rX  r  r  	cv_scoress       r$   )test_vectorizer_pipeline_cross_validationr    s    --D TC'''1#4E0F0F*FFF&/"3"34uikk6JKLLH$1===Iy///22222r&   c                  t   d} t                      }|                    | g          }|j        dk    sJ t          d d          }|                    | g          }|j        dk    sJ |j        |j        k    sJ t          t          j        |j	                  t          j        |j	                             d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)rv      F)r  r  )rv   i   )
r   r   r   r   r   rT  r	   r:  r  rX  )r   r   	X_countedX_hasheds       r$   test_vectorizer_unicoder    s    	1  D""H:..I?g%%%%$u===D~~xj))H>Z'''' =HL(((( rwy~..0F0FGGGGGr&   c                     ddg} t          |           }|                    t                    }|                    t                    }t	          |                                |                                           |j        sJ d S )Nr   rb  r   )r   r   r   r   r   r  r;  )r   r   X_1X_2s       r$   +test_tfidf_vectorizer_with_fixed_vocabularyr    su    8$Jj111D


]
+
+C
..
'
'CckkmmS[[]];;;!!!!!!r&   c                     t                      t          d          t          d          t          d          t                      t          t                    t          t                    t          t                                        t
                    t          t          	                              t
                    t                      t          t                    t                                          t
                    g} | D ]}t          j	        |          }t          j
        |          }t          |          |j        k    sJ |                                |                                k    sJ t          |                    t
                    |                    t
                               d S )
Nr3  r4  T)r  ru   rz   ra   )ry   rG   )r   r   r   r3   r   r   r+   r   pickledumpsloadstype	__class__
get_paramsr   r   )	instancesorigr#   copys       r$   test_pickling_vectorizerr    su   t$$$&&&f---Z000...Z00044^DDl33377GG...n--I  
 
L|ADzzT^++++  DOO$5$55555$~..~..	
 	
 	
 	

 
r&   factoryc                     t                      } | |          }d}t          j        t          j        |                    } ||          } ||          }||k    sJ dS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    rI   N)r   r  r  r  )r  vecfunctionrp   roundtripped_functionr@   rN  s          r$   test_pickling_built_processorsr    so     

Cws||HGD"Lh)?)?@@x~~H""4((FXr&   c                     t           j                            d          } t          j        g d          }t	          dd          D ]}t          |                     |dd                    }t          |          }t          j	        t          j
        |                    }|                    t                     |                    t                     t          |                                |                                           d S Nr   ra  d   r   F)sizer*   r   )r:  randomRandomStatearrayrZ  r   choicer   r  r  r  r   r   r	   r   )rngvocab_wordsx	vocab_setr   unpickled_cvs         r$   -test_countvectorizer_vocab_sets_when_picklingr  3  s     )


"
"C(
	
 
	
 
	
 K 1c]] 
 


;Q
FFGG		222|FL$4$455
}'''$$&&(J(J(L(L	
 	
 	
 	

 
r&   c                  Z   t           j                            d          } t          j        g d          }t	          dd          D ]}t                      }|                     |dd          }t	          dd          D ]}||||         <   t          |          }t          j	        t          j
        |                    }|                    t                     |                    t                     t          |                                |                                           d S r  )r:  r  r  r  rZ  r   r  r   r  r  r  r   r   r	   r   )r  r  r  
vocab_dictr   yr   r  s           r$   .test_countvectorizer_vocab_dicts_when_picklingr  O  s%   
)


"
"C(
	
 
	
 
	
 K 1c]] 
 
VV


;Q
>>q! 	% 	%A#$JuQx  
333|FL$4$455
}'''$$&&(J(J(L(L	
 	
 	
 	

 
r&   c                     t                                          t                    } t                                          |           }t          j        |          }t          j        |          }t          |          |j	        k    sJ t          |                    |                                           |                    |                                                      d S r    )r   r   r   r   r   r  r  r  r  r  r	   r  )r   r  r#   r  s       r$   test_pickling_transformerr  l  s    ''77A!!!$$DTA<??D::''''t))!,,44668J8J18M8M8U8U8W8WXXXXXr&   c                  |   t                                          t                    } t                                          |           }t                      }|j        |_        t          |                    |                                           |                    |                                                      d S r    )	r   r   r   r   r   r5  r	   r   r  )r   r  r  s      r$   test_transformer_idf_setterr  u  s    ''77A!!!$$DD	DIt~~a((0022DNN14E4E4M4M4O4OPPPPPr&   c                     t          d          } |                     t                     t          | j        d          }| j        |_        t          |                    t                                                    |                     t                                                               t          | j        d          }d}t          j	        t          |          5  | j        |_        d d d            d S # 1 swxY w Y   d S )NTr6  r   r%  Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r5  r	   r   r  r   r   r   )r  r  r   s      r$   test_tfidf_vectorizer_setterr  }  s(   4(((DHH^d&6EEED	DI~&&..00~&&..00  
 d&6FFFD;G	z	1	1	1  I	                 s   $C>>DDc                  F   t          d          } |                     t                     t          | j        d          }t	          | j                  }dg|dz   z  }t          j        t                    5  t          |d|           d d d            d S # 1 swxY w Y   d S )NTr6  r  r   rv   r5  )
r   r   r   r   r   r5  r   r   r   setattr)r   r  expected_idf_leninvalid_idfs       r$   %test_tfidfvectorizer_invalid_idf_attrr    s    4(((DHH^d&6EEED49~~%+a/0K	z	"	" + +fk***+ + + + + + + + + + + + + + + + + +s   7BBBc                      g d} t          |           }t          j        t                    5  |                    g            d d d            d S # 1 swxY w Y   d S )N)r?   r  r  r?   r?   r   r   r   s     r$   test_non_unique_vocabr    s    %%%Ee,,,D	z	"	"                   s   AAAc                      d} t           }d }t          j        ||           5   |             d d d            d S # 1 swxY w Y   d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  f    t                      } |                     dt          j        dg           d S )Nhello worldhello hello)r   r   r:  nan)hvs    r$   funcz0test_hashingvectorizer_nan_in_docs.<locals>.func  s0      
-?@@@@@r&   r   )r   r   r   )r  	exceptionr  s      r$   "test_hashingvectorizer_nan_in_docsr    s     PGIA A A 
y	0	0	0                   s   ;??c                  p   t          ddd           } | j        sJ |                     ddg                                          }t	          |                                g d           |                     ddg                                          }t	          |                                g d           d S )NTF)r  r%  r  r  r  )rv   rv   rv   r   )r   r  r   r  r	   ravelr   )r   r   r   s      r$   test_tfidfvectorizer_binaryr    s    tU>>>A8OOO	677??AAAqwwyy,,,///	
m]3	4	4	<	<	>	>Brxxzz<<<00000r&   c                      t          d          } |                     t                     t          | j        | j        j                   d S )NTr6  )r   r   r   r   r5  rQ  )r   s    r$   test_tfidfvectorizer_export_idfr    sA    4(((DHH^di)9:::::r&   c                      t          dg          } t          |           }|                     t                     |                    t                     |j        | j        k    sJ d S )Nr/  r   )r   r   r   r   r   )
vect_vocabvect_vocab_clones     r$   test_vectorizer_vocab_cloner    se     UG444JZ((NN=!!!'''':+AAAAAAAr&   c                    d} |             }t          j        t          |          5  |                    d           d d d            n# 1 swxY w Y   t          j        t          |          5  |                    d           d d d            n# 1 swxY w Y   |                    ddg           t          j        t          |          5  |                    d           d d d            d S # 1 swxY w Y   d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)r   r   r   r   r   r   )rD   r  r  s      r$   &test_vectorizer_string_object_as_inputr    s    SG
*,,C	z	1	1	1 * *.)))* * * * * * * * * * * * * * * 
z	1	1	1                                 GG[+,---	z	1	1	1 & &n%%%& & & & & & & & & & & & & & & & & &s5   A

AA0BBBC22C69C6X_dtypec                     t          j        dd| d          }t                                          |          }|j        |j        k    sJ d S N
    N  *   rV  r  )r
   randr   r   rV  )r  r   X_transs      r$   test_tfidf_transformer_typer#    sN    BW2>>>A  ..q11G=AG######r&   zcsc_container, csr_containerc                 >   t          j        ddt          j        d          } | |          } ||          }t	                                          |          }t	                                          |          }t          ||           |j        |j        k    sJ d S r  )r
   r!  r:  r  r   r   r   r  )csc_containercsr_containerr   X_cscX_csrX_trans_cscX_trans_csrs          r$   test_tfidf_transformer_sparser+    s     	BRZbAAAAM!EM!E"$$22599K"$$22599K k:::!3333333r&   z0vectorizer_dtype, output_dtype, warning_expectedTFc                    t          j        g d          }t          |           }d}|rIt          j        t
          |          5  |                    |          }d d d            n# 1 swxY w Y   nZt          j                    5  t          j	        dt
                     |                    |          }d d d            n# 1 swxY w Y   |j
        |k    sJ d S )N)numpyscipysklearnrV  z'dtype' should be used.r   r   )r:  r  r   r   r  r  r   r  r  r  rV  )vectorizer_dtypeoutput_dtypewarning_expectedr   r   warning_msg_matchX_idfs          r$   test_tfidf_vectorizer_typer6    s_    	...//A '7888J1 0\+->??? 	0 	0,,Q//E	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 $&& 	0 	0!';777,,Q//E	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 ;,&&&&&&s$   A((A,/A,0CC
Cr  )rw   rv   r  c                 *   | j         }t          j        d| d          }t          j        t
          |          5  |                     dg           d d d            n# 1 swxY w Y   t          j        t
          |          5  |                     dg           d d d            n# 1 swxY w Y   t          | t                    rLt          j        t
          |          5  | 
                    dg           d d d            d S # 1 swxY w Y   d S d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.r   zgood news everyone)rz   reescaper   r   r   r   r   r   r   r   )r  invalid_ranger  s      r$   $test_vectorizers_invalid_ngram_ranger;    s    OMi	9 	9 	9 	9 G
 
z	1	1	1 ( (%&'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( 
z	1	1	1 2 2/01112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 #()) 2]:W555 	2 	2MM/0111	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	22 2s5   AA"%A"B''B+.B+"DD
D
c                     |                                  }|                                 }|                                 }|                     |||          S r    )r   build_tokenizerr<  _check_stop_words_consistency)	estimatorr   tokenize
preprocesss       r$   r>  r>     sM    ))++J((**H--//J22:z8TTTr&   c                     d} d| z  }t                      t                      t                      fD ]x}|                    g d           t	          j        t          |          5  |                    dg           d d d            n# 1 swxY w Y   |`t          |          du sJ yt          j                    5  t          j        dt                     |                    dg           d d d            n# 1 swxY w Y   t          |          J |                    g d	           t	          j        t          |          5  |                    dg           d d d            d S # 1 swxY w Y   d S )
Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.)you'veyouyou'llANDr   r   r  Fr   )rC  rD  rE  blahrF  )r   r   r   r   r   r  r  r   _stop_words_idr>  r  r  r  )lstrr  r  s      r$   'test_vectorizer_stop_words_inconsistentrJ  '  sT   #D	')-	. 
  !!?#4#46G6I6IJ ; ;"D"D"DEEE\+W555 	/ 	/}o...	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ ,S11U::::: 
	 	"	" + +g{333=/***+ + + + + + + + + + + + + + + )--555 NNHHHNIII	k	1	1	1 + +=/***+ + + + + + + + + + + + + + + + + +s6   &B		B	B	=1C::C>C>	E--E14E1r&  c                 <    | dt           j                  }t           j        }|j                            |          |_        |j                            |          |_        dddd}t                                          ||          }||j        j        k    sJ dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )r   r   r0  r   rv   rw   )zscikit-learnrU   zgreat!N)r:  int64indicesastypeindptrr   _sort_featuresrV  )r&  r   INDICES_DTYPEr   Xss        r$   7test_countvectorizer_sort_features_64bit_sparse_indicesrS  B  s     	fBH---A HM	  //AIx}--AH"#1::J				)	)!Z	8	8BBJ,,,,,,,r&   	Estimatorc                    ddig} |             }t          |          du sJ  | d dg          }t          |          dk    sJ t          |          J |                    |            G d d	|           } |dg
          }t          |          dk    sJ  | d dg          }t          |          du sJ d S )Nrp   r  Tc                     | d         S Nrp   r2   r  s    r$   <lambda>z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>e  s
    1V9 r&   and)rb   r   r   c                       e Zd Zd ZdS )Ftest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                     d S )Nc                     | d         S rW  r2   rX  s    r$   rY  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>m  s
    QvY r&   r2   )selfs    r$   r<  zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorl  s    &&&r&   N)__name__
__module____qualname__r<  r2   r&   r$   CustomEstimatorr\  k  s#        	' 	' 	' 	' 	'r&   rc  r   c                 P    t          j        d                              |           S )Nz\w{1,})r8  compilefindallr  s    r$   rY  z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>s  s    bj33;;C@@ r&   )rm   r   )r>  r   )rT  rX  r  rc  s       r$   -test_stop_word_validation_custom_preprocessorrh  \  s*    [!"D
)++C(--5555
)!4!4%
I
I
IC(--8888(--555d' ' ' ' ') ' ' ' /eW
-
-
-C(--8888
)@@eW  C )--555555r&   zinput_type, err_type, err_msgfilenamer=   r[   z$'str' object has no attribute 'read'c                     dg}t          j        ||          5   | d |                              |           d d d            d S # 1 swxY w Y   d S )N"this is text, not file or filenamer   c                 *    |                                  S r    r-   rX  s    r$   rY  z.test_callable_analyzer_error.<locals>.<lambda>  s    QWWYY r&   ry   r\   )r   r   r   )rT  
input_typeerr_typer   rX  s        r$   test_callable_analyzer_errorrp  x  s     11D	xw	/	/	/ V V	..jAAAOOPTUUUV V V V V V V V V V V V V V V V V Vs   "A		AAry   c                 "    t          | d          S )Nr)openrg  s    r$   rY  rY    s    T#s^^ r&   c                 *    |                                  S r    )readrg  s    r$   rY  rY    s     r&   rn  c                     dg}t          j        t          t          f          5   | ||                              |           d d d            d S # 1 swxY w Y   d S )Nrk  rm  )r   r   FileNotFoundErrorAttributeErrorr   )rT  ry   rn  rX  s       r$   &test_callable_analyzer_change_behaviorry    s     11D	)>:	;	; K K	8:666DDTJJJK K K K K K K K K K K K K K K K K Ks   !AAAc                 
   d }|                      d          }|                    d           t          j        t          d          5   ||d                              |g           d d d            d S # 1 swxY w Y   d S )Nc                      t          d          )Ntesting)	Exceptionrg  s    r$   ry   z6test_callable_analyzer_reraise_error.<locals>.analyzer  s    	"""r&   zfile.txtzsample content
r|  r   r[   rm  )joinwriter   r   r}  r   )tmpdirrT  ry   fs       r$   $test_callable_analyzer_reraise_errorr    s    
# # # 	JAGG	y		2	2	2 F F	86222@@!EEEF F F F F F F F F F F F F F F F F Fs   	"A88A<?A<zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgrC  rE  r  r~   z'stop_words'
'analyzer'	!= 'word'c                 *    |                                  S r    r-   r"   s    r$   rY  rY        aggii r&   z'tokenizer'c                 *    |                                  S r    r-   r"   s    r$   rY  rY    r  r&   \w+rs   'token_pattern'zis not Nonec                 *    |                                  S r    r!   r"   s    r$   rY  rY    r  r&   c                 *    |                                  S r    r  r"   s    r$   rY  rY    r  r&   z'preprocessor'zis callableru   c                 *    |                                  S r    r  r"   s    r$   rY  rY    r  r&   z'ngram_range')	NNNr  r  r~   r  r  r  c
                    t           }
 |             }|                    ||||||           d|d|d|	}t          j        t          |          5  |                    |
           d d d            d S # 1 swxY w Y   d S )N)r   rm   rb   rz   r   ry   zThe parameter z will not be used since  r   )r   r   r   r  r  r   )rD   r   rm   rb   rz   r   ry   unused_name	ovrd_nameovrd_msgr=  r   r   s                r$   test_unused_parameters_warnr    s    r  J:<<DOO!#      			C
 
k	-	-	-                   s   A66A:=A:zVectorizer, Xrv   rw   )r   barr   )r   bazc                      |             }t          |d          rJ |                    |           t          |d          rJ d S )Nn_features_in_)r9  r   )rD   r   r   s      r$   test_n_features_inr    sU     Jz#344444NN1z#34444444r&   c                      t          d          } |                     ddg          j        }|                     ddg          j        }||k    sJ d S )Nrv   rr  helloworld)r   r   r   )r  vocab1vocab2s      r$   )test_tie_breaking_sample_order_invariancer  %  s]     q
)
)
)CWWgw'((4FWWgw'((4FVr&   c                  z    t          dd          } |                     dg          j        }|d         dk    sJ d S )Ni@B )rw   r   )rU  rz   z22pcs efuturer   )r   r   rM  )hashingrM  s     r$   2test_nonnegative_hashing_vectorizer_result_indicesr  .  sB    7GGGG 122:G1:??????r&   c                 >     |             }t          |d          rJ dS )z0Check that vectorizers do not define set_output.
set_outputN)r9  )rT  r   s     r$   'test_vectorizers_do_not_have_set_outputr  5  s+    
 )++CsL)))))))r&   c                    t          j        ddt          j        d          } | |          }|                                }t                                          |          }|                    |d          }t          ||           ||usJ |                    |d          }||u sJ t          j
        t                    5  t          ||           ddd           dS # 1 swxY w Y   dS )	zJCheck the behaviour of TfidfTransformer.transform with the copy parameter.r  r  r  r   T)r  FN)r
   r!  r:  r  r  r   r   r   r   r   r   AssertionError)r&  r   r(  X_csr_originaltransformerX_transforms         r$   test_tfidf_transformer_copyr  >  s;    	BRZbAAAAM!E ZZ\\N"$$((//K''D'99K 777e####''E'::K%	~	&	& < <$UN;;;< < < < < < < < < < < < < < < < < <s   C$$C(+C(rV  c                     d t          d          D             }t          |                               |          }|j        j        | k    sJ dS )zCheck that `idf_` has the same dtype as the input data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/30016
    c                 N    g | ]"}t          t          j                              #S r2   )struuiduuid4).0r]  s     r$   
<listcomp>z<test_tfidf_vectorizer_perserve_dtype_idf.<locals>.<listcomp>Z  s&    333qTZ\\		333r&   i r0  N)rZ  r   r   r5  rV  )rV  r   r   s      r$   (test_tfidf_vectorizer_perserve_dtype_idfr  S  sW     	43E'NN333A u---11!44J? E))))))r&   )r  r8  r  r  collectionsr   collections.abcr   	functoolsr   ior   	itertoolsr   r-  r:  r   numpy.testingr   r	   r.  r
   sklearn.baser   sklearn.feature_extraction.textr   r   r   r   r   r   r   r   sklearn.model_selectionr   r   r   sklearn.pipeliner   sklearn.svmr   sklearn.utils._testingr   r   r   sklearn.utils.fixesr   r   r   r   r  r   r%   r+   r/   r3   rA   rC   markparametrizerq   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  xfailr"  r&  rO  rR  r_  rk  rp  r~  r  r  r  r  r  r  r  r  r  r  r  rn   r<  r=  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r+  int32rL  r6  r;  r>  rJ  rS  rh  rw  rx  rp  paramry  r  r  r  r  r  r  r  r  r2   r&   r$   <module>r     s    				   # # # # # # # # # # # #                        G G G G G G G G            	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 T S S S S S S S S S % % % % % % ! ! ! ! ! !         
 I H H H H H H H H H  !22, , ,       $ $ $!0 !0 !0H. . .* 9J'KLL:  :  ML: z     <  && & &4& & &() ) ).& & &*
+ 
+ 
+& & &$ $ $0 0 0E E E& & &4 4 4&  &) ) )*< < <    	M   & & &&  d d dN5 5 5:#? #? #?LD/ D/ D/N 'IJJ> > KJ>: : :4- - -$- - -$( ( ((! ! !4 'IJJ< < KJ<>!1 !1 !1H$1 $1 $1N
3 
3 
3H H H0" " "
 
 
6 &*' 
 
 

 
 
8
 
 
:Y Y YQ Q Q   + + +    1 1 1; ; ;B B B ?O5FG & & & RZ$<==$ $ >=$ "GGNN$K$K 4 4 4 6	2:t$	2:t$	RZ'	RZ'	 ' ' ' 	f---F+++F+++ 2 2 2(U U U+ + +6 .99- - :9 -0 /?4EF 6 6 62 /?4EF  #	&+	!GH V V  V &''  ++-C-CD  
';<<K K =<  K /?4EF 	F 	F 	F ?$5G  	5
 x 
	
 
	
 
	
 
	
 
	

	
qCI IT UI I Z@ 	Qq111Q3G3GHI	.) 5 5 5     /?4DFWX * * * .99< < :9<( 2:rz":;;* * <;* * *r&   