
    bMh*                     t   d Z ddlmZmZ ddlZddlZddlZddlZddl	Z	ddl
mZmZ ddlmZ e	j                            d          Ze	j                            d          Zd Zed             Zee	j                            d	d
dg          e	j                            dg d          d                                     Zd Zd Ze	j                            ddi  eddgi          fdddi eddgi          fdddgi edddgi          fddgdd eddgi          fddgdd edej        dgi          fg          d              Zd! Ze	j                            d"g d#          d$             Ze	j                            d%ddg          d&             Zd' Z e	j                            dg d(          d)             Z!e	j                            dg d*          d+             Z"d, Z#d- Z$e	j                            d.          e	j                            d/d0d1g          d2                         Z%dS )3zZ
Tests encoding functionality during parsing
for all of the parsers defined in parsers.py
    )BytesIOTextIOWrapperN)	DataFrameread_csvz=ignore:Passing a BlockManager to DataFrame:DeprecationWarningpyarrow_skipc                     d}| }t          d                    |                    }|                    |d|          }t          ddggddg	          }t	          j        ||           d S )
Ncp1255u   שלום:1234
562:123:)sepencodingi2  {   u   שלום1234columnsr   encoder   r   tmassert_frame_equal)all_parsersr   parserdataresultexpecteds         d/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/pandas/tests/io/parser/test_encoding.pytest_bytes_io_inputr      su    HF+228<<==D__TsX_>>F3*
F/CDDDH&(+++++    c                     | }t          d                                          }|                    |ddd           }t          ddgg          }t	          j        ||           d S )Nu   Łaski, Jan;1;utf-8)r   r   headeru   Łaski, Jan   r   )r   r   r   r   r   s        r   test_read_csv_unicoder"   (   si    F&--//00D__TsWT_JJF,a0122H&(+++++r   r   ,	r   )utf-16zutf-16lezutf-16bec                 l   | }d                     d|          }dt          j                     d}|dd}d}t          j        |          5 }|                    |          }t          |d          5 }	|	                    |           d d d            n# 1 swxY w Y   t          t          |                    |                    |	          5 }
 |j
        |fd
|i|} |j
        |
fd
|i|}d d d            n# 1 swxY w Y   t          j        ||           d d d            d S # 1 swxY w Y   d S )Nz)skip this
skip this too
A,B,C
1,2,3
4,5,6r#   __z__.csv   )r   skiprowsr   wbr   r   )replaceuuiduuid4r   ensure_cleanr   openwriter   r   r   r   )r   r   r   r   r   pathkwargsutf8
bytes_datafbytes_bufferr   r   s                r   test_utf16_bom_skiprowsr8   2   s%   
 F	 
S
 
	 	 %
$$$Da((FD			 	0$[[**
$ 	 GGJ	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  74;;t#4#455EEE 	N$V_TGGHGGGF&v|MMdMfMMH	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	fh///	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0sZ   &D)1BD)B	D)B	4D)!C<0D)<D 	 D)D 	D))D-0D-c                     t           j                            |d          }| }|                    |dd          }t	          |          dk    sJ d S )Nzutf16_ex.txtr%   r$   )r   r   2   )osr2   joinr   len)r   csv_dir_pathr2   r   r   s        r   test_utf16_exampler?   O   sP    7<<n55DF__TH$_??Fv;;"r   c                     t           j                            |d          }| }|                    |d d          }|                    d          }|d         d         }d}||k    sJ d S )Nunicode_series.csvlatin-1)r    r   r   r!   i`  u$   Á köldum klaka (Cold Fever) (1994))r;   r2   r<   r   	set_index)r   r>   r2   r   r   gotr   s          r   test_unicode_encodingrE   V   sj    7<<&:;;DF__T$_CCFa  F
)D/C9H(??????r   zdata,kwargs,expectedza
1ar!   z"a"
1	quotechar"zb
1namesb1
1T)rI   skip_blank_linesFc                    	 | }dd		fd}|j         dk    r1|dk    r+|                    dd          rt          j        d	            |j         ||          fd
	i|}t          j        ||           d S )Nu   ﻿r   c                 R    | z                                  }t          |          S )N)r   r   )_databom_databomr4   s     r   _encode_data_with_bomz,test_utf8_bom.<locals>._encode_data_with_bom{   s(    %K''--x   r   pyarrowrL   rM   Tz,https://github.com/apache/arrow/issues/38676)reasonr   )enginegetpytestskipr   r   r   )
r   r   r3   r   requestr   rS   r   rR   r4   s
           @@r   test_utf8_bomr[   b   s    * F
CD! ! ! ! ! !
 	""EMMJJ)400  	IJJJJV_22488RR4R6RRF&(+++++r   c                     t          dgdgd          }| }|                    |          }d                    |          }|                    t	          |          |          }t          j        ||           d S )Ng333333@test)mb_num	multibytezmb_num,multibyte
4.8,testr+   )r   formatr   r   r   r   r   )r   	utf_valueencoding_fmtr   r   r   r   r   s           r   test_read_csv_utf_aliasesrc      sz    SEAABBHF""9--H'..x88D__WT]]X_>>F&(+++++r   zfile_path,encoding)))ior   csvz	test1.csvr   ))rd   r   r   rA   rB   ))rd   r   r   zsauron.SHIFT_JIS.csvshiftjisc                 z   | } || }|                     ||          }t          ||          5 }|                     |          }|j        rJ 	 d d d            n# 1 swxY w Y   t          j        ||           t          |d          5 }	|                     |	|          }|	j        rJ 	 d d d            n# 1 swxY w Y   t          j        ||           t          |dd          5 }	|                     |	|          }|	j        rJ 	 d d d            n# 1 swxY w Y   t          j        ||           d S )Nr+   rb)moder   )ri   	buffering)r   r0   closedr   r   )
r   	file_pathr   datapathr   fpathr   far   fbs
             r   test_binary_mode_file_buffersrq      s    FHi Eux88H	eh	'	'	' 2$$9               (F+++	e$			 2h779               (F+++	e$!	,	,	, h779               (F+++++s5   AA #A !B;;B?B?-!DD"Dpass_encodingc                    | }|                     |          }|j        dk    r|du r|dv rt          j        d           t	          ddgi          }t          j        d|d          5 }|                    d	           |                    d
           |	                    ||r|nd           }t          j
        ||           d d d            d S # 1 swxY w Y   d S )NrT   T)       zThese cases freezefoobarzw+)ri   r   return_filelikezfoo
barr   r+   )r`   rV   rX   rY   r   r   r/   r1   seekr   r   )	r   ra   rb   rr   r   r   r   r6   r   s	            r   test_encoding_temp_filerz      s-    F""9--H}	!!mt&;&;	X@U@U()))%%)**H	dXt	L	L	L 0PQ	
	q			-PXXDQQ
fh///0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0s   (ACCCc                    | }d}d}d}t          ||gi          }t          j                    5 }|                    | d|                     |                     |                    d           |                    ||          }t          j        ||           |j	        rJ 	 d d d            d S # 1 swxY w Y   d S )Nz	shift-jisu	   てすとu   こむ
r   r+   )
r   tempfileNamedTemporaryFiler1   r   ry   r   r   r   rk   )r   r   r   titler   r   r6   r   s           r   test_encoding_named_temp_filer      s   FHED%$))H		$	&	& !	5""D""))(33444	q			X66
fh///8                 s   A8B44B8;B8)r   r%   z	utf-16-bez	utf-16-lezutf-32c                     d}t          |                    |                     }t          |d|           }t          ddgddgdd	ggd
dg          }t	          j        ||           d S )Nu   a	b
：foo	0
bar	1
baz	2r$   )	delimiterr   u   ：foor   rw   r!   bazr(   rF   rJ   )r   r   r   )r   r   encoded_datar   r   s        r   %test_parse_encoded_special_charactersr      s     -D4;;x0011LldXFFFFmeQZ%4c
  H &(+++++r   )r   Nr%   r	   rB   c                    | }t          g dg dg dd          }t          j                    5 }|                    |d|           |j        dk    r[d}t          j        t          |	          5  |                    ||d
           d d d            n# 1 swxY w Y   	 d d d            d S |                    ||d
          }d d d            n# 1 swxY w Y   t          j	        ||           d S )N)Raphael	DonatellozMiguel AngelLeonardo)redpurpleorangeblue)saizbo staffnunchunkkatana)namemaskweaponF)indexr   rT   BThe 'memory_map' option is not supported with the 'pyarrow' enginematchT)r   
memory_map)
r   r   r/   to_csvrV   rX   raises
ValueErrorr   r   )r   r   r   r   filemsgdfs          r   test_encoding_memory_mapr      s    FHHH777???	
 	
 H 
		 	GdEH====I%%VCz555 J JxDIIIJ J J J J J J J J J J J J J J	G 	G 	G 	G 	G 	G 	G 	G __TH_FF	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G 	G "h'''''s<   AC0B	CB	CB	C.CCCc                    | }t          dgdz            }d|j        d<   t          j        d          5 }|                    |ddd	           |j        d
k    r[d}t          j        t          |          5  |	                    |dd           ddd           n# 1 swxY w Y   	 ddd           dS |	                    |dd          }ddd           n# 1 swxY w Y   t          j
        ||           dS )zO
    Chunk splits a multibyte character with memory_map=True

    GH 43540
    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaai   )r   u   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaąi  zbug-gh43540.csvFr   r   r    r   rT   r   r   NT)r    r   )r   ilocr   r/   r   rV   rX   r   r   r   r   )r   r   r   fnamer   dfrs         r    test_chunk_splits_multibyte_charr     s    F	d*	+	+	+B %BGDM	*	+	+ 	Cu
		%uUW	EEE=I%%VCz555 E EdtDDDE E E E E E E E E E E E E E E	C 	C 	C 	C 	C 	C 	C 	C ooeDToBB	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C #r"""""s<   AC7BCB 	 C#B 	$C5CC Cc           	      &   g }d}d}d}t          t          |          t          |          |          D ]q}d                    d t          ||dz             D                       dz   }	 |                    d           n# t          $ r Y Xw xY w|                    |           r| }t          |          }t          j        d          5 }	|	                    |	d	d	d
           |j
        dk    r\d}
t          j        t          |
          5  |                    |	ddd           ddd           n# 1 swxY w Y   	 ddd           dS |                    |	ddd          }ddd           n# 1 swxY w Y   t          j        ||           dS )zg
    GH 43787

    Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
        u   𐂀 c                 ,    g | ]}t          |          S  )chr).0cs     r   
<listcomp>z,test_readcsv_memmap_utf8.<locals>.<listcomp>/  s    AAA1AAAAr   r|   r   zutf8test.csvFr   rT   r   r   NT)r    r   r   )rangeordr<   r   UnicodeEncodeErrorappendr   r   r/   r   rV   rX   r   r   r   r   )r   linesline_length
start_charend_charlnumliner   r   r   r   r   s               r   test_readcsv_memmap_utf8r   "  sq    EKJH c*oos8}}kBB  wwAAdD4K(@(@AAABBTI	KK    ! 	 	 	H	TF	5		B		(	( 	UE
		%uUW	EEE=I%%VCz555 W WdtgVVVW W W W W W W W W W W W W W W	U 	U 	U 	U 	U 	U 	U 	U ooeDTGoTT	U 	U 	U 	U 	U 	U 	U 	U 	U 	U 	U 	U 	U 	U 	U "c"""""sO   *B  
BBAE1D3'E13D7	7E1:D7	;E1E11E58E5pyarrow_xfailri   zw+bzw+tc                 B   | }d}d|v rd}t          j        |d          5 }|                    |           |                    d           |                    |          }d d d            n# 1 swxY w Y   t          g dg          }t          j        ||           d S )Ns   abcdtabcdr   )ri   r   r   r   )r}   SpooledTemporaryFiler1   ry   r   r   r   r   )r   ri   r   contenthandler   r   s          r   test_not_readabler   D  s     FG
d{{		&D7	C	C	C %vWA__V$$% % % % % % % % % % % % % % % fX...H"h'''''s   A A--A14A1)&__doc__rd   r   r   r;   r}   r-   numpynprX   pandasr   r   pandas._testing_testingr   markfilterwarnings
pytestmarkusefixturesskip_pyarrowr   r"   parametrizer8   r?   rE   nanr[   rc   rq   rz   r   r   r   r   r   r   r   r   r   <module>r      s           
			                    [''C 
 {&&~66, , , , , , d,,%G%G%GHH0 0 IH -, 04  	 	 	  
YYaSz**+	K%yy#s'<'<=	'C5!99cC:->#?#?@	3%T::IIsQCj<Q<QR e77IsRVQK())	
 &, ,' &,,	, 	, 	,    , , ,0 4-880 0 980&  ( GGG , , , %S%S%STT( ( UT(.# # #4# # #D ))%00( ( 10 *)( ( (r   