
    0PhJ                         d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ dededefdZdedede
j        fdZd Z	 ddZ	 ddZ	 	 ddZdS )z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)chunk_generatorget_chunk_n_rows)check_pandas_support)	pd_fillna	arff_datainclude_columnsreturnc                    t                      t                      t                      f}d t          |          D             }t          | d         | d         | d                   D ]a\  }}}||v rW|d                             |           |d                             |           |d                             ||                    b|S )a  Obtains several columns from sparse ARFF representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included. (e.g., when including [1, 2, 3], the columns will be relabelled
    to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    c                     i | ]\  }}||	S  r   .0	array_idx
column_idxs      ]/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/sklearn/datasets/_arff_parser.py
<dictcomp>z)_split_sparse_columns.<locals>.<dictcomp>.   +       "7)Z
I      r      r   )list	enumeratezipappend)r   r   arff_data_newreindexed_columnsvalrow_idxcol_idxs          r   _split_sparse_columnsr%      s    . *.(@M ;D_;U;U   "%Yq\9Q<1!N!N @ @Wgo%%!##C(((!##G,,,!##$5g$>???r   c                 @   t          | d                   dz   }|t          |          f}d t          |          D             }t          j        |t          j                  }t          | d         | d         | d                   D ]\  }}}||v r|||||         f<   |S )Nr   c                     i | ]\  }}||	S r   r   r   s      r   r   z)_sparse_data_to_array.<locals>.<dictcomp>@   r   r   dtyper   r   )maxlenr   npemptyfloat64r   )	r   r   num_obsy_shaper!   yr"   r#   r$   s	            r   _sparse_data_to_arrayr2   9   s    
 )A,!#GO,,-G ;D_;U;U   	
+++A!$Yq\9Q<1!N!N 9 9Wgo%%58Ag(112Hr   c                     | |         }t          |          dk    r	| |         }n$t          |          dk    r| |d                  }nd}||fS )a  Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
    r   r   r   N)r+   )framefeature_namestarget_namesXr1   s        r   _post_process_framer8   K   s^    , 	mA
<A,	\		a		,q/"a4Kr   c                 |	  "#$ d } ||           }|dk    rt           j        nt           j        }|dk     }	t          j        |||	          }
||z   ##fd|
d         D             "|dk    rt	          d          }t          |
d                   }t          |                                          }t          |
d                   }|	                    |g|d	
          }|
                    d                                          }t          |          }#fd|D             }||         g}t          |
d         |          D ]3}|                    |	                    ||d	
          |                    4t          |          dk    r)|d                             |d         j                  |d<   |                    |d          }t'          ||          }~~i }|j        D ]\}|         d         }|                                dk    rd||<   .|                                dk    rd||<   L|j        |         ||<   ]|                    |          }t-          |||          \  }$n<|
d         }fd|D             }fd|D             }t/          |t0                    r|t3          d          |d         dk    rd}n|d         |d         z  }t5          j        t8          j                            |          d|          } |j        | }|dd|f         }|dd|f         $nt/          |t@                    rtC          ||          }tE          |d                   dz   }|t          |          f} tF          j$        %                    |d         |d         |d         ff| t4          j&                  }|'                                }tQ          ||          $nt3          dtS          |                     "fd |D             }!|!sn\tU          |!          r/t5          j+        "$fd!tY          |          D                       $nt[          |!          rt3          d"          $j.        d         dk    r$                    d#          $n$j.        d         dk    rd$|dk    r|$|dfS |$d"fS )$a  ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handles the
    dtypes exactly the same.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    c              3   B   K   | D ]}|                     d          V  d S )Nutf-8)decode)	gzip_filelines     r   _io_to_generatorz+_liac_arff_parser.<locals>._io_to_generator   s:       	' 	'D++g&&&&&&	' 	'r   sparsepandas)return_typeencode_nominalc                 N    i | ]!\  }}t          |t                    r|v ||"S r   )
isinstancer   )r   namecatcolumns_to_selects      r   r   z%_liac_arff_parser.<locals>.<dictcomp>   sI       D#c4   &*->%>%> 	c%>%>%>r   
attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deepc                     g | ]}|v |	S r   r   r   colrH   s     r   
<listcomp>z%_liac_arff_parser.<locals>.<listcomp>   s$    TTT33BS;S;S3;S;S;Sr   r   r   r   )ignore_index	data_typeintegerInt64nominalcategoryc                 F    g | ]}t          |         d                    S indexintr   col_nameopenml_columns_infos     r   rQ   z%_liac_arff_parser.<locals>.<listcomp>   s<     %
 %
 %
 #H-g677%
 %
 %
r   c                 F    g | ]}t          |         d                    S rY   r[   r]   s     r   rQ   z%_liac_arff_parser.<locals>.<listcomp>   s<     $
 $
 $
 #H-g677$
 $
 $
r   Nz6shape must be provided when arr['data'] is a Generatorr.   )r)   count)shaper)   z-Unexpected type for data obtained from arff: c                     h | ]}|v S r   r   )r   r^   
categoriess     r   	<setcomp>z$_liac_arff_parser.<locals>.<setcomp>  s+     
 
 
'/H
"
 
 
r   c           
          g | ]j\  }}t          j        t          j                            |          d           dd||dz   f                             t
          d                    kS )Or(   Nr   F)rL   )r,   takeasarraypopastyper\   )r   ir^   re   r1   s      r   rQ   z%_liac_arff_parser.<locals>.<listcomp>  s       
 $8	 G
:>>(#;#;3GGG!!!QQY,..s.??   r   zAMix of nominal and non-nominal targets is not currently supported)ra   )/r   COO	DENSE_GENloadr   r   r   keysnext	DataFramememory_usagesumr
   r	   r   r+   rl   dtypesconcatr   rK   lowerr8   rE   r   
ValueErrorr,   fromiter	itertoolschainfrom_iterablereshapetupler%   r*   spr@   
coo_matrixr.   tocsrr2   typeallhstackr   anyrc   )%r=   output_arrays_typer_   feature_names_to_selecttarget_names_to_selectrc   r?   streamrB   rC   arff_containerpdcolumns_infocolumns_names	first_rowfirst_df	row_bytes	chunksizecolumns_to_keepdfsrJ   r4   rv   rF   column_dtyper7   r   feature_indices_to_selecttarget_indices_to_selectrb   arff_data_Xr/   X_shapeis_classificationre   rH   r1   s%     `                               @@@r   _liac_arff_parserr   k   s   n' ' ' i((F  2X==%))5?K -89NZK  N 02HH   '5  J
 X%%!"CDD">,#?@@\..0011 /00	<<]<OO))t)4488::	$Y//	 UTTT-TTT()#N6$:IFF 	 	DJJT=uEEoV   
 s88q==V]]3q6=11CF
 		#D	11"e$$ M 		2 		2D.t4[AL!!##y00  't##%%22)t$|D1tV$$"*,B
 
11 #6*	%
 %
 %
 %
3%
 %
 %
!$
 $
 $
 $
2$
 $
 $
 
 i++  	} L   Qx2~~a58+;--i88  D
  4<'DQQQ112AQQQ001AA	5)) 	/	;TUUK)A,''!+G$= > >?G	$$Q+a.+a.!ABj %  A
 		A%i1IJJAA QYQQ  
 
 
 
3I
 
 
 ! 	"## 		    
 (11G'H'H   AA "## 	S   71:??		%  AAWQZ1__AX%%!UD  az!!r   c           
      "   ddl | D ]>}|                    d                                                              d          r n?i |D ]K}||         d         }|                                dk    rd|<   .|                                dk    rd	|<   Lfd
t	          |          D             }	dddgddddd|	d	}
i |
|pi } j        | fi |}	 d |D             |_        n-# t          $ r }j        	                    d          |d}~ww xY w||z   fd|j        D             }||         }t          j        d          fd}fd|j                                        D             }|D ]%}||         j                            |          ||<   &t!          |||          \  }}|dk    r|||dfS |                                |                                }}fd|j                                        D             }||d|fS )a^  ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the metadata
    headers of ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    r   Nr;   z@datarS   rT   rU   rV   rW   c                 0    i | ]\  }}|v 	||         S r   r   )r   r$   rF   rv   s      r   r   z'_pandas_arff_parser.<locals>.<dictcomp>  s4       GT6>> 	>>r   F?%"T\)	header	index_col	na_valueskeep_default_nacomment	quotecharskipinitialspace
escapecharr)   c                     g | ]}|S r   r   )r   rF   s     r   rQ   z'_pandas_arff_parser.<locals>.<listcomp>  s    >>>$>>>r   zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.c                     g | ]}|v |	S r   r   rO   s     r   rQ   z'_pandas_arff_parser.<locals>.<listcomp>  s$    PPPss>O7O7Os7O7O7Or   z^'(?P<contents>.*)'$c                 `    t          j        |           }|| S |                    d          S )Ncontents)researchgroup)input_stringmatchsingle_quote_patterns     r   strip_single_quotesz0_pandas_arff_parser.<locals>.strip_single_quotes  s2    	.==={{:&&&r   c                 D    g | ]\  }}t          |j                  |S r   )rE   CategoricalDtyper   rF   r)   r   s      r   rQ   z'_pandas_arff_parser.<locals>.<listcomp>  s@       D%eR011  r   rA   c                 t    i | ]4\  }}t          |j                  ||j                                        5S r   )rE   r   re   tolistr   s      r   r   z'_pandas_arff_parser.<locals>.<dictcomp>  sQ       D%eR011e%%''  r   )rA   r<   rx   
startswithr   read_csvrK   ry   errorsParserErrorr   compilerv   itemsrG   rename_categoriesr8   to_numpy)r=   r   r_   r   r   read_csv_kwargsr>   rF   r   dtypes_positionaldefault_read_csv_kwargsr4   excr   r   categorical_columnsrP   r7   r1   re   rH   rv   r   r   s                       @@@@r   _pandas_arff_parserr   7  s   p    ;;w%%''227;; 	E	 F# & &*40=9,, #F4LL!!Y..%F4L   &':;;   U  "
 
 M0L_5JLOBK	55_55E

 ?>*=>>>   i##@
 
 	 02HHPPPPemPPPO/"E :&=>>' ' ' ' '    <--//  
 # K K3Z^556IJJc

u&=?UVVDAqX%%!UD  zz||QZZ\\1    <--//  J
 az!!s   C+ +
D5DDc                     |dk    rt          | |||||          S |dk    rt          | |||||          S t          d| d          )a6  Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    z	liac-arffrA   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.)r   r   ry   )r=   parseroutput_typer_   r   r   rc   r   s           r   load_arff_from_gzip_filer     s    v  #"
 
 	
 
8		"#"
 
 	
 MMMM
 
 	
r   )N)NN)__doc__r{   r   collectionsr   collections.abcr   typingr   numpyr,   scipyr   	externalsr   externals._arffr   utils._chunkingr	   r
   utils._optional_dependenciesr   utils.fixesr   r%   ndarrayr2   r8   r   r   r   r   r   r   <module>r      s   ? ?
     				 # # # # # # % % % % % %                     0 0 0 0 0 0 ? ? ? ? ? ? ? ? ? ? ? ? ? ? # # # # # # ! 48        F!48Z   $  L I" I" I" I"d U" U" U" U"~ P
 P
 P
 P
 P
 P
r   