
    ZPhl3              	       f   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlZdd	lmZ dd
lmZmZ ddlmZ dZdZdZg dZ e            Z e            Z ee          D ]\  ZZ edz   ee <   e eedz   <    ede!gde"gdgdgdgdgdd          dddddddd            Z#dS )aD	  Collection of imbalanced datasets.

This collection of datasets has been proposed in [1]_. The
characteristics of the available datasets are presented in the table
below.

 ID    Name           Repository & Target           Ratio  #S       #F
 1     ecoli          UCI, target: imU              8.6:1  336      7
 2     optical_digits UCI, target: 8                9.1:1  5,620    64
 3     satimage       UCI, target: 4                9.3:1  6,435    36
 4     pen_digits     UCI, target: 5                9.4:1  10,992   16
 5     abalone        UCI, target: 7                9.7:1  4,177    10
 6     sick_euthyroid UCI, target: sick euthyroid   9.8:1  3,163    42
 7     spectrometer   UCI, target: >=44             11:1   531      93
 8     car_eval_34    UCI, target: good, v good     12:1   1,728    21
 9     isolet         UCI, target: A, B             12:1   7,797    617
 10    us_crime       UCI, target: >0.65            12:1   1,994    100
 11    yeast_ml8      LIBSVM, target: 8             13:1   2,417    103
 12    scene          LIBSVM, target: >one label    13:1   2,407    294
 13    libras_move    UCI, target: 1                14:1   360      90
 14    thyroid_sick   UCI, target: sick             15:1   3,772    52
 15    coil_2000      KDD, CoIL, target: minority   16:1   9,822    85
 16    arrhythmia     UCI, target: 06               17:1   452      278
 17    solar_flare_m0 UCI, target: M->0             19:1   1,389    32
 18    oil            UCI, target: minority         22:1   937      49
 19    car_eval_4     UCI, target: vgood            26:1   1,728    21
 20    wine_quality   UCI, wine, target: <=4        26:1   4,898    11
 21    letter_img     UCI, target: Z                26:1   20,000   16
 22    yeast_me2      UCI, target: ME2              28:1   1,484    8
 23    webpage        LIBSVM, w7a, target: minority 33:1   34,780   300
 24    ozone_level    UCI, ozone, data              34:1   2,536    72
 25    mammography    UCI, target: minority         42:1   11,183   6
 26    protein_homo   KDD CUP 2004, minority        111:1  145,751  74
 27    abalone_19     UCI, target: 19               130:1  4,177    10

References
----------
.. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
   Imbalanced Data Learning and their Application in Bioinformatics."
   Dissertation, Georgia State University, (2011).
    N)OrderedDict)	signature)BytesIO)makedirs)isfilejoin)urlopen)get_data_home)Bunchcheck_random_state   )validate_paramszGhttps://zenodo.org/record/61452/files/benchmark-imbalanced-learn.tar.gzxzdata.npz)ecolioptical_digitssatimage
pen_digitsabalonesick_euthyroidspectrometercar_eval_34isoletus_crime	yeast_ml8scenelibras_movethyroid_sick	coil_2000
arrhythmiasolar_flare_m0oil
car_eval_4wine_quality
letter_img	yeast_me2webpageozone_levelmammographyprotein_homo
abalone_19   booleanrandom_state)	data_homefilter_datadownload_if_missingr-   shuffleverboseT)prefer_skip_nested_validationFc           
         t          |           } t          | d          }t                      }|t                                          }nt                                          }	g }|D ]}
t          |
t                    r.|
|	vrt          |
 d|	           |                    |
           Et          |
t                    rP|
dk     s|
dk    r#t          d|
 dt          dd	                     |                    t          |
                    t          d
t          |
           d          |D ]}
t          t          t          |
                   z   t          z   }t          ||          }t          |          }|r|st!          |d           |rt#          dt$          z             t'          t)          t$                                                              }t-          j        |          }dt1          |j                  j        v r|                    |d           n*|                    |           n|s|st7          d          t9          j        |          }|d         |d         }}|rSt9          j        |j        d                   }tA          |          }|!                    |           ||         }||         }tE          |||
          ||
<   |S )a  Load the benchmark datasets from Zenodo, downloading it if necessary.

    .. versionadded:: 0.3

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int, default=None
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the above table to get the ID and name of the datasets.

    download_if_missing : bool, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, default=None
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    verbose : bool, default=False
        Show information regarding the fetching.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The ordered is defined by ``filter_data``. Each Bunch object ---
        referred as dataset --- have the following attributes:

        dataset.data : ndarray of shape (n_samples, n_features)

        dataset.target : ndarray of shape (n_samples,)

        dataset.DESCR : str
            Description of the each dataset.

    Notes
    -----
    This collection of datasets have been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    )r.   zenodoNz8 is not a dataset available. The available datasets are r+      zThe dataset with the ID=z* is not an available dataset. The IDs are    z1The value in the tuple should be str or int. Got z	 instead.T)exist_okzDownloading %s)fileobjfilterdata)pathr:   )r<   z1Data not found and `download_if_missing` is Falselabelr   )r;   targetDESCR)#r
   r   r   MAP_NAME_IDkeys
isinstancestr
ValueErrorappendintrangeMAP_ID_NAMEtypePRE_FILENAMEPOST_FILENAMEr   r   printURLr   r	   readtarfileopenr   
extractall
parametersIOErrornploadarangeshaper   r1   r   )r.   r/   r0   r-   r1   r2   
zenodo_dirdatasetsfilter_data_	list_dataitfilename	availableftarr;   Xyindrngs                       Y/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/imblearn/datasets/_zenodo.pyfetch_datasetsrf   e   s&   N 	222Ii**J}}H"''))$$&&	 	 	B"c"" Y&&$ B B6?B B  
 !''++++B$$ 66R"WW$*2 * * B<<* *   !''B8888 0 HH0 0 0    9 9#k"o"6"66F
H--8$$	 	Oy 	OZ$//// .&,---))++,,A,q)))C9S^44???Jv>>>>J////$ 	OY 	OMNNNwx  F|T']1 	)AGAJ''C$\22CKK#A#A!AR888O    )$__doc__rO   collectionsr   inspectr   ior   osr   os.pathr   r   urllib.requestr	   numpyrT   sklearn.datasetsr
   sklearn.utilsr   r   utils._sklearn_compatr   rM   rJ   rK   MAP_NAME_ID_KEYSr@   rH   	enumeratevkrC   tuplerf    rg   re   <module>ry      s  ( (Z  # # # # # #                                   " " " " " "     * * * * * * 3 3 3 3 3 3 3 3 3 3 3 3 3 3O   < kmmkmmI&''  DAqUKNKA C[e} ){'(;;  #'
 
 
 ~ ~ ~ ~
 
~ ~ ~rg   