
    0PhU.                     H   d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	m
Z
 ddlmZmZ ddlZddlZddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$  eddd           eddd           eddd           eddd           eddd          fZ% edd d!          Z& ej'        e(          Z) ee*edg eh d"          gd#gd$gd#gd#g eeddd%&          g eed'dd(&          gd)d*+          dd,d*dd-d-d.d/d)d0            Z+d1 Z,d2 Z-dS )3zhRCV1 dataset.

The dataset page is available at

    http://jmlr.csail.mit.edu/papers/volume5/lewis04a/
    N)GzipFile)IntegralReal)PathLikemakedirsremove)existsjoin   )Bunch)shuffle)Interval
StrOptionsvalidate_params   )get_data_home)RemoteFileMetadata_fetch_remote_pkl_filepath
load_descr)load_svmlight_filesz.https://ndownloader.figshare.com/files/5976069@ed40f7e418d10484091b059703eeb95ae3199fe042891dcec4be6696b9968374z lyrl2004_vectors_test_pt0.dat.gz)urlchecksumfilenamez.https://ndownloader.figshare.com/files/5976066@87700668ae45d45d5ca1ef6ae9bd81ab0f5ec88cc95dcef9ae7838f727a13aa6z lyrl2004_vectors_test_pt1.dat.gzz.https://ndownloader.figshare.com/files/5976063@48143ac703cbe33299f7ae9f4995db49a258690f60e5debbff8995c34841c7f5z lyrl2004_vectors_test_pt2.dat.gzz.https://ndownloader.figshare.com/files/5976060@dfcb0d658311481523c6e6ca0c3f5a3e1d3d12cde5d7a8ce629a9006ec7dbb39z lyrl2004_vectors_test_pt3.dat.gzz.https://ndownloader.figshare.com/files/5976057@5468f656d0ba7a83afc7ad44841cf9a53048a5c083eedc005dcdb5cc768924aezlyrl2004_vectors_train.dat.gzz.https://ndownloader.figshare.com/files/5976048@2a98e5e5d8b770bded93afc8930d88299474317fe14181aee1466cc754d0d1c1zrcv1v2.topics.qrels.gz>   alltesttrainbooleanrandom_stateleft)closedg        neither)	data_homesubsetdownload_if_missingr%   r   
return_X_y	n_retriesdelayT)prefer_skip_nested_validationr!   F   g      ?c                 
   d}d}	d}
d}t          |           } t          | d          }|rt          |          st          |           t	          |d          }t	          |d          }t	          |d	          }t	          |d
          }|rt          |          rt          |          srg }t
          D ]Z}t                              d|j        z             t          ||||          }|
                    t          |                     [t          ||	          }t          j        |d         |d         |d         |d         |d         g                                          }t!          j        |d         |d         |d         |d         |d         f          }|                    t           j        d          }t)          j        ||d           t)          j        ||d           |D ]*}|                                 t/          |j                   +n(t)          j        |          }t)          j        |          }|r%t          |          rt          |          st                              dt4          j        z             t          t4          |||          }d}d}d}t!          j        ||
ft           j                  }t!          j        |t           j                  }i }t          |d          5 }|D ]}|                    d                               d!          } tA          |           dk    rB| \  }!}"}#|!|vr
|dz  }|||!<   tC          |"          }"|"|k    r|"}|dz  }|"||<   d||||!         f<   	 d"d"d"           n# 1 swxY w Y   t/          |           tE          ||          }$||$d"d"f         }t!          j#        |
tH                    }%|%                                D ]}&|&|%||&         <   t!          j&        |%          }'|%|'         }%t          j'        |d"d"|'f                   }t)          j        ||d           t)          j        |%|d           n(t)          j        |          }t)          j        |          }%|d#k    rnl|d$k    r'|d"|d"d"f         }|d"|d"d"f         }|d"|         }n?|d%k    r'||d"d"d"f         }||d"d"d"f         }||d"         }ntQ          d&|z            |rtS          ||||'          \  }}}tU          d(          }(|r||fS tW          ||||%|()          S )*a-  Load the RCV1 multilabel dataset (classification).

    Download it if necessary.

    Version: RCV1-v2, vectors, full sets, topics multilabels.

    =================   =====================
    Classes                               103
    Samples total                      804414
    Dimensionality                      47236
    Features            real, between 0 and 1
    =================   =====================

    Read more in the :ref:`User Guide <rcv1_dataset>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    data_home : str or path-like, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    subset : {'train', 'test', 'all'}, default='all'
        Select the dataset to load: 'train' for the training set
        (23149 samples), 'test' for the test set (781265 samples),
        'all' for both, with the training samples first if shuffle is False.
        This follows the official LYRL2004 chronological split.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    return_X_y : bool, default=False
        If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch
        object. See below for more information about the `dataset.data` and
        `dataset.target` object.

        .. versionadded:: 0.20

    n_retries : int, default=3
        Number of retries when HTTP errors are encountered.

        .. versionadded:: 1.5

    delay : float, default=1.0
        Number of seconds between retries.

        .. versionadded:: 1.5

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object. Returned only if `return_X_y` is False.
        `dataset` has the following attributes:

        - data : sparse matrix of shape (804414, 47236), dtype=np.float64
            The array has 0.16% of non zero values. Will be of CSR format.
        - target : sparse matrix of shape (804414, 103), dtype=np.uint8
            Each sample has a value of 1 in its categories, and 0 in others.
            The array has 3.15% of non zero values. Will be of CSR format.
        - sample_id : ndarray of shape (804414,), dtype=np.uint32,
            Identification number of each sample, as ordered in dataset.data.
        - target_names : ndarray of shape (103,), dtype=object
            Names of each target (RCV1 topics), as ordered in dataset.target.
        - DESCR : str
            Description of the RCV1 dataset.

    (data, target) : tuple
        A tuple consisting of `dataset.data` and `dataset.target`, as
        described above. Returned only if `return_X_y` is True.

        .. versionadded:: 0.20

    Examples
    --------
    >>> from sklearn.datasets import fetch_rcv1
    >>> rcv1 = fetch_rcv1()
    >>> rcv1.data.shape
    (804414, 47236)
    >>> rcv1.target.shape
    (804414, 103)
    i>F i  g   imZ  )r)   RCV1zsamples.pklzsample_id.pklzsample_topics.pklztopics_names.pklzDownloading %s)dirnamer-   r.   )r   )
n_features   r   r         	   r   r0         F)copy)compressdtyperb)r   modeascii Nr!   r#   r"   zLUnknown subset parameter. Got '%s' instead of one of ('all', 'train', test'))r%   zrcv1.rst)datatarget	sample_idtarget_namesDESCR),r   r
   r	   r   r   XY_METADATAloggerinfor   r   appendr   r   spvstacktocsrnphstackastypeuint32joblibdumpcloser   nameloadTOPICS_METADATAzerosuint8int32decodesplitlenint_find_permutationemptyobjectkeysargsort
csr_matrix
ValueErrorshuffle_r   r   ))r)   r*   r+   r%   r   r,   r-   r.   	N_SAMPLES
N_FEATURESN_CATEGORIESN_TRAINrcv1_dirsamples_pathsample_id_pathsample_topics_pathtopics_pathfileseach	file_pathXyXrG   ftopics_archive_pathn_catn_docdoc_previousysample_id_biscategory_nameslineline_componentscatdoc_permutation
categorieskorderfdescrs)                                            V/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/sklearn/datasets/_rcv1.py
fetch_rcv1r   L   s'   f IJLG	222IIv&&H h 	X =99L"8_==N&x1DEE*<==K  0F<$8$8 0~@V@V 0 	7 	7DKK(483444%h)5  I LL95556666 :>>> Ir!ubeRUBqE2a59::@@BBIr!ubeRUBqE2a5ABB	$$RYU$;;	A|a0000I~::::  	 	AGGIII16NNNN	 K%%K//	  5.%&&5..4[.A.A5. 	$'::;;;+X%
 
 

 Hi.bh???"(;;;2>>> 	6! 6 6"&++g"6"6"<"<S"A"A''1,,"1KCa.00
.3s+c((Cl**'*
/2e,45Ae^C0016	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6" 	"### (yAAk111n Xl&999
$$&& 	. 	.A,-J~a()) 
:&&&
M!AAAuH+&&A)A6666Ja88888K*++[--
	7		hwhkNhwhkNhwh'			6		ghhkNghhkNghh'		*,23
 
 	

  O"1aNNN1i
##F !tqIJf   s   BNNNc                     | j         }t          j        |t          j                  }t          j        |t          j                  }t          j        || |           |S )zInverse permutation p.r?   )sizerQ   r[   r]   arangeput)pnsis       r   _inverse_permutationr   @  sK    	A
"(###A
	!28$$$AF1aOOOH    c                     t          j        |           }t          j        |          }t          |          }||         S )z!Find the permutation from a to b.)rQ   rf   r   )abtuu_s        r   rb   rb   I  s2    

1A

1A	a	 	 BR5Lr   ).__doc__logginggzipr   numbersr   r   osr   r   r   os.pathr	   r
   rU   numpyrQ   scipy.sparsesparserN   utilsr   r   ri   utils._param_validationr   r   r    r   _baser   r   r   r   _svmlight_format_ior   rJ   rZ   	getLogger__name__rK   strr   r   rb    r   r   <module>r      s            " " " " " " " " ) ) ) ) ) ) ) ) ) )                                  ' ' ' ' ' ' K K K K K K K K K K       O O O O O O O O O O O O 4 4 4 4 4 4 <S3  
 <S3  
 <S3  
 <S3  
 <S0  +< %$8O%   
	8	$	$ 8T*:666778 ){'(; khxD@@@A(4d9===>	 	 #'   
d d d d dN      r   