
    0PhK              	          d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z#  edd          \  Z$Z% ee$e%d          \  Z$Z% e            &                    e$          Z$g dZ'dhd  ej(                    D             z  Z)dRdZ*ej+        ,                    de          d             Z-d Z.ej+        ,                    dg e#e"          d             Z/d Z0ej+        ,                    d e'          ej+        ,                    d!e          d"                         Z1d# Z2ej+        ,                    d$d%          d&             Z3d' Z4d( Z5d) Z6d* Z7ej+        ,                    d+d,d-g          d.             Z8ej+        ,                    d/e#          d0             Z9ej+        ,                    d1e'          d2             Z:d3 Z;d4 Z<ej+        ,                    d5d!d6i ej=        d7ej>        gej>        d7gg          fd!d6id7d8gd8d7ggfi d7d8gd9d:ggfg          d;             Z?ej+        ,                    d/e#          d<             Z@ej+        ,                    d/e#          d=             ZAd> ZBd? ZCd@ ZDej+        ,                    dAdBdCg          ej+        ,                    dDddEg          dF                         ZEdG ZFej+        ,                    dHdIdJg          dK             ZGej+        ,                    dLdMdNg          dO             ZHej+        ,                    dPd,d-g          dQ             ZIdS )SzF
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
    N)stats)distance)HDBSCAN)CONDENSED_dtype_condense_tree_do_labelling)_OUTLIER_ENCODING)
make_blobs)fowlkes_mallows_score)_VALID_METRICSeuclidean_distances)BallTreeKDTree)StandardScaler)shuffle)assert_allcloseassert_array_equal)CSC_CONTAINERSCSR_CONTAINERS   
   )	n_samplesrandom_state   )r   )kd_tree	ball_treebruteautoc                 $    h | ]\  }}|d          S )label ).0_outs      b/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/sklearn/cluster/tests/test_hdbscan.py	<setcomp>r'   &   s     KKKvq#c'lKKK    Gz?c                     t          t          |           t          z
            }|dk    sJ t          | t                    |k    sJ d S )N   )lensetOUTLIER_SETr   y)labels	threshold
n_clusterss      r&   check_label_qualityr3   )   sH    S[[;.//J???? ++i777777r(   outlier_typec                    t           j        t           j        d|          }d d d|          }t          |          d         }t          |          d         }t                                          }|dg|d<   ||g|d<   t                                          |          }|j        |k    	                                \  }t          |ddg            ||j        |          	                                \  }t          |ddg           t          t          dd                    t          t          d	d
                    z   }	t                                          ||	                   }
t          |
j        |j        |	                    dS )O
    Tests if np.inf and np.nan data are each treated as special outliers.
    )infinitemissingc                     | |k    S Nr"   xr/   s     r&   <lambda>z#test_outlier_data.<locals>.<lambda>9   s
    a r(   c                 *    t          j        |           S r:   )npisnanr;   s     r&   r=   z#test_outlier_data.<locals>.<lambda>:   s     r(   r!   prob   r         r   N)r?   infnanr	   Xcopyr   fitlabels_nonzeror   probabilities_listrange)r4   outlier
prob_checkr!   rA   	X_outliermodelmissing_labels_idxmissing_probs_idxclean_indicesclean_models              r&   test_outlier_datarW   /   ss    F6  G
 ('++  J l+G4E\*62DIQ<IaLW%IaLIIMM)$$E"]e3<<>>)Aq6222&Ju';TBBKKMM(1a&111q!%%U1c]](;(;;M))--	- 899K{*EM-,HIIIIIr(   c                  ^   t          t                    } |                                 }t          dd                              |           }t          | |           t          |           d}t          j        t          |          5  t          dd                              t                     ddd           n# 1 swxY w Y   d}d| d	<   d
| d<   t          j        t          |          5  t          d                              |            ddd           dS # 1 swxY w Y   dS )zy
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    precomputedT)metricrH   z*The precomputed distance matrix.*has shapematchNz'The precomputed distance matrix.*valuesr   )r   rB   rB   )rB   r   rZ   )
r   rG   rH   r   fit_predictr   r3   pytestraises
ValueError)D
D_originalr0   msgs       r&   test_hdbscan_distance_matrixre   O   s   
 	AAJM555AA!DDFAz"""
7C	z	-	-	- @ @}4000<<Q???@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ 5CAdGAdG	z	-	-	- 5 5}%%%11!4445 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5s$   	*B??CC1$D""D&)D&sparse_constructorc                    t          j        t          j        t                              }|t	          j        |          z  }t          j        |                                d          }d|||k    <    | |          }|	                                 t          d                              |          }t          |           dS )zA
    Tests that HDBSCAN works with sparse distance matrices.
    2           rY   r]   N)r   
squareformpdistrG   r?   maxr   scoreatpercentileflatteneliminate_zerosr   r^   r3   )rf   rb   r1   r0   s       r&   #test_hdbscan_sparse_distance_matrixrp   g   s    
 	HN1--..ANA'		R88IAa9n1AM***66q99Fr(   c                  p    t                                          t                    } t          |            dS )z
    Tests that HDBSCAN works with feature array, including an arbitrary
    goodness of fit check. Note that the check is a simple heuristic.
    N)r   r^   rG   r3   r0   s    r&   test_hdbscan_feature_arrayrs   y   s1    
 YY""1%%F r(   algorZ   c                 l   t          |                               t                    }t          |           | dv rdS t          t
          d}dt          j        t          j        d                   idt          j	        t          j        d                   iddidt          j	        t          j        d                   d	d

                    |d          }t          | ||          }|||          j        vrNt          j        t                    5  |                    t                     ddd           dS # 1 swxY w Y   dS |dk    rNt          j        t"                    5  |                    t                     ddd           dS # 1 swxY w Y   dS |                    t                     dS )z
    Tests that HDBSCAN works with the expected combinations of algorithms and
    metrics, or raises the expected errors.
    )	algorithm)r   r   N)r   r   VrB   p   )rx   w)mahalanobis
seuclidean	minkowski
wminkowski)rv   rZ   metric_paramsr~   )r   r^   rG   r3   r   r   r?   eyeshapeonesgetvalid_metricsr_   r`   ra   rI   warnsFutureWarning)rt   rZ   r0   ALGOS_TREESr   hdbs         r&   test_hdbscan_algorithmsr      s%    t$$$0033F      K
 RVAGAJ//0BGAGAJ//01XBGAGAJ$7$788	 
 
c&$  #  C [&444]:&& 	 	GGAJJJ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 		<		\-(( 	 	GGAJJJ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	




s$   D99D= D=%FFFc                      t                                          t                    } |                     d          }t	          |d           dS )z
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This test is more of a sanity check than a rigorous evaluation.
    333333?gq=
ףp?)r1   N)r   rI   rG   dbscan_clusteringr3   )	clustererr0   s     r&   test_dbscan_clusteringr      sG    
 		a  I((--F $//////r(   cut_distance)皙?      ?rB   c                 0   t           d         d         }t           d         d         }t                                          }t          j        dg|d<   dt          j        g|d<   t          j        t          j        g|d<   t                                          |          }|                    |           }t          j	        ||k              }t          |ddg           t          j	        ||k              }t          |dg           t          t          t          d	                    t          ||z             z
            }t                                          ||                   }	|	                    |           }
t          |
||                    d
S )r6   r8   r!   r7   rB   r   ry   rC   )r   r   N)r	   rG   rH   r?   rE   rF   r   rI   r   flatnonzeror   rM   r-   rN   )r   missing_labelinfinite_labelrQ   rR   r0   rS   infinite_labels_idx	clean_idxrV   clean_labelss              r&   #test_dbscan_clustering_outlier_datar      sT   
 &i09M&z27;NIFA;IaLrv;IaLFBF#IaLIIMM)$$E$$,$??F-(?@@)Aq6222.>)ABB*QC000Ss__s+=@S+S'T'TTUUI))--	) 455K00l0KKL|VI%677777r(   c                      t          ddt          j        t          j        d                   i                              t                    } t          |            dS )z4
    Tests that HDBSCAN using `BallTree` works.
    r|   rw   rB   )rZ   r   N)r   r?   r   rG   r   r^   r3   rr   s    r&   !test_hdbscan_best_balltree_metricr      sV     C1D1D+E  k!nn  r(   c                      t          t          t                    dz
                                t                    } t	          |                               t                    sJ dS )z
    Tests that HDBSCAN correctly does not generate a valid cluster when the
    `min_cluster_size` is too large for the data.
    rB   min_cluster_sizeN)r   r,   rG   r^   r-   issubsetr.   rr   s    r&   test_hdbscan_no_clustersr      sR    
 c!ffqj111==a@@Fv;;,,,,,,,r(   c                  8   t          dt          t                    d          D ]u} t          |                               t                    }d |D             }t          |          dk    r,t          j        t          j        |                    | k    sJ vdS )zb
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points
    ry   rB   r   c                     g | ]
}|d k    |S )r   r"   )r#   r!   s     r&   
<listcomp>z1test_hdbscan_min_cluster_size.<locals>.<listcomp>   s    @@@ERKKuKKKr(   r   N)rN   r,   rG   r   r^   r?   minbincount)r   r0   true_labelss      r&   test_hdbscan_min_cluster_sizer      s    
 "!SVVQ// H H*:;;;GGJJ@@&@@@{q  6"+k22337GGGGG	H Hr(   c                      t           j        } t          |                               t                    }t          |           dS )zA
    Tests that HDBSCAN works when passed a callable metric.
    r]   N)r   	euclideanr   r^   rG   r3   )rZ   r0   s     r&   test_hdbscan_callable_metricr      s>     FF###//22Fr(   treer   r   c                     t          d|           }d}t          j        t          |          5  |                    t
                     ddd           dS # 1 swxY w Y   dS )z
    Tests that HDBSCAN correctly raises an error when passing precomputed data
    while requesting a tree-based algorithm.
    rY   rZ   rv   z%precomputed is not a valid metric forr[   N)r   r_   r`   ra   rI   rG   )r   r   rd   s      r&   "test_hdbscan_precomputed_non_bruter      s     $
7
7
7C
1C	z	-	-	-  


                 s   AAAcsr_containerc                    t                                          t                    j        }t	          |            | t                    }|                                }t                                          |          j        }t          ||           t          j        dft          j	        dffD ]\  }}t                                          }||d<   t                                          |          j        }t	          |           |d         t          |         d         k    sJ |                                }||d<   t                                          |          j        }t          ||           d}t          j        t          |          5  t          dd	
                              |           ddd           dS # 1 swxY w Y   dS )z
    Tests that HDBSCAN works correctly when passing sparse feature data.
    Evaluates correctness by comparing against the same data passed as a dense
    array.
    r7   r8   r   r   r   r!   z4Sparse data matrices only support algorithm `brute`.r[   r   r   r   N)r   rI   rG   rJ   r3   rH   r   r?   rE   rF   r	   r_   r`   ra   )	r   dense_labels	_X_sparseX_sparsesparse_labelsoutlier_valr4   X_denserd   s	            r&   test_hdbscan_sparser   
  s    99==##+L%%%a  I~~HIIMM(++3M|]333 (*vz&:RVY<O%P 
8 
8!\&&((#yy}}W--5L)))A"3L"A'"JJJJJ>>##$		h//7<7777
@C	z	-	-	- I I{k:::>>xHHHI I I I I I I I I I I I I I I I I Is   %GGGrv   c                    ddg}t          dd|d          \  }}t          d                              |          }t          ||j        |j                  D ],\  }}}t          ||d	d
           t          ||d	d
           -t          | dt          j        d                                       t                    }|j        j        d         dk    sJ |j        j        d         dk    sJ dS )zj
    Tests that HDBSCAN centers are calculated and stored properly, and are
    accurate to the data.
    )ri   ri   )      @r   i  r   r   )r   r   centerscluster_stdboth)store_centersrB   g?)rtolatol)rv   r   r   N)	r
   r   rI   zip
centroids_medoids_r   rG   r   )rv   r   Hr$   r   centercentroidmedoids           r&   test_hdbscan_centersr   -  s    :&G1gSVWWWDAq

'
'
'
+
+A
.
.C$'$N$N ; ; &qt<<<<QT::::: 6AGAJ  	c!ff  >"a''''<a A%%%%%%r(   c                  
   t           j                            d          } |                     dd          }t	          dddd                              |          }t          j        |d	          \  }}t          |          dk    sJ ||d
k             dk    sJ t	          ddddd                              |          }t          j        |d	          \  }}t          |          dk    sJ ||d
k             dk    sJ dS )zS
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    r      ry   rC   ri   eomT)r   cluster_selection_epsiloncluster_selection_methodallow_single_cluster)return_countsr      g
ףp=
?r   )r   r   r   r   rv   N)r?   randomRandomStaterandr   r^   uniquer,   )rngno_structurer0   unique_labelscountss        r&   .test_hdbscan_allow_single_cluster_with_epsilonr   C  s;    )


"
"C88C##L"%!&!	  
 k,  IfDAAAM6}"""" -2%&++++ "&!&!   k,  IfDAAAM6}""""-2%&!++++++r(   c                  2   ddgddgddgddgg} t          d| g dd          \  }}t                                          |          j        }t	          t          |                    t          d	|v           z
  }|d
k    sJ t          ||          dk     dS )z
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
    example)
    g333333g333333?r+   i  )皙?gffffff?皙?r   r   )r   r   r   r   r      r)   N)r
   r   rI   rJ   r,   r-   intr   )r   rG   r/   r0   r2   s        r&   test_hdbscan_better_than_dbscanr   d  s     u~t}q!fq"g>G+++	  DAq YY]]1%FS[[!!Cf$5$55J????&!$$t++++r(   z	kwargs, XrY   rB   ry   r+   r   c                 H    t          dddi|                    |            dS )zo
    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
    with non-finite points.
    min_samplesrB   Nr"   )r   rI   )rG   kwargss     r&   test_hdbscan_usable_inputsr   x  s1     $$$V$$((+++++r(   c                      | t          j        d                    }d}t          j        t          |          5  t          d                              |           ddd           dS # 1 swxY w Y   dS )zd
    Tests that HDBSCAN raises the correct error when there are too few
    non-zero distances.
    )r   r   z#There exists points with fewer thanr[   rY   r]   N)r?   zerosr_   r`   ra   r   rI   r   rG   rd   s      r&   -test_hdbscan_sparse_distances_too_few_nonzeror     s     	bhx(())A
/C	z	-	-	- - -}%%%))!,,,- - - - - - - - - - - - - - - - - -s   $A,,A03A0c                 6   t          j        d          }d|ddddf<   d|ddddf<   ||j        z   } | |          }d}t          j        t
          |          5  t          d	                              |           ddd           dS # 1 swxY w Y   dS )
zu
    Tests that HDBSCAN raises the correct error when the distance matrix
    has multiple connected components.
    )   r   rB   NrC      z2HDBSCAN cannot be perfomed on a disconnected graphr[   rY   r]   )r?   r   Tr_   r`   ra   r   rI   r   s      r&   0test_hdbscan_sparse_distances_disconnected_graphr     s     	AAbqb"1"fIAabb"##gJ	ACAaA
>C	z	-	-	- - -}%%%))!,,,- - - - - - - - - - - - - - - - - -s   $BBBc                     d } d}t          j        t          |          5  t          d|                               t
                     ddd           n# 1 swxY w Y   t          j        t          |          5  t          d|                               t
                     ddd           n# 1 swxY w Y   t          t          t          j	                  t          t          j	                  z
            }t          |          dk    ret          j        t          |          5  t          d|d                                       t
                     ddd           dS # 1 swxY w Y   dS dS )	zR
    Tests that HDBSCAN correctly raises an error for invalid metric choices.
    c                     | S r:   r"   )r<   s    r&   r=   z2test_hdbscan_tree_invalid_metric.<locals>.<lambda>  s     r(   zV.* is not a valid metric for a .*-based algorithm\. Please select a different metric\.r[   r   )rv   rZ   Nr   r   )r_   r`   ra   r   rI   rG   rM   r-   r   r   r   r,   )metric_callablerd   metrics_not_kds      r&    test_hdbscan_tree_invalid_metricr     sG    "kO	  
z	-	-	- D D)O<<<@@CCCD D D D D D D D D D D D D D D	z	-	-	- F F+o>>>BB1EEEF F F F F F F F F F F F F F F
 #h455F<P8Q8QQRRN
>Q]:S111 	J 	Jiq0ABBBFFqIII	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J s5   *AAA=*B33B7:B7,0E))E-0E-c                      t          t          t                    dz             } d}t          j        t
          |          5  |                     t                     ddd           dS # 1 swxY w Y   dS )zx
    Tests that HDBSCAN correctly raises an error when setting `min_samples`
    larger than the number of samples.
    rB   )r   z min_samples (.*) must be at mostr[   N)r   r,   rG   r_   r`   ra   rI   )r   rd   s     r&   !test_hdbscan_too_many_min_samplesr     s    
 c!ffqj
)
)
)C
-C	z	-	-	-  


                 s   A++A/2A/c                     t                                           } t          j        | d<   d}t	          d          }t          j        t          |          5  |                    |            ddd           dS # 1 swxY w Y   dS )zu
    Tests that HDBSCAN correctly raises an error when providing precomputed
    distances with `np.nan` values.
    r   z(np.nan values found in precomputed-denserY   r]   r[   N)	rG   rH   r?   rF   r   r_   r`   ra   rI   )X_nanrd   r   s      r&   "test_hdbscan_precomputed_dense_nanr     s    
 FFHHE&E$K
4C

'
'
'C	z	-	-	-                   s   A99A= A=r   TFepsilonr   c                 0   d}t          || ddgddgddgg          \  }t                                          |          }t          |j        |j                  }|dz   |dz   |dz   h}|dz   d|dz   d	|dz   di}t          |||||
          fdt          t                              D             fdt          t                              D             }	 t          j
        |	j                            }
t          |
           dS )zR
    Tests that the `_do_labelling` helper function correctly assigns labels.
    0   r   r   )r   r   r   ry   r+   r   rB   condensed_treeclusterscluster_label_mapr   r   c                 Z    i | ]'}|t          j        |k              d          d          (S )r   )r?   where)r#   _yr/   s     r&   
<dictcomp>z+test_labelling_distinct.<locals>.<dictcomp>  s3    KKKBHQ"W--a03KKKr(   c                 .    i | ]}||                  S r"   r"   )r#   r  first_with_labelr0   s     r&   r  z+test_labelling_distinct.<locals>.<dictcomp>  s&    KKK2v.r23KKKr(   N)r
   r   rI   r   _single_linkage_tree_r   r   rM   r-   r?   	vectorizer   r   )global_random_seedr   r   r   rG   estr   r   r   y_to_labelsaligned_targetr  r0   r/   s              @@@r&   test_labelling_distinctr    sZ    I' FGG
		 	 	DAq ))--

C#!C4H  N Ay1}i!m<H"Q9q=!Y]AN%+1")  F LKKKd3q66llKKKKKKKKd3q66llKKKK2R\+/22155Nv~.....r(   c                     d} d}t          j        dd|dfddd|dfddgt          	          }t          || h| d| dz   did
d          }|d         dk     }t	          |          t	          |dk              k    sJ t          || h| d| dz   did
d          }|d         |k     }t	          |          t	          |dk              k    sJ dS )z
    Tests that the `_do_labelling` helper function correctly thresholds the
    incoming lambda values given various `cluster_selection_epsilon` values.
    rC   g      ?ry   rB   )rC   rB   r   rB   r   )rC   r+   r   rB   )rC   r   r   rB   )dtypeTr   valuer   N)r?   arrayr   r   sum)r   
MAX_LAMBDAr   r0   	num_noises        r&   test_labelling_thresholdingr    s#   
 IJX:q!:q!	
 	 	 	N %$aQ:!"#  F w'!+Iy>>S2......%$aQ:!"#  F w'*4Iy>>S2........r(   r   r   r   c                 @   t           j                            d          }|                    d          }t          |          }d}t	          j        t          |          5  t          d|                               |           ddd           dS # 1 swxY w Y   dS )zCheck that we raise an error if the centers are requested together with
    a precomputed input matrix.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27893
    r   )d   ry   z>Cannot store centers when using a precomputed distance matrix.r[   rY   )rZ   r   N)	r?   r   r   r   r_   r`   ra   r   rI   )r   r   rG   X_disterr_msgs        r&   0test_hdbscan_error_precomputed_and_store_centersr  %  s     )


"
"C

8A ##FNG	z	1	1	1 O O}MBBBFFvNNNO O O O O O O O O O O O O O O O O Os   !%BBB
valid_algor   r   c                 X    t          d|                               t                     dS )zTest that HDBSCAN works with the "cosine" metric when the algorithm is set
    to "brute" or "auto".

    Non-regression test for issue #28631
    cosiner   N)r   r^   rG   )r  s    r&   *test_hdbscan_cosine_metric_valid_algorithmr  5  s+     8z222>>qAAAAAr(   invalid_algoc                     t          d|           }t          j        t          d          5  |                    t
                     ddd           dS # 1 swxY w Y   dS )zTest that HDBSCAN raises an informative error is raised when an unsupported
    algorithm is used with the "cosine" metric.
    r  r   zcosine is not a valid metricr[   N)r   r_   r`   ra   r^   rG   )r  hdbscans     r&   ,test_hdbscan_cosine_metric_invalid_algorithmr   ?  s    
 X>>>G	z)G	H	H	H  A                 s   AAA)r)   )J__doc__numpyr?   r_   scipyr   scipy.spatialr   sklearn.clusterr   sklearn.cluster._hdbscan._treer   r   r    sklearn.cluster._hdbscan.hdbscanr	   sklearn.datasetsr
   sklearn.metricsr   sklearn.metrics.pairwiser   r   sklearn.neighborsr   r   sklearn.preprocessingr   sklearn.utilsr   sklearn.utils._testingr   r   sklearn.utils.fixesr   r   rG   r/   fit_transform
ALGORITHMSitemsr.   r3   markparametrizerW   re   rp   rs   r   r   r   r   r   r   r   r   r   r   r   r   r  rE   r   r   r   r   r   r   r  r  r  r  r   r"   r(   r&   <module>r5     s   
            " " " " " " # # # # # #         
 ? > > > > > ' ' ' ' ' ' 1 1 1 1 1 1 H H H H H H H H . . . . . . . . 0 0 0 0 0 0 ! ! ! ! ! ! F F F F F F F F > > > > > > > >zCb1111wq!!$$$1N""1%%  
 dKK1H1B1H1J1JKKKK8 8 8 8 ):;;J J <;J>5 5 50 -/Q/Q./QRR    SR "	  	  	  ,,>22$ $ 32 -,$N
0 
0 
0 778 8 8784     - - -	H 	H 	H      )[!9::  ;: .99I I :9ID j11& & 21&*, , ,B, , ,( 
M	"HBHq"&kBFA;-G$H$HI
M	"aVaV$45	q!fq!f , , , .99	- 	- :9	- .99- - :9- J J J0  
 
 
 /$??QH--!/ !/ .- @?!/H&/ &/ &/R :x*@AAO O BAO '899B B :9B )[)ABB  CB  r(   