
    Q/Ph                         d dl mZmZmZmZmZ d dlmc mZ	 d dl
mZ d dlmZ d dlmZ ej        dededed	eeeef         ee         f         fd
            ZdS )    )AnyDictOptionalSequenceTupleN)	DataFrame)Settings)get_duplicatesconfigdfsupported_columnsreturnc                 $   | j         j        }i }|dk    r|dfS |r|                                dk    rd|d<   d|d<   |dfS | j         j        }||j        v rt          d| d          |                    |j                                      t          j        d          	                    |                    
                    |t          j        |                              d	                                        t          j        |          d
k              }|                                |d<   |d         |                                z  |d<   ||                    |d                              |                                          fS )a  Obtain the most occurring duplicate rows in the DataFrame.

    Args:
        config: report Settings object
        df: the Pandas DataFrame.
        supported_columns: the columns to consider

    Returns:
        A subset of the DataFrame, ordered by occurrence.
    r   Nn_duplicatesg        p_duplicateszDuplicates key (z}) may not be part of the DataFrame. Either change the  column name in the DataFrame or change the 'duplicates.key' parameter.*int   F)	ascending)
duplicatesheadcountkeycolumns
ValueErrorgroupByaggFalias
withColumncolcastfilterorderBylimittoPandas)r   r   r   n_headmetricsduplicates_keyduplicated_dfs          l/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/ydata_profiling/model/spark/duplicates_spark.pyget_duplicates_sparkr,   
   s    #F G{{} 

a"#"%}&*N##W~ W W W
 
 	
 	

2:	QWS\\//	0	0	NAE.$9$9$>$>u$E$E	F	F	n%%)	*	*	  ,1133GN%n5

BGNn>>DDVLLUUWW     )typingr   r   r   r   r   pyspark.sql.functionssql	functionsr   pyspark.sqlr   ydata_profiling.configr	    ydata_profiling.model.duplicatesr
   registerstrr,    r-   r+   <module>r8      s    7 7 7 7 7 7 7 7 7 7 7 7 7 7 ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! + + + + + + ; ; ; ; ; ; ++#+8@+
4S>8I../+ + + + + +r-   