
    Q/Ph                         d dl mZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZmZ  G d d          Zde	d	ed
efdZde	d	ed
efdZde	d	ed
efdZdS )    )AnyListOptionalN)	DataFrame)Settings)plot_missing_barplot_missing_heatmapplot_missing_matrixc                   f    e Zd ZdZ	 ddedee         defdZde	fdZ
defd	Zdee         fd
ZdS )MissingnoBarSparkPatcha:  
    Technical Debt :
    This is a monkey patching object that allows usage of the library missingno as is for spark dataframes.
    This is because missingno library's bar function always applies a isnull().sum() on dataframes in the visualisation
    function, instead of allowing just values counts as an entry point. Thus, in order to calculate the
    missing values dataframe in spark, we compute it first, then wrap it in this MissingnoBarSparkPatch object which
    will be unwrapped by missingno and return the pre-computed value counts.
    The best fix to this currently terrible patch is to submit a PR to missingno to separate preprocessing function
    (compute value counts from df) and visualisation functions such that we can call the visualisation directly.
    Unfortunately, the missingno library people have not really responded to our issues on gitlab.
    See https://github.com/ResidentMario/missingno/issues/119.
    We could also fork the missingno library and implement some of the code in our database, but that feels
    like bad practice as well.
    Ndfcolumnsoriginal_df_sizec                 0    || _         || _        || _        d S )N)r   r   r   )selfr   r   r   s       i/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/ydata_profiling/model/spark/missing_spark.py__init__zMissingnoBarSparkPatch.__init__   s       0    returnc                     | S )zW
        This patches the .isnull().sum() function called by missingno library
         r   s    r   isnullzMissingnoBarSparkPatch.isnull%   s	     r   c                     | j         S )zN
        This patches the .sum() function called by missingno library
        )r   r   s    r   sumzMissingnoBarSparkPatch.sum+   s     wr   c                     | j         S )zO
        This patches the len(df) function called by missingno library
        )r   r   s    r   __len__zMissingnoBarSparkPatch.__len__1   s     $$r   )NN)__name__
__module____qualname____doc__r   r   strintr   r   r   r   r   r   r   r   r   r   r      s           QU1 11&*3i1JM1 1 1 1    Y    %# % % % % % %r   r   configr   r   c                     dd l mc m  |j        fd|j        D                                                                  d          }t          | ||j        |                                          S )Nr   c           
          g | ]g}                                                             |                              |          z  |                                        |          hS r   )countwhenr   isnanalias).0cFs     r   
<listcomp>zmissing_bar.<locals>.<listcomp>>   sZ    XXXaggaffQXXa[[1771::5q99::@@CCXXXr   indexaxis)notnull_countsr   nrows)	pyspark.sql.functionssql	functionsaggr   toPandassqueezer   r'   )r$   r   data_nan_countsr-   s      @r   missing_barr;   8   s    %%%%%%%%% 	XXXXRZXXX	
 
	g		  
"((**   r   c                     t          ||j        |                                          }t          | |j        |                                j        t          |                    S )Nr   r   )r   notnullr3   )r   r   r'   r
   r>   valueslen)r$   r   s     r   missing_matrixrA   I   sV    	BJ	T	T	TB


#"gg	   r   c                    t          ||j        |                                          }d t          t	          j        |                                d                    D             }|j        d d |f         }|                                                                }t	          j	        |          }d|t	          j
        |          <   t          | ||t          |j                            S )Nr=   c                 $    g | ]\  }}|d k    |S )r   r   )r+   ins      r   r.   z#missing_heatmap.<locals>.<listcomp>W   s"    RRRTQAPQEEqEEEr   rowsr0   T)corr_matmaskr   )r   r   r'   	enumeratenpvarr   iloccorr
zeros_liketriu_indices_fromr	   list)r$   r   r   rG   rH   s        r   missing_heatmaprQ   S   s    	BJ	T	T	TB SRYrvbiikk'G'G'GHHRRRG	G	B yy{{!!H=""D'+D	d	#	#$d2:6F6F   r   )typingr   r   r   numpyrJ   pyspark.sqlr   ydata_profiling.configr   %ydata_profiling.visualisation.missingr   r	   r
   r   r"   r;   rA   rQ   r   r   r   <module>rW      s3   & & & & & & & & & &     ! ! ! ! ! ! + + + + + +         '% '% '% '% '% '% '% '%T i C    "8  s    H )       r   