
    MhN                        d dl mZ d dlmZ d dlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d          Z G d	 d
e          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z% G d de          Z& ed          d-d"            Z' ed#          	 d.d/d+            Z(d,S )0    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangeis_cjk_uncommonc                  B    e Zd ZdZddZddZdd	Zedd            ZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterstrreturnboolc                    t           )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     U/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible'   
     "!    Nonec                    t           )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r#   r%   s     r'   feedzMessDetectorPlugin.feed-   s
    
 "!r*   c                    t           )zB
        Permit to reset the plugin to the initial state.
        r#   r&   s    r'   resetzMessDetectorPlugin.reset4   r)   r*   floatc                    t           )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r#   r/   s    r'   ratiozMessDetectorPlugin.ratio:   s
     "!r*   Nr   r   r    r!   r   r   r    r+   r    r+   r    r1   )	__name__
__module____qualname____doc__r(   r-   r0   propertyr3    r*   r'   r   r   !   sz         
" " " "" " " "" " " " " " " X" " "r*   r   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS ) TooManySymbolOrPunctuationPluginr    r+   c                L    d| _         d| _        d| _        d | _        d| _        d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr/   s    r'   __init__z)TooManySymbolOrPunctuationPlugin.__init__D   s0    '("#%&04!,1###r*   r   r   r!   c                *    |                                 S Nisprintabler%   s     r'   r(   z)TooManySymbolOrPunctuationPlugin.eligibleL       $$&&&r*   c                (   | xj         dz  c_         || j        k    ro|t          vrft          |          r| xj        dz  c_        nF|                                du r0t          |          r!t          |          du r| xj        dz  c_        || _        d S )Nr   F   )	rC   rD   r   r   rA   isdigitr   r   rB   r%   s     r'   r-   z%TooManySymbolOrPunctuationPlugin.feedO   s    " 222!===i(( (''1,'''!!##u,,i(( -	**e33""a'""$-!!!r*   c                0    d| _         d| _        d| _        d S Nr   )rA   rC   rB   r/   s    r'   r0   z&TooManySymbolOrPunctuationPlugin.reseta   s     "# !r*   r1   c                ^    | j         dk    rdS | j        | j        z   | j         z  }|dk    r|ndS )Nr           333333?)rC   rA   rB   )r&   ratio_of_punctuations     r'   r3   z&TooManySymbolOrPunctuationPlugin.ratiof   sK     A%%3 #d&88!'" (<s'B'B##Kr*   Nr6   r4   r5   r7   	r8   r9   r:   rF   r(   r-   r0   r<   r3   r=   r*   r'   r?   r?   C   s        2 2 2 2' ' ' '. . . .$   
 L L L XL L Lr*   r?   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )TooManyAccentuatedPluginr    r+   c                "    d| _         d| _        d S rP   rC   _accentuated_countr/   s    r'   rF   z!TooManyAccentuatedPlugin.__init__s   s    %&'(r*   r   r   r!   c                *    |                                 S rH   )isalphar%   s     r'   r(   z!TooManyAccentuatedPlugin.eligiblew   s      """r*   c                h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S Nr   )rC   r
   rZ   r%   s     r'   r-   zTooManyAccentuatedPlugin.feedz   sJ    ")$$ 	)##q(####	) 	)r*   c                "    d| _         d| _        d S rP   rY   r/   s    r'   r0   zTooManyAccentuatedPlugin.reset   s     !"#r*   r1   c                N    | j         dk     rdS | j        | j         z  }|dk    r|ndS )N   rR   gffffff?rY   )r&   ratio_of_accentuations     r'   r3   zTooManyAccentuatedPlugin.ratio   s<     1$$3'+'>AV'V(=(E(E$$3Nr*   Nr6   r4   r5   r7   rU   r=   r*   r'   rW   rW   r   s        ) ) ) )# # # #) ) ) )$ $ $ $ O O O XO O Or*   rW   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )UnprintablePluginr    r+   c                "    d| _         d| _        d S rP   )_unprintable_countrC   r/   s    r'   rF   zUnprintablePlugin.__init__   s    '(%&r*   r   r   r!   c                    dS NTr=   r%   s     r'   r(   zUnprintablePlugin.eligible       tr*   c                d    t          |          r| xj        dz  c_        | xj        dz  c_        d S r^   )r   rf   rC   r%   s     r'   r-   zUnprintablePlugin.feed   s@    )$$ 	)##q(##"r*   c                    d| _         d S rP   )rf   r/   s    r'   r0   zUnprintablePlugin.reset   s    "#r*   r1   c                @    | j         dk    rdS | j        dz  | j         z  S )Nr   rR   ra   )rC   rf   r/   s    r'   r3   zUnprintablePlugin.ratio   s+     A%%3'!+t/DDDr*   Nr6   r4   r5   r7   rU   r=   r*   r'   rd   rd      s        ' ' ' '   # # # #
$ $ $ $ E E E XE E Er*   rd   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )SuspiciousDuplicateAccentPluginr    r+   c                0    d| _         d| _        d | _        d S rP   _successive_countrC   _last_latin_characterr/   s    r'   rF   z(SuspiciousDuplicateAccentPlugin.__init__   s     &'%&15"""r*   r   r   r!   c                H    |                                 ot          |          S rH   )r\   r   r%   s     r'   r(   z(SuspiciousDuplicateAccentPlugin.eligible   s!      "":x	':'::r*   c                l   | xj         dz  c_         | j        t          |          rt          | j                  rr|                                r)| j                                        r| xj        dz  c_        t          |          t          | j                  k    r| xj        dz  c_        || _        d S r^   )rC   rr   r
   isupperrq   r   r%   s     r'   r-   z$SuspiciousDuplicateAccentPlugin.feed   s    "&2y)) 3t9:: 3   "" ,t'A'I'I'K'K ,&&!+&&Y''=9S+T+TTT&&!+&&%."""r*   c                0    d| _         d| _        d | _        d S rP   rp   r/   s    r'   r0   z%SuspiciousDuplicateAccentPlugin.reset   s     !" !%)"""r*   r1   c                @    | j         dk    rdS | j        dz  | j         z  S )Nr   rR   rM   )rC   rq   r/   s    r'   r3   z%SuspiciousDuplicateAccentPlugin.ratio   s+     A%%3&*d.CCCr*   Nr6   r4   r5   r7   rU   r=   r*   r'   rn   rn      s        6 6 6 6; ; ; ;/ / / /* * * *
 D D D XD D Dr*   rn   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )SuspiciousRanger    r+   c                0    d| _         d| _        d | _        d S rP   )"_suspicious_successive_range_countrC   _last_printable_seenr/   s    r'   rF   zSuspiciousRange.__init__   s     78/%&04!!!r*   r   r   r!   c                *    |                                 S rH   rI   r%   s     r'   r(   zSuspiciousRange.eligible   rK   r*   c                D   | xj         dz  c_         |                                st          |          s	|t          v r	d | _        d S | j        	|| _        d S t          | j                  }t          |          }t          ||          r| xj        dz  c_        || _        d S r^   )rC   isspacer   r   r|   r    is_suspiciously_successive_ranger{   )r&   r   unicode_range_aunicode_range_bs       r'   r-   zSuspiciousRange.feed   s    " 	i((	 888(,D%F$,(1D%F&3D4M&N&N&3I&>&>+O_MM 	933q833$-!!!r*   c                0    d| _         d| _        d | _        d S rP   )rC   r{   r|   r/   s    r'   r0   zSuspiciousRange.reset   s      !23/$(!!!r*   r1   c                D    | j         dk    rdS | j        dz  | j         z  }|S )N   rR   rM   )rC   r{   )r&   ratio_of_suspicious_range_usages     r'   r3   zSuspiciousRange.ratio   s8     B&&3 3a7!2"' /.r*   Nr6   r4   r5   r7   rU   r=   r*   r'   ry   ry      s        5 5 5 5
' ' ' '. . . ..) ) ) )
 / / / X/ / /r*   ry   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )SuperWeirdWordPluginr    r+   c                    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _	        d S )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrC   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr/   s    r'   rF   zSuperWeirdWordPlugin.__init__   sW     !$%() */!). %&)*!)*!()   r*   r   r   r!   c                    dS rh   r=   r%   s     r'   r(   zSuperWeirdWordPlugin.eligible  ri   r*   c                   |                                 r| xj        |z  c_        t          |          r| xj        dz  c_        | j        du r|t          |          du st          |          r\t          |          du rKt          |          du r:t          |          du r)t          |          du rt          |          du rd| _        t          |          s<t          |          s-t          |          st          |          st          |          r| xj        dz  c_        d S | j        sd S |                                st          |          st          |          r| j        r| xj        dz  c_        t!          | j                  }| xj        |z  c_        |dk    r| j        |z  dk    rd| _        nt          | j        d                   rW| j        d                                         r8t)          d | j        D                       du r| xj        dz  c_        d| _        n"| j        dk    rd| _        | xj        dz  c_        |dk    ri| j        rbd	 t-          | j        t/          d
|                    D             }d}|rt!          |          |z  dk    rd}|s| xj        dz  c_        d| _        | j        r9| xj        dz  c_        | xj        t!          | j                  z  c_        d| _        d| _        d| _        d
| _        d
| _        d S |dvr>|                                du r*t7          |          rd| _        | xj        |z  c_        d S d S d S d S )Nr   FT         ?c              3  >   K   | ]}|                                 V  d S rH   ru   ).0_s     r'   	<genexpr>z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>8  s*      >>AAIIKK>>>>>>r*      c                @    g | ]\  }}|                                 |S r=   r   )r   cis      r'   
<listcomp>z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>@  s:     " " "1yy{{"" " "r*   r   rS   r   >   r   -<=>|~)r\   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   lenrC   r   ru   allr   zipranger   r   rN   r   )r&   r   buffer_lengthcamel_case_dstprobable_camel_caseds        r'   r-   zSuperWeirdWordPlugin.feed  s
    	LLI%LLi(( /))Q.))(E11i((E11^I5N5N19%%..i((E11	**e33	**e33I&&%//+/(y!!.Y''. y)). y))	.
 9%%. ((A-((F| 	F5	&#1)#<#<5	&@LY@W@W5	&l5	& !!$T\!2!2M!!]2!!!!,}<CC04D-- #4<#344	2R(0022	2 >>>>>>>%GG,,1,,04D---2204D-,,1,,""t'?"" " #DL%=2I2I J J" " "
 .3$! 0s>':':]'Jc'Q'Q+/(+ 5,,1,,04D-( 2$$)$$))S->->>)),1)',D$DL()D%'(D$$$@@@!!##u,,)$$ - )-D%LLI%LLLL A@,,,,r*   c                v    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   Fr   )r   r   r   r   r   rC   r   r   r/   s    r'   r0   zSuperWeirdWordPlugin.reset_  sG    $)!#(   !$%!#$   r*   r1   c                P    | j         dk    r| j        dk    rdS | j        | j        z  S )N
   r   rR   )r   r   r   rC   r/   s    r'   r3   zSuperWeirdWordPlugin.ratioi  s3    r!!d&>!&C&C3(4+@@@r*   Nr6   r4   r5   r7   rU   r=   r*   r'   r   r      s        * * * *   O& O& O& O&b% % % % A A A XA A Ar*   r   c                  J    e Zd ZdZddZddZdd	Zdd
Zedd            Z	dS )CjkUncommonPluginz<
    Detect messy CJK text that probably means nothing.
    r    r+   c                "    d| _         d| _        d S rP   rC   _uncommon_countr/   s    r'   rF   zCjkUncommonPlugin.__init__v  s    %&$%r*   r   r   r!   c                     t          |          S rH   )r   r%   s     r'   r(   zCjkUncommonPlugin.eligiblez  s    i   r*   c                h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S r^   )rC   r   r   r%   s     r'   r-   zCjkUncommonPlugin.feed}  sI    "9%% 	  A%  F	 	r*   c                "    d| _         d| _        d S rP   r   r/   s    r'   r0   zCjkUncommonPlugin.reset  s     ! r*   r1   c                T    | j         dk     rdS | j        | j         z  }|dk    r|dz  ndS )Nra   rR   r   r   r   )r&   uncommon_form_usages     r'   r3   zCjkUncommonPlugin.ratio  sC     1$$3%)%9D<Q%Q ,?+D+D"R''#Mr*   Nr6   r4   r5   r7   )
r8   r9   r:   r;   rF   r(   r-   r0   r<   r3   r=   r*   r'   r   r   q  s         & & & &! ! ! !   ! ! ! ! N N N XN N Nr*   r   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )ArchaicUpperLowerPluginr    r+   c                h    d| _         d| _        d| _        d| _        d| _        d | _        d| _        d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrC   _last_alpha_seen_current_ascii_onlyr/   s    r'   rF   z ArchaicUpperLowerPlugin.__init__  s?    	45,23*890%&,0)-   r*   r   r   r!   c                    dS rh   r=   r%   s     r'   r(   z ArchaicUpperLowerPlugin.eligible  ri   r*   c                   |                                 ot          |          }|du }|r| j        dk    rt| j        dk    r4|                                du r| j        du r| xj        | j        z  c_        d| _        d| _        d | _        d| _        | xj	        dz  c_	        d| _        d S | j        du r|
                                du rd| _        | j        |                                r| j                                        s-|                                rB| j                                        r)| j        du r| xj        dz  c_        d| _        nd| _        nd| _        | xj	        dz  c_	        | xj        dz  c_        || _        d S )NFr   @   r   TrM   )r\   r   r   rN   r   r   r   r   r   rC   isasciiru   islower)r&   r   is_concerned	chunk_seps       r'   r-   zArchaicUpperLowerPlugin.feed  s    ((**J/?	/J/J E)	 	=AA4::%%''500,5588688 23D.34D0$(D!DI!!Q&!!'+D$F#t++	0A0A0C0Cu0L0L',D$ ,!!## 	"(=(E(E(G(G 	"!!##	"(,(=(E(E(G(G	" 9$$66!;66 %DII $DII!	",,1,, )r*   c                h    d| _         d| _        d| _        d| _        d | _        d| _        d| _        d S )Nr   FT)rC   r   r   r   r   r   r   r/   s    r'   r0   zArchaicUpperLowerPlugin.reset  s?     !/0,-.*340 $	#'   r*   r1   c                :    | j         dk    rdS | j        | j         z  S )Nr   rR   )rC   r   r/   s    r'   r3   zArchaicUpperLowerPlugin.ratio  s&     A%%37$:OOOr*   Nr6   r4   r5   r7   rU   r=   r*   r'   r   r     s        . . . .   (* (* (* (*T( ( ( ( P P P XP P Pr*   r   c                  F    e Zd ZddZddZddZdd	Zedd            ZdS )ArabicIsolatedFormPluginr    r+   c                "    d| _         d| _        d S rP   rC   _isolated_form_countr/   s    r'   rF   z!ArabicIsolatedFormPlugin.__init__  s    %&)*!!!r*   c                "    d| _         d| _        d S rP   r   r/   s    r'   r0   zArabicIsolatedFormPlugin.reset  s     !$%!!!r*   r   r   r!   c                     t          |          S rH   )r   r%   s     r'   r(   z!ArabicIsolatedFormPlugin.eligible  s    ###r*   c                h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S r^   )rC   r   r   r%   s     r'   r-   zArabicIsolatedFormPlugin.feed  sJ    ""9-- 	+%%*%%%%	+ 	+r*   r1   c                >    | j         dk     rdS | j        | j         z  }|S )Nra   rR   r   )r&   isolated_form_usages     r'   r3   zArabicIsolatedFormPlugin.ratio  s,     1$$3%)%>AV%V""r*   Nr6   r4   r5   r7   )	r8   r9   r:   rF   r0   r(   r-   r<   r3   r=   r*   r'   r   r     s        + + + +& & & &$ $ $ $+ + + + # # # X# # #r*   r      )maxsizer   
str | Noner   r    r!   c                   | |dS | |k    rdS d| v rd|v rdS d| v sd|v rdS d| v sd|v r
d| v sd|v rdS |                      d          |                     d          }}|D ]}|t          v r||v r dS | dv |dv }}|s|r
d	| v sd	|v rdS |r|rdS d
| v sd
|v rd	| v sd	|v rdS | dk    s|dk    rdS d	| v sd	|v s| dv r&|dv r"d| v sd|v rdS d| v sd|v rdS | dk    s|dk    rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr	   )r   r   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r'   r   r     s    /"9t/))u/!!g&@&@uo%%)G)Gu 	?""g&@&@&&+*H*Hu 	c""c"" '
   000!!!55 "
 	
	

 	33 ' 	 ,   E_$<$<u , u?""h/&A&AO##u'?'?5m++-/O/O5 	  E_$<$<333777O++}/O/O5o%%O)C)C5m++-/O/O54r*   i   皙?Fdecoded_sequencer   maximum_thresholdr1   debugc           	     Z   d t                                           D             }t          |           dz   }d}|dk     rd}n|dk    rd}nd}t          | d	z   t	          |                    D ]m\  }}|D ],}	|	                    |          r|	                    |           -|d
k    r	||z  d
k    s	||dz
  k    r!t          d |D                       }||k    r nn|rt          d          }
|
	                    t          d| d| d|            t          |           dk    rL|
	                    t          d| dd                     |
	                    t          d| dd                     |D ],}|
	                    t          |j         d|j                    -t          |d          S )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                "    g | ]} |            S r=   r=   )r   md_classs     r'   r   zmess_ratio.<locals>.<listcomp>N  s+     + + +

+ + +r*   r   rR   i       r   r      
r   c              3  $   K   | ]}|j         V  d S rH   )r3   )r   dts     r'   r   zmess_ratio.<locals>.<genexpr>e  s$      !?!?r"(!?!?!?!?!?!?r*   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r(   r-   sumr   logr   	__class__r3   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   s               r'   
mess_ratior  F  s1   + +#5#D#D#F#F+ + +I &''!+F O||13))	4,.)),/) 04 7vGG  	5! 	) 	)H  ++ )i((( AII%"CCqHHfqj  !!?!?Y!?!?!???O"333 =/00

51R5 5et5 5!25 5	
 	
 	
   2%%JJuG0@"0EGGHHHJJuG.>suu.EGGHHH 	= 	=BJJu;;;;<<<<!$$$r*   N)r   r   r   r   r    r!   )r   F)r   r   r   r1   r   r!   r    r1   ))
__future__r   	functoolsr   loggingr   constantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r?   rW   rd   rn   ry   r   r   r   r   r   r  r=   r*   r'   <module>r     s   " " " " " "                     
                                       ," " " " " " " "D,L ,L ,L ,L ,L'9 ,L ,L ,L^O O O O O1 O O O6E E E E E* E E E0"D "D "D "D "D&8 "D "D "DJ./ ./ ./ ./ ./( ./ ./ ./bsA sA sA sA sA- sA sA sAl N  N  N  N  N*  N  N  NFIP IP IP IP IP0 IP IP IPX# # # # #1 # # #8 4F F F FR 4IN4% 4% 4% 4% 4% 4% 4%r*   