mirror of https://gitlab.sectorq.eu/jaydee/omv_backup.git
synced 2025-10-31 02:21:10 +01:00

added v3

630  venv/lib/python3.11/site-packages/charset_normalizer/md.py  Normal file
@@ -0,0 +1,630 @@
from __future__ import annotations

from functools import lru_cache
from logging import getLogger

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon each character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
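
# Illustrative sketch: every concrete plugin below is driven the same way by
# mess_ratio() at the end of this module; eligible() filters characters, feed()
# accumulates state, and the ratio property reports the chaos contribution.
#
#     plugin = TooManyAccentuatedPlugin()
#     for character in "Wikipédia est une encyclopédie libre":
#         if plugin.eligible(character):
#             plugin.feed(character)
#     chaos = plugin.ratio  # 0.0 unless >= 35% of the alphabetic characters are accentuated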


class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: str | None = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same char duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: str | None = unicode_range(self._last_printable_seen)
        unicode_range_b: str | None = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count <= 13:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Words/buffers ending with an uppercase accentuated letter are so rare
                # that we consider them all suspicious. Same weight as foreign_long suspicious.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly when the content
    does not fit, and this can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # Abstract
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count


class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        isolated_form_usage: float = self._isolated_form_count / self._character_count

        return isolated_form_usage


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied by a combining diacritical mark,
    # e.g. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = (
        unicode_range_a.split(" "),
        unicode_range_b.split(" "),
    )

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese Exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True
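
# Illustrative examples (assuming unicode_range() yields standard Unicode block
# names such as these):
#
#     is_suspiciously_successive_range("Basic Latin", "Cyrillic")
#     # -> True: the ranges share no keyword and match none of the exceptions above.
#     is_suspiciously_successive_range("Hiragana", "CJK Unified Ideographs")
#     # -> False: covered by the Japanese exception.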


@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """

    detectors: list[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
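
For reference, a minimal usage sketch of the mess_ratio() entry point defined above; illustrative only, it assumes the vendored charset_normalizer package in this venv is importable and uses arbitrary sample strings:

    from charset_normalizer.md import mess_ratio

    # Plain, well-formed text keeps every detector quiet and scores near 0.0.
    print(mess_ratio("Hello, this is a perfectly ordinary sentence."))

    # Mojibake (UTF-8 bytes re-decoded as cp1252) racks up symbol/accent chaos and
    # tends to reach or exceed the default maximum_threshold of 0.2.
    print(mess_ratio("ÐŸÑ€Ð¸Ð²ÐµÑ‚, Ð¼Ð¸Ñ€!", maximum_threshold=0.2))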