mirror of https://gitlab.sectorq.eu/jaydee/omv_backup.git
synced 2025-10-31 02:21:10 +01:00

added v3
408 venv/lib/python3.11/site-packages/charset_normalizer/utils.py Normal file
@@ -0,0 +1,408 @@
from __future__ import annotations

import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator

from _multibytecodec import (  # type: ignore[import-not-found,import]
    MultibyteIncrementalDecoder,
)

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: list[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
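
# Illustrative aside (not part of the upstream module): "é" carries the
# canonical decomposition "0065 0301", so remove_accent() keeps the base
# letter, while is_accentuated() matches "WITH ACUTE" in its Unicode name.
# >>> is_accentuated("é"), remove_accent("é")
# (True, 'e')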


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
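
# Illustrative aside (not part of the upstream module): lookups resolve to
# the standard Unicode block names as listed in UNICODE_RANGES_COMBINED.
# >>> unicode_range("é")
# 'Latin-1 Supplement'
# >>> unicode_range("\u3042")  # HIRAGANA LETTER A
# 'Hiragana'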


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()
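
# Illustrative aside (not part of the upstream module): these predicates lean
# on unicodedata categories ("P*" punctuation, "S*"/"N*" symbols, "Z*"
# separators) with the Unicode range name as a fallback.
# >>> is_punctuation(","), is_symbol("€"), is_separator(" "), is_case_variable("a")
# (True, True, True, True)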


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1a"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python:
        # Zero Width No-Break Space (Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as a space.
    )
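
# Illustrative aside (not part of the upstream module): NUL is unprintable,
# whitespace is not, and the two listed exceptions are deliberately tolerated.
# >>> is_unprintable("\x00"), is_unprintable("\n"), is_unprintable("\x1a")
# (True, False, False)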


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
    """
    Extract any specified encoding from the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: list[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
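
# Illustrative aside (not part of the upstream module), assuming
# RE_POSSIBLE_ENCODING_INDICATION matches encoding=/charset= declarations:
# a charset declared in an XML prologue or HTML meta tag is picked up and
# normalized through the stdlib aliases table.
# >>> any_specified_encoding(b'<?xml version="1.0" encoding="utf-8"?>')
# 'utf_8'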


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )
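
# Illustrative aside (not part of the upstream module): the UTF families are
# special-cased; anything else is probed via its stdlib incremental decoder.
# >>> is_multi_byte_encoding("utf_8"), is_multi_byte_encoding("big5"), is_multi_byte_encoding("latin_1")
# (True, True, False)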


def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
    """
    Identify and extract any SIG/BOM at the start of the given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    """Returns the Python normalized encoding name (not the official IANA name)."""
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")

    return cp_name
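
# Illustrative aside (not part of the upstream module): labels are folded to
# Python codec names via encodings.aliases, so different spellings converge.
# >>> iana_name("UTF-8"), iana_name("ISO-8859-1")
# ('utf_8', 'latin_1')
# >>> iana_name("made-up-label", strict=False)
# 'made_up_label'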


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254
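
# Illustrative aside (not part of the upstream module): closely related
# single-byte pages (e.g. cp1252 vs latin_1) score near 1.0 because most byte
# positions decode identically, while any pair involving a multi-byte codec
# short-circuits to 0.0.
# >>> cp_similarity("utf_8", "latin_1")
# 0.0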


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated
    using the cp_similarity function.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
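
# Illustrative aside (not part of the upstream module): opt in to the
# library's logs during debugging. Note that each call attaches a fresh
# StreamHandler, so call it once per process to avoid duplicate log lines.
# >>> set_logging_handler("charset_normalizer", logging.DEBUG)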


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: str | None = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # Multi-byte bad-cut detector and adjustment:
            # not the cleanest way to perform the fix, but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
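

# Illustrative aside (not part of the upstream module): a minimal sketch of
# driving the chunker over a multi-byte payload, mirroring how the detector
# samples a buffer at fixed offsets.
# >>> payload = "naïve café crème".encode("utf_8") * 8
# >>> chunks = list(
# ...     cut_sequence_chunks(
# ...         payload, "utf_8", range(0, len(payload), 32), 32,
# ...         bom_or_sig_available=False, strip_sig_or_bom=False,
# ...         sig_payload=b"", is_multi_byte_decoder=True,
# ...     )
# ... )
# >>> all(isinstance(c, str) for c in chunks)
# True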