mirror of
				https://gitlab.sectorq.eu/jaydee/omv_backup.git
				synced 2025-10-31 02:21:10 +01:00 
			
		
		
		
	added v3
This commit is contained in:
		
							
								
								
									
										360
									
								
								venv/lib/python3.11/site-packages/charset_normalizer/models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										360
									
								
								venv/lib/python3.11/site-packages/charset_normalizer/models.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,360 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from encodings.aliases import aliases | ||||
| from hashlib import sha256 | ||||
| from json import dumps | ||||
| from re import sub | ||||
| from typing import Any, Iterator, List, Tuple | ||||
|  | ||||
| from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE | ||||
| from .utils import iana_name, is_multi_byte_encoding, unicode_range | ||||
|  | ||||
|  | ||||
class CharsetMatch:
    """A single candidate decoding of a byte payload.

    Bundles the guessed encoding together with its mess ("chaos") and
    coherence metrics, lazily exposes the decoded text via ``str()``,
    and can re-encode the payload toward a target encoding.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        # Original, untouched byte sequence (exposed via the 'raw' property).
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property; computed on first access.
        self._unicode_ranges: list[str] | None = None

        # Alternative encodings that decode to the exact same string
        # (see add_submatch / could_be_from_charset).
        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Caches for output(): last target encoding and its re-encoded bytes.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text; may be supplied up-front, otherwise decoded lazily
        # in __str__.
        self._string: str | None = decoded_payload

        # Encoding declared inside the payload itself (if any); output()
        # rewrites it when re-encoding to a non-UTF-8 target.
        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        # A plain string compares against the (IANA-normalized) encoding name;
        # another CharsetMatch must agree on both encoding and fingerprint.
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        # Share of the raw bytes not accounted for one-to-one by decoded
        # characters; 0.0 for pure single-byte decodings.
        return 1.0 - (len(str(self)) / len(self.raw))

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """Register another match that decodes to the same string.

        Raises ValueError for non-CharsetMatch input or self-registration.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        # Guessed (IANA-style) encoding name.
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        # aliases maps alias -> canonical name; collect both directions.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        # True when a BOM or encoding signature was detected in the payload.
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        # Alias of 'bom'.
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        # Mean mess ratio: lower means a cleaner decoding.
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        # Confidence of the top-ranked language match; 0.0 when none detected.
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        # 'chaos' expressed as a percentage, rounded to 3 decimals.
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        # 'coherence' expressed as a percentage, rounded to 3 decimals.
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        # Matches registered via add_submatch (same decoded string).
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """Sorted, de-duplicated Unicode range names present in the decoded text."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        """
        # Re-encode only when the cached result targets a different encoding.
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the in-document encoding declaration so it names the
                # new target encoding. Only the first 8192 chars are scanned,
                # since a declaration is expected near the top of a document.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
|  | ||||
|  | ||||
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        # Keep the internal list permanently sorted (most probable first).
        self._results: list[CharsetMatch] = [] if not results else sorted(results)

    def __iter__(self) -> Iterator[CharsetMatch]:
        return iter(self._results)

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            wanted = iana_name(item, False)
            found = next(
                (
                    candidate
                    for candidate in self._results
                    if wanted in candidate.could_be_from_charset
                ),
                None,
            )
            if found is not None:
                return found
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                f"Cannot append instance '{str(item.__class__)}' to CharsetMatches"
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for existing in self._results:
                if (
                    existing.fingerprint == item.fingerprint
                    and existing.chaos == item.chaos
                ):
                    existing.add_submatch(item)
                    return
        self._results.append(item)
        self._results.sort()

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match. Strict equivalent to matches[0].
        """
        return self._results[0] if self._results else None

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
|  | ||||
|  | ||||
# Coherence probing result types: a (language name, confidence ratio) pair
# and the list of such pairs attached to a CharsetMatch.
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
|  | ||||
|  | ||||
class CliDetectionResult:
    """JSON-serializable summary of one detection run, as emitted by the CLI.

    Plain attribute holder; ``to_json`` renders the fields in a fixed order.
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        # File location.
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        # Detection outcome.
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        # Quality metrics.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        # Fixed field order: this drives the key order of to_json() output.
        ordered_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in ordered_fields}

    def to_json(self) -> str:
        """Render this result as a pretty-printed, ASCII-safe JSON document."""
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
		Reference in New Issue
	
	Block a user