mirror of https://gitlab.sectorq.eu/jaydee/omv_backup.git
synced 2025-10-31 02:21:10 +01:00
	added v3
395 venv/lib/python3.11/site-packages/charset_normalizer/cd.py (Normal file)
@@ -0,0 +1,395 @@
from __future__ import annotations

import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: str | None = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )

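# Editor's note: an illustrative sketch, not upstream code. For a Cyrillic
# single-byte code page one would expect something like:
#
#   >>> encoding_unicode_range("cp1251")
#   ['Basic Latin', 'Cyrillic']
#
# The exact output depends on the Python codecs data; treat it as an assumption.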

def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return the languages inferred to be used with a given Unicode range.
    """
    languages: list[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages

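# Editor's note (illustrative): inference is driven purely by the FREQUENCIES
# table, so unicode_range_languages("Cyrillic") would return only the
# Cyrillic-based languages that table happens to cover, not every language
# ever written in that script.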

@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding to language association. Some code pages are strongly
    tied to particular language(s); this function resolves that correspondence.
    """
    unicode_ranges: list[str] = encoding_unicode_range(iana_name)
    primary_range: str | None = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)

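# Editor's note (illustrative assumption): cp1251 exposes "Cyrillic" as its
# first non-Latin range, so encoding_languages("cp1251") would resolve to the
# Cyrillic-based entries of FREQUENCIES (Russian, Bulgarian, ...), while a
# purely Latin code page such as cp1252 falls through to ["Latin Based"].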

@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Multi-byte encoding to language association. Some code pages are strongly
    tied to particular language(s); this function resolves that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []

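# Editor's note: the mapping above is driven entirely by name prefixes and the
# KO_NAMES/ZH_NAMES sets, so for example:
#
#   mb_encoding_languages("shift_jis")  # -> ["Japanese"]
#   mb_encoding_languages("gb18030")    # -> ["Chinese"]
#   mb_encoding_languages("utf_8")      # -> [] (no strong language tie)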

@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine two main aspects of a supported language: whether it uses
    accentuated characters and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin

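# Editor's note (illustrative, depends on the FREQUENCIES table): one would
# expect get_target_features("English") == (False, True) (no accents, purely
# Latin) and get_target_features("French") == (True, True).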

def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: list[tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]

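# Editor's note: a minimal sketch of the call pattern; the sample characters
# and the expected ranking are assumptions, not asserted upstream:
#
#   candidates = alphabet_languages(list("etaoinsrhl"), ignore_non_latin=True)
#   # -> Latin-based languages whose frequency table overlaps the sample by
#   #    at least 20%, best overlap first (English should rank high).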

def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine whether an ordered character list (from most frequent to rarest)
    matches a particular language. The result is a ratio between 0.0 (no
    correspondence) and 1.0 (near-perfect fit). Beware that this function is
    deliberately lenient about the match in order to ease detection (a close
    match counts as a full match).
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: list[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)

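# Editor's note: an illustrative sketch. Feeding a language's own frequency
# ordering back in scores 1.0 (every rank projects exactly onto itself), while
# a reversed ordering scores much lower because most ranks project far from
# their expected position:
#
#   eng = FREQUENCIES["English"]
#   characters_popularity_compare("English", eng)        # -> 1.0
#   characters_popularity_compare("English", eng[::-1])  # noticeably lower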

def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str: a Unicode range /
    alphabet separation. E.g. a text containing English/Latin with a bit of
    Hebrew will return two items in the resulting list; one containing the
    Latin letters and the other the Hebrew ones.
    """
    layers: dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: str | None = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: str | None = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())

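# Editor's note: a small illustration of the layering described above,
# assuming is_suspiciously_successive_range keeps Latin and Hebrew apart:
#
#   alpha_unicode_split("Hello שלום")  # -> ['hello', 'שלום']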

def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously produced by the coherence_ratio function.
    The return type is the same as that of coherence_ratio.
    """
    per_language_ratios: dict[str, list[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)

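# Editor's note: the merge is a simple per-language mean, e.g.:
#
#   merge_coherence_ratios([[("English", 0.8)], [("English", 0.6), ("French", 0.5)]])
#   # -> [("English", 0.7), ("French", 0.5)]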

def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function keeps only the best match and
    removes the em-dash from its name.
    """
    index_results: dict[str, list[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results

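# Editor's note: e.g. [("English", 0.9), ("English—", 0.95)] collapses to
# [("English", 0.95)]; inputs without an "—" alternative pass through as-is.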

@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed by layers, where a layer is the set of characters
    extracted for one alphabet/Unicode range.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: list[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
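

# Editor's note: a minimal usage sketch, not part of the upstream module; the
# sample sentence and the expectation that French leads are assumptions. Because
# of the relative imports above, run it as: python -m charset_normalizer.cd
if __name__ == "__main__":
    matches = coherence_ratio("Bonjour, ceci est un petit texte en français.")
    # Expected shape: (language, ratio) tuples sorted by ratio, descending,
    # with a French-leaning entry near the top, e.g. [('French', 0.5714), ...].
    print(matches)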