| from __future__ import annotations |
|
|
| import importlib |
| from codecs import IncrementalDecoder |
| from collections import Counter |
| from functools import lru_cache |
| from typing import Counter as TypeCounter |
|
|
| from .constant import ( |
| FREQUENCIES, |
| KO_NAMES, |
| LANGUAGE_SUPPORTED_COUNT, |
| TOO_SMALL_SEQUENCE, |
| ZH_NAMES, |
| ) |
| from .md import is_suspiciously_successive_range |
| from .models import CoherenceMatches |
| from .utils import ( |
| is_accentuated, |
| is_latin, |
| is_multi_byte_encoding, |
| is_unicode_range_secondary, |
| unicode_range, |
| ) |
|
|
|
|
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return associated unicode ranges in a single byte code page.

    :param iana_name: IANA name of a single-byte encoding (e.g. "cp1252").
    :return: Sorted list of primary unicode range names that cover at least
        15 percent of the decodable upper code page.
    :raises OSError: If the given encoding is multi-byte.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    # Probe bytes 0x40-0xFE only: the low half of a single-byte code page is
    # plain ASCII on every encoding and carries no distinguishing signal.
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: str | None = unicode_range(chunk)

            if character_range is None:
                continue

            # Only primary (non-secondary) ranges count toward the ratio.
            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Guard: without this, a code page yielding no primary-range character
    # would raise ZeroDivisionError in the comprehension below.
    if character_count == 0:
        return []

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
|
|
|
|
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return inferred languages used with a unicode range.
    """
    # A language qualifies as soon as one of its frequent characters falls
    # inside the given range; any() short-circuits just like the original
    # break-on-first-match loop.
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
|
|
|
|
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    # First non-Latin range, if any, drives the language inference.
    primary_range: str | None = next(
        (
            specified_range
            for specified_range in encoding_unicode_range(iana_name)
            if "Latin" not in specified_range
        ),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
|
|
|
|
| @lru_cache() |
| def mb_encoding_languages(iana_name: str) -> list[str]: |
| """ |
| Multi-byte encoding language association. Some code page are heavily linked to particular language(s). |
| This function does the correspondence. |
| """ |
| if ( |
| iana_name.startswith("shift_") |
| or iana_name.startswith("iso2022_jp") |
| or iana_name.startswith("euc_j") |
| or iana_name == "cp932" |
| ): |
| return ["Japanese"] |
| if iana_name.startswith("gb") or iana_name in ZH_NAMES: |
| return ["Chinese"] |
| if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES: |
| return ["Korean"] |
|
|
| return [] |
|
|
|
|
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    has_accents: bool = False
    pure_latin: bool = True

    for character in FREQUENCIES[language]:
        # Each predicate is only evaluated while its flag is still undecided.
        if not has_accents and is_accentuated(character):
            has_accents = True
        if pure_latin and not is_latin(character):
            pure_latin = False

    return has_accents, pure_latin
|
|
|
|
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return associated languages associated to given characters.

    :param characters: Characters observed in a payload layer.
    :param ignore_non_latin: Skip languages that are not purely Latin based.
    :return: Language names ordered by descending character-match ratio.
    """
    languages: list[tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    # Hoist a set once so the per-language membership tests below are O(1)
    # instead of scanning the characters list for every frequent character
    # of every supported language.
    characters_set = set(characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # A language without accented characters cannot explain an
        # accented source payload.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters_set]
        )

        ratio: float = character_match_count / character_count

        # Keep only languages covering at least 20% of their frequent set.
        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
|
|
|
|
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: A key of FREQUENCIES.
    :param ordered_characters: Observed characters sorted from most frequent to rarest.
    :raises ValueError: If the language has no frequency table in FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    character_approved_count: int = 0
    # Set view of the language table for O(1) membership tests in the loop.
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    # Alphabets larger than Latin's 26 letters get the more tolerant rank
    # comparison branch below.
    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        # Characters absent from the language table contribute nothing.
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        # Project the observed rank onto the language table's scale so the
        # two ranks are comparable even when list lengths differ.
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabet: reject when the projected rank is more than 4
        # positions away from the language's rank.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabet: approve when the projected rank lands within a
        # third of the table size.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Fallback (small alphabets): compare the neighborhoods — characters
        # ranked before/after this one — between both orderings.
        characters_before_source: list[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # Edge of the table: nothing before (or after) to compare against,
        # so approve the character outright.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Approve when at least 40% of either neighborhood overlaps.
        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    # NOTE(review): raises ZeroDivisionError on an empty ordered_characters;
    # coherence_ratio only calls this with layers above TOO_SMALL_SEQUENCE.
    return character_approved_count / len(ordered_characters)
|
|
|
|
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.
    """
    layers: dict[str, str] = {}

    for character in decoded_sequence:
        if not character.isalpha():
            continue

        character_range: str | None = unicode_range(character)

        if character_range is None:
            continue

        # Reuse the first already-discovered layer that this character's
        # range can join without looking suspicious; otherwise open a new
        # layer keyed by the character's own range.
        target_range: str | None = next(
            (
                discovered_range
                for discovered_range in layers
                if is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ),
            None,
        )

        if target_range is None:
            target_range = character_range

        layers[target_range] = layers.get(target_range, "") + character.lower()

    return list(layers.values())
|
|
|
|
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    # Group every observed ratio by language across all partial results.
    ratios_by_language: dict[str, list[float]] = {}

    for partial_result in results:
        for language, ratio in partial_result:
            ratios_by_language.setdefault(language, []).append(ratio)

    # Average each language's ratios (rounded to 4 decimals) and rank them
    # from the strongest match down.
    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]

    return sorted(merged, key=lambda entry: entry[1], reverse=True)
|
|
|
|
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    # Collect every ratio under the em-dash-stripped language name.
    ratios_by_name: dict[str, list[float]] = dict()

    for language, ratio in results:
        canonical_name: str = language.replace("—", "")
        ratios_by_name.setdefault(canonical_name, []).append(ratio)

    # No alternative duplicates: return the input untouched.
    if not any(len(ratios) > 1 for ratios in ratios_by_name.values()):
        return results

    # Duplicates exist: keep only the best ratio per canonical name.
    return [
        (canonical_name, max(ratios))
        for canonical_name, ratios in ratios_by_name.items()
    ]
|
|
|
|
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.
    """

    measured: list[tuple[str, float]] = []
    ignore_non_latin: bool = False
    sufficient_match_count: int = 0

    included_languages: list[str] = (
        lg_inclusion.split(",") if lg_inclusion is not None else []
    )
    # "Latin Based" is a pseudo-entry: it flips the Latin-only filter rather
    # than naming an actual language.
    if "Latin Based" in included_languages:
        ignore_non_latin = True
        included_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(occurrence for _, occurrence in most_common)

        # Layers too small to be statistically meaningful are skipped.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: list[str] = [char for char, _ in most_common]

        # Explicit inclusion list wins; otherwise infer candidates from the
        # layer's own characters.
        candidates = included_languages or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )

        for language in candidates:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            if ratio >= 0.8:
                sufficient_match_count += 1

            measured.append((language, round(ratio, 4)))

            # Three strong matches on this layer is enough; stop scanning
            # further candidate languages (outer layer loop continues).
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(measured), key=lambda x: x[1], reverse=True
    )
|
|