#!/usr/bin/env python
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# 10.02.2015 - helour - first attempt
# 08.08.2016 - Dan Blanchard - first usable release
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
#
######################### END LICENSE BLOCK #########################
"""
Convert old style SBCS model to new
"""
import os
import sys
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from string import ascii_letters
import chardet
from chardet import __version__
from chardet.metadata.languages import LANGUAGES
from chardet.sbcharsetprober import SingleByteCharSetModel
# Turn ascii_letters into a set to make other ops easier
# NOTE(review): this shadows the `string.ascii_letters` import above; the set
# is not referenced anywhere else in this view — confirm it is still needed.
ascii_letters = set(ascii_letters)
def normalize_name(charset_name):
    """Return *charset_name* as an upper-case Python constant name.

    Upper-cases the charset name and replaces hyphens with underscores so
    it can be used as a module-level identifier
    (e.g. ``ISO-8859-5`` -> ``ISO_8859_5``).
    """
    return charset_name.upper().replace("-", "_")
def convert_sbcs_model(old_model, alphabet):
    """Create a SingleByteCharSetModel object representing the charset."""
    # The old flat char_to_order_map list becomes a byte-value -> order dict.
    order_map = dict(enumerate(old_model["char_to_order_map"]))
    return SingleByteCharSetModel(
        charset_name=old_model["charset_name"],
        language=old_model["language"],
        char_to_order_map=order_map,
        # The bigram table is attached later by the caller.
        language_model=None,
        typical_positive_ratio=old_model["typical_positive_ratio"],
        keep_ascii_letters=old_model["keep_english_letter"],
        alphabet=alphabet,
    )
def print_char_to_order(var_name, order_map, charset_name, output_file):
    """Write *order_map* to *output_file* as a Python dict literal.

    Each entry is annotated with the character that the byte value decodes
    to in *charset_name*, or ``None`` when the byte is undefined in that
    charset.
    """
    print(f"{var_name} = {{", file=output_file)
    for char, order in sorted(order_map.items()):
        # bytes([char]) builds the single-byte sequence for this byte value
        # (the old bytes(bytearray(...)) dance was a Python 2 relic).
        try:
            unicode_char = bytes([char]).decode(charset_name)
        except UnicodeError:
            # Byte has no mapping in this charset; still emit the entry.
            unicode_char = None
        print(f" {char!r}: {order!r}, # {unicode_char!r}", file=output_file)
    print("}\n", file=output_file)
def print_language_model(var_name, language_model, output_file, char_ranks):
    """Write the bigram likelihood table as a nested dict literal.

    Outer and inner keys are the character ranks from *char_ranks*; each
    line is annotated with the character it represents. Characters missing
    from *char_ranks* and empty inner dicts are skipped.
    """
    legend = "\n".join(
        ("# 3: Positive", "# 2: Likely", "# 1: Unlikely", "# 0: Negative")
    )
    print(legend + "\n", file=output_file)
    print(f"{var_name} = {{", file=output_file)
    for first, transitions in sorted(language_model.items()):
        # Skip characters we could not rank and empty transition dicts.
        if first not in char_ranks or not transitions:
            continue
        print(f" {char_ranks[first]!r}: {{ # {first!r}", file=output_file)
        for second, score in sorted(transitions.items()):
            if second in char_ranks:
                print(
                    f" {char_ranks[second]!r}: {score!r}, # {second!r}",
                    file=output_file,
                )
        print(" },", file=output_file)
    print("}\n", file=output_file)
def convert_models_for_lang(language):
    """Convert old SingleByteCharSetModels for the given language.

    Reads the legacy ``lang<language>model`` module out of the installed
    ``chardet`` package, reconstructs its character-order maps and bigram
    language model in the new format, and writes a replacement
    ``lang<language>model.py`` to the current working directory.
    """
    # Validate language
    language = language.title()
    lang_metadata = LANGUAGES.get(language)
    if not lang_metadata:
        raise ValueError(
            f"Unknown language: {language}. If you are adding a model for a"
            " new language, you must first update metadata/"
            "languages.py"
        )
    # Old-style models live in modules named e.g. "langbulgarianmodel".
    lang_mod_name = f"lang{language.lower()}model"
    # NOTE(review): this existence check assumes the script runs from the
    # repository root (relative "chardet/..." path) — confirm.
    if not os.path.exists(os.path.join("chardet", lang_mod_name + ".py")):
        print(f"Skipping {language} because it does not have an old model.")
        return
    lang_mod = getattr(chardet, lang_mod_name)
    print(
        f"\n{language}\n----------------------------------------------------------------"
    )
    print(f"Keep ASCII Letters: {lang_metadata.use_ascii}")
    print(f"Alphabet: {lang_metadata.alphabet}")
    # Create char-to-order maps (aka char-to-rank dicts)
    charset_models = {}  # charset name -> new SingleByteCharSetModel
    char_ranks = {}  # unicode char -> order (rank), shared across charsets
    order_to_chars = {}  # order (rank) -> unicode char; inverse of char_ranks
    for var_name in dir(lang_mod):
        # Old charset models are the attributes named "...Model", excluding
        # the flat bigram table itself ("...LangModel").
        if not ("Model" in var_name and "LangModel" not in var_name):
            continue
        old_model = getattr(lang_mod, var_name)
        charset_name = old_model["charset_name"]
        print(f"Converting charset model for {charset_name}")
        sys.stdout.flush()
        charset_models[charset_name] = convert_sbcs_model(
            old_model, lang_metadata.alphabet
        )
        # Since we don't know which charsets have which characters, we have to
        # try to reconstruct char_ranks (for letters only, since that's all
        # the old language models contain)
        for byte_hex, order in charset_models[charset_name].char_to_order_map.items():
            # order 64 was basically ignored before because of the off by one
            # error, but it's hard to know if training took that into account
            if order > 64:
                continue
            # Convert to bytes in Python 2 and 3
            char = bytes(bytearray((byte_hex,)))
            try:
                unicode_char = char.decode(charset_name)
            except UnicodeDecodeError:
                # Byte is undefined in this charset; nothing to rank.
                continue
            if unicode_char not in char_ranks:
                char_ranks[unicode_char] = order
                order_to_chars[order] = unicode_char
            elif char_ranks[unicode_char] != order:
                # The same character must get the same rank in every charset,
                # otherwise the reconstruction would be ambiguous.
                raise ValueError(f"Unstable character ranking for {unicode_char}")
    old_lang_model = getattr(lang_mod, f"{language.title()}LangModel")
    # Rebuild the flat 64x64 bigram array as a nested char -> char -> score dict.
    language_model = {}
    # Preserve off-by-one error here by ignoring first column and row
    for i in range(1, 64):
        if i not in order_to_chars:
            continue
        lang_char = order_to_chars[i]
        language_model[lang_char] = {}
        for j in range(1, 64):
            if j not in order_to_chars:
                continue
            lang_char2 = order_to_chars[j]
            language_model[lang_char][lang_char2] = old_lang_model[(i * 64) + j]
    # Write output files
    print(f"Writing output file for {language}\n\n")
    sys.stdout.flush()
    with open(f"lang{language.lower()}model.py", "w") as output_file:
        upper_lang = language.upper()
        # print header to set encoding
        print(
            "from chardet.sbcharsetprober import SingleByteCharSetModel\n\n",
            file=output_file,
        )
        lm_name = f"{upper_lang}_LANG_MODEL"
        print_language_model(lm_name, language_model, output_file, char_ranks)
        print(
            "# 255: Undefined characters that did not exist in training text\n"
            "# 254: Carriage/Return\n"
            "# 253: symbol (punctuation) that does not belong to word\n"
            "# 252: 0 - 9\n"
            "# 251: Control characters\n\n"
            "# Character Mapping Table(s):",
            file=output_file,
        )
        for charset_name, sbcs_model in charset_models.items():
            normal_name = normalize_name(charset_name)
            char_to_order_name = f"{normal_name}_{upper_lang}_CHAR_TO_ORDER"
            print_char_to_order(
                char_to_order_name,
                sbcs_model.char_to_order_map,
                charset_name,
                output_file,
            )
            sbcs_model_name = f"{normal_name}_{upper_lang}_MODEL"
            # Empty the map so repr(sbcs_model) renders it as "{}", which is
            # then text-substituted (along with "None" for the language model)
            # with the names of the constants emitted above.
            sbcs_model.char_to_order_map.clear()
            sbcs_model_repr = (
                repr(sbcs_model)
                .replace("None", lm_name)
                .replace("{}", char_to_order_name)
                .replace(", ", (",\n" + " " * (len(sbcs_model_name) + 26)))
            )
            print(f"{sbcs_model_name} = {sbcs_model_repr}\n", file=output_file)
def main():
    """Parse command-line arguments and convert the requested languages."""
    parser = ArgumentParser(
        description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "language",
        nargs="*",
        help="The name of the language the input documents are "
        "in. Also the name of the language the generated "
        "model will detect. If no language is specified, "
        "models for all languages known to chardet will be"
        " trained.",
    )
    parser.add_argument("--version", action="version", version=__version__)
    args = parser.parse_args()
    # No languages on the command line means "convert everything we know".
    languages = args.language if args.language else sorted(LANGUAGES.keys())
    for language in languages:
        convert_models_for_lang(language)


if __name__ == "__main__":
    main()