wmt_t2t
/
dataset_infos.json



1
{"de-en": {"description": "Translate dataset based on the data from statmt.org.\n\nVersions exist for the different years using a combination of multiple data\nsources. The base `wmt_translate` allows you to create your own config to choose\nyour own data/language pair by creating a custom `datasets.translate.wmt.WmtConfig`.\n\n```\nconfig = datasets.wmt.WmtConfig(\n    version=\"0.0.1\",\n    language_pair=(\"fr\", \"de\"),\n    subsets={\n        datasets.Split.TRAIN: [\"commoncrawl_frde\"],\n        datasets.Split.VALIDATION: [\"euelections_dev2019\"],\n    },\n)\nbuilder = datasets.builder(\"wmt_translate\", config=config)\n```\n\n", "citation": "\n@InProceedings{bojar-EtAl:2014:W14-33,\n  author    = {Bojar, Ondrej  and  Buck, Christian  and  Federmann, Christian  and  Haddow, Barry  and  Koehn, Philipp  and  Leveling, Johannes  and  Monz, Christof  and  Pecina, Pavel  and  Post, Matt  and  Saint-Amand, Herve  and  Soricut, Radu  and  Specia, Lucia  and  Tamchyna, Ale\\v{s}},\n  title     = {Findings of the 2014 Workshop on Statistical Machine Translation},\n  booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},\n  month     = {June},\n  year      = {2014},\n  address   = {Baltimore, Maryland, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {12--58},\n  url       = {http://www.aclweb.org/anthology/W/W14/W14-3302}\n}\n", "homepage": "https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/translate_ende.py", "license": "", "features": {"translation": {"languages": ["de", "en"], "id": null, "_type": "Translation"}}, "post_processed": null, "supervised_keys": {"input": "de", "output": "en"}, "task_templates": null, "builder_name": "wmt_t2t", "config_name": "de-en", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1385110179, "num_examples": 4592289, "dataset_name": 
"wmt_t2t"}, "validation": {"name": "validation", "num_bytes": 736415, "num_examples": 3000, "dataset_name": "wmt_t2t"}, "test": {"name": "test", "num_bytes": 777334, "num_examples": 3003, "dataset_name": "wmt_t2t"}}, "download_checksums": {"https://huggingface.co/datasets/wmt/wmt13/resolve/main-zip/training-parallel-europarl-v7.zip": {"num_bytes": 658092427, "checksum": "5b2d8b32c2396da739b4e731871c597fcc6e75729becd74619d0712eecf7770e"}, "https://huggingface.co/datasets/wmt/wmt13/resolve/main-zip/training-parallel-commoncrawl.zip": {"num_bytes": 918734483, "checksum": "5ffe980072ea29adfd84568d099bea366d9f72772b988e670794ae851b4e5627"}, "https://huggingface.co/datasets/wmt/wmt18/resolve/main-zip/translation-task/training-parallel-nc-v13.zip": {"num_bytes": 113221161, "checksum": "feff2c0315f66f94a9373bffa419f5664e16dc1e05298f0e37b2869ce4604b70"}, "https://huggingface.co/datasets/wmt/wmt19/resolve/main-zip/translation-task/dev.zip": {"num_bytes": 38714274, "checksum": "d796e363740fdc4261aa6f5a3d2f8223e3adaee7d737b7724863325b8956dfd1"}}, "download_size": 1728762345, "post_processing_size": null, "dataset_size": 1386623928, "size_in_bytes": 3115386273}}