wikipedia.py

"""Wikipedia dataset containing cleaned articles of all languages."""

import bz2
import codecs
import json
import re
import xml.etree.cElementTree as etree

import datasets


logger = datasets.logging.get_logger(__name__)


_CITATION = """\
@ONLINE {wikidump,
    author = {Wikimedia Foundation},
    title = {Wikimedia Downloads},
    url = {https://dumps.wikimedia.org}
}
"""

_DESCRIPTION = """\
Wikipedia dataset containing cleaned articles of all languages.
The datasets are built from the Wikipedia dump
(https://dumps.wikimedia.org/) with one split per language. Each example
contains the content of one full Wikipedia article with cleaning to strip
markdown and unwanted sections (references, etc.).
"""

_LICENSE = (
    "This work is licensed under the Creative Commons Attribution-ShareAlike "
    "3.0 Unported License. To view a copy of this license, visit "
    "http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to "
    "Creative Commons, PO Box 1866, Mountain View, CA 94042, USA."
)

# Source: https://en.wikipedia.org/wiki/List_of_Wikipedias (accessed 3/1/2019)
# Removed because no articles: hz.
WIKIPEDIA_LANGUAGES = [
    "aa", "ab", "ace", "ady", "af", "ak", "als", "am", "an", "ang",
    "ar", "arc", "arz", "as", "ast", "atj", "av", "ay", "az", "azb",
    "ba", "bar", "bat-smg", "bcl", "be", "be-x-old", "bg", "bh", "bi", "bjn",
    "bm", "bn", "bo", "bpy", "br", "bs", "bug", "bxr", "ca", "cbk-zam",
    "cdo", "ce", "ceb", "ch", "cho", "chr", "chy", "ckb", "co", "cr",
    "crh", "cs", "csb", "cu", "cv", "cy", "da", "de", "din", "diq",
    "dsb", "dty", "dv", "dz", "ee", "el", "eml", "en", "eo", "es",
    "et", "eu", "ext", "fa", "ff", "fi", "fiu-vro", "fj", "fo", "fr",
    "frp", "frr", "fur", "fy", "ga", "gag", "gan", "gd", "gl", "glk",
    "gn", "gom", "gor", "got", "gu", "gv", "ha", "hak", "haw", "he",
    "hi", "hif", "ho", "hr", "hsb", "ht", "hu", "hy", "ia", "id",
    "ie", "ig", "ii", "ik", "ilo", "inh", "io", "is", "it", "iu",
    "ja", "jam", "jbo", "jv", "ka", "kaa", "kab", "kbd", "kbp", "kg",
    "ki", "kj", "kk", "kl", "km", "kn", "ko", "koi", "krc", "ks",
    "ksh", "ku", "kv", "kw", "ky", "la", "lad", "lb", "lbe", "lez",
    "lfn", "lg", "li", "lij", "lmo", "ln", "lo", "lrc", "lt", "ltg",
    "lv", "mai", "map-bms", "mdf", "mg", "mh", "mhr", "mi", "min", "mk",
    "ml", "mn", "mr", "mrj", "ms", "mt", "mus", "mwl", "my", "myv",
    "mzn", "na", "nah", "nap", "nds", "nds-nl", "ne", "new", "ng", "nl",
    "nn", "no", "nov", "nrm", "nso", "nv", "ny", "oc", "olo", "om",
    "or", "os", "pa", "pag", "pam", "pap", "pcd", "pdc", "pfl", "pi",
    "pih", "pl", "pms", "pnb", "pnt", "ps", "pt", "qu", "rm", "rmy",
    "rn", "ro", "roa-rup", "roa-tara", "ru", "rue", "rw", "sa", "sah", "sat",
    "sc", "scn", "sco", "sd", "se", "sg", "sh", "si", "simple", "sk",
    "sl", "sm", "sn", "so", "sq", "sr", "srn", "ss", "st", "stq",
    "su", "sv", "sw", "szl", "ta", "tcy", "te", "tet", "tg", "th",
    "ti", "tk", "tl", "tn", "to", "tpi", "tr", "ts", "tt", "tum",
    "tw", "ty", "tyv", "udm", "ug", "uk", "ur", "uz", "ve", "vec",
    "vep", "vi", "vls", "vo", "wa", "war", "wo", "wuu", "xal", "xh",
    "xmf", "yi", "yo", "za", "zea", "zh", "zh-classical", "zh-min-nan", "zh-yue", "zu",
]
_BASE_URL_TMPL = "https://dumps.wikimedia.org/{lang}wiki/{date}/"
_INFO_FILE = "dumpstatus.json"
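# Illustration: for language "en" and date "20200501", the template above resolves to
# https://dumps.wikimedia.org/enwiki/20200501/dumpstatus.json (availability of a given
# date depends on which dumps dumps.wikimedia.org still hosts).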

class WikipediaConfig(datasets.BuilderConfig):
    """BuilderConfig for Wikipedia."""

    def __init__(self, language=None, date=None, **kwargs):
        """BuilderConfig for Wikipedia.

        Args:
            language: string, the language code for the Wikipedia dump to use.
            date: string, date of the Wikipedia dump in YYYYMMDD format. A list of
                available dates can be found at https://dumps.wikimedia.org/enwiki/.
            **kwargs: keyword arguments forwarded to super.
        """
        super(WikipediaConfig, self).__init__(
            name="{0}.{1}".format(date, language),
            description="Wikipedia dataset for {0}, parsed from {1} dump.".format(language, date),
            **kwargs,
        )
        self.date = date
        self.language = language


_VERSION = datasets.Version("1.0.0", "")
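# For illustration: WikipediaConfig(language="fr", date="20200501", version=_VERSION)
# produces the config name "20200501.fr", matching the BUILDER_CONFIGS defined below.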

class Wikipedia(datasets.BeamBasedBuilder):
    """Wikipedia dataset."""

    # Use mirror (your.org) to avoid download caps.
    BUILDER_CONFIG_CLASS = WikipediaConfig
    BUILDER_CONFIGS = [
        WikipediaConfig(
            version=_VERSION,
            language=lang,
            date="20200501",
        )  # pylint:disable=g-complex-comprehension
        for lang in WIKIPEDIA_LANGUAGES
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({"title": datasets.Value("string"), "text": datasets.Value("string")}),
            # No default supervised_keys.
            supervised_keys=None,
            homepage="https://dumps.wikimedia.org",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager, pipeline):
        def _base_url(lang):
            return _BASE_URL_TMPL.format(lang=lang.replace("-", "_"), date=self.config.date)

        lang = self.config.language
        info_url = _base_url(lang) + _INFO_FILE
        # Use dictionary since testing mock always returns the same result.
        downloaded_files = dl_manager.download_and_extract({"info": info_url})

        xml_urls = []
        total_bytes = 0
        with open(downloaded_files["info"], encoding="utf-8") as f:
            dump_info = json.load(f)
            multistream_dump_info = dump_info["jobs"]["articlesmultistreamdump"]
            assert (
                multistream_dump_info["status"] == "done"
            ), "Specified dump (%s) multistream status is not 'done': %s" % (
                _base_url(lang),
                multistream_dump_info["status"],
            )

            for fname, info in multistream_dump_info["files"].items():
                if ".xml" not in fname:
                    continue
                total_bytes += info["size"]
                xml_urls.append(_base_url(lang) + fname)

        # Use dictionary since testing mock always returns the same result.
        downloaded_files = dl_manager.download({"xml": xml_urls})
        if not pipeline.is_local():
            downloaded_files = dl_manager.ship_files_with_pipeline(downloaded_files, pipeline)

        return [
            datasets.SplitGenerator(  # pylint:disable=g-complex-comprehension
                name=datasets.Split.TRAIN, gen_kwargs={"filepaths": downloaded_files["xml"], "language": lang}
            )
        ]

    def _build_pcollection(self, pipeline, filepaths, language):
        """Build PCollection of examples in the raw (text) form."""
        import apache_beam as beam
        import mwparserfromhell

        def _extract_content(filepath):
            """Extracts article content from a single WikiMedia XML file."""
            logger.info("generating examples from = %s", filepath)
            with beam.io.filesystems.FileSystems.open(filepath) as f:
                f = bz2.BZ2File(filename=f)
                # Workaround due to: https://github.com/tensorflow/tensorflow/issues/33563
                utf_f = codecs.getreader("utf-8")(f)
                context = etree.iterparse(utf_f, events=("end",))
                for unused_event, elem in context:
                    if not elem.tag.endswith("page"):
                        continue
                    namespace = elem.tag[:-4]
                    title = elem.find("./{0}title".format(namespace)).text
                    ns = elem.find("./{0}ns".format(namespace)).text
                    id_ = elem.find("./{0}id".format(namespace)).text

                    # Filter pages that are not in the "main" namespace.
                    if ns != "0":
                        elem.clear()
                        continue

                    raw_content = elem.find("./{0}revision/{0}text".format(namespace)).text
                    elem.clear()

                    # Filter redirects.
                    if raw_content is None or raw_content.lower().startswith("#redirect"):
                        beam.metrics.Metrics.counter(language, "filtered-redirects").inc()
                        continue

                    beam.metrics.Metrics.counter(language, "extracted-examples").inc()
                    yield (id_, title, raw_content)

        def _clean_content(inputs):
            """Cleans raw wikicode to extract text."""
            id_, title, raw_content = inputs
            try:
                text = _parse_and_clean_wikicode(raw_content, parser=mwparserfromhell)
            except mwparserfromhell.parser.ParserError as e:
                beam.metrics.Metrics.counter(language, "parser-error").inc()
                logger.error("mwparserfromhell ParseError: %s", e)
                return

            if not text:
                beam.metrics.Metrics.counter(language, "empty-clean-examples").inc()
                return

            beam.metrics.Metrics.counter(language, "cleaned-examples").inc()
            yield id_, {"title": title, "text": text}

        return (
            pipeline
            | "Initialize" >> beam.Create(filepaths)
            | "Extract content" >> beam.FlatMap(_extract_content)
            | "Distribute" >> beam.transforms.Reshuffle()
            | "Clean content" >> beam.FlatMap(_clean_content)
        )

def _parse_and_clean_wikicode(raw_content, parser):
    """Strips formatting and unwanted sections from raw page content."""
    wikicode = parser.parse(raw_content)

    # Filters for references, tables, and file/image links.
    re_rm_wikilink = re.compile("^(?:File|Image|Media):", flags=re.IGNORECASE | re.UNICODE)

    def rm_wikilink(obj):
        return bool(re_rm_wikilink.match(str(obj.title)))

    def rm_tag(obj):
        return str(obj.tag) in {"ref", "table"}

    def rm_template(obj):
        return obj.name.lower() in {"reflist", "notelist", "notelist-ua", "notelist-lr", "notelist-ur", "notelist-lg"}

    def try_remove_obj(obj, section):
        try:
            section.remove(obj)
        except ValueError:
            # For unknown reasons, objects are sometimes not found.
            pass

    section_text = []
    # Filter individual sections to clean.
    for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True):
        for obj in section.ifilter_wikilinks(matches=rm_wikilink, recursive=True):
            try_remove_obj(obj, section)
        for obj in section.ifilter_templates(matches=rm_template, recursive=True):
            try_remove_obj(obj, section)
        for obj in section.ifilter_tags(matches=rm_tag, recursive=True):
            try_remove_obj(obj, section)

        section_text.append(section.strip_code().strip())
    return "\n\n".join(section_text)
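

# Minimal usage sketch, under the assumption that this file serves as a Hugging Face
# `datasets` loading script with Apache Beam and mwparserfromhell installed; the exact
# `load_dataset` arguments (notably `beam_runner`) depend on the installed `datasets`
# version, so treat this as an illustration rather than a guaranteed invocation.
if __name__ == "__main__":
    # Config names follow the "<date>.<language>" pattern built in WikipediaConfig,
    # e.g. "20200501.en". DirectRunner executes the Beam pipeline locally.
    wiki = datasets.load_dataset("wikipedia", "20200501.en", beam_runner="DirectRunner")
    # Each example exposes the two features declared in _info(): "title" and "text".
    print(wiki["train"][0]["title"])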