wikipedia.py

"""Wikipedia dataset containing cleaned articles of all languages."""

import bz2
import codecs
import json
import re
import xml.etree.cElementTree as etree

import datasets


logger = datasets.logging.get_logger(__name__)


_CITATION = """\
@ONLINE {wikidump,
    author = {Wikimedia Foundation},
    title = {Wikimedia Downloads},
    url = {https://dumps.wikimedia.org}
}
"""

_DESCRIPTION = """\
Wikipedia dataset containing cleaned articles of all languages.
The datasets are built from the Wikipedia dump
(https://dumps.wikimedia.org/) with one split per language. Each example
contains the content of one full Wikipedia article with cleaning to strip
markdown and unwanted sections (references, etc.).
"""

_LICENSE = (
    "This work is licensed under the Creative Commons Attribution-ShareAlike "
    "3.0 Unported License. To view a copy of this license, visit "
    "http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to "
    "Creative Commons, PO Box 1866, Mountain View, CA 94042, USA."
)

# Source: https://en.wikipedia.org/wiki/List_of_Wikipedias (accessed 3/1/2019)
# Removed because no articles: hz.
WIKIPEDIA_LANGUAGES = [
    "aa", "ab", "ace", "ady", "af", "ak", "als", "am", "an", "ang",
    "ar", "arc", "arz", "as", "ast", "atj", "av", "ay", "az", "azb",
    "ba", "bar", "bat-smg", "bcl", "be", "be-x-old", "bg", "bh", "bi", "bjn",
    "bm", "bn", "bo", "bpy", "br", "bs", "bug", "bxr", "ca", "cbk-zam",
    "cdo", "ce", "ceb", "ch", "cho", "chr", "chy", "ckb", "co", "cr",
    "crh", "cs", "csb", "cu", "cv", "cy", "da", "de", "din", "diq",
    "dsb", "dty", "dv", "dz", "ee", "el", "eml", "en", "eo", "es",
    "et", "eu", "ext", "fa", "ff", "fi", "fiu-vro", "fj", "fo", "fr",
    "frp", "frr", "fur", "fy", "ga", "gag", "gan", "gd", "gl", "glk",
    "gn", "gom", "gor", "got", "gu", "gv", "ha", "hak", "haw", "he",
    "hi", "hif", "ho", "hr", "hsb", "ht", "hu", "hy", "ia", "id",
    "ie", "ig", "ii", "ik", "ilo", "inh", "io", "is", "it", "iu",
    "ja", "jam", "jbo", "jv", "ka", "kaa", "kab", "kbd", "kbp", "kg",
    "ki", "kj", "kk", "kl", "km", "kn", "ko", "koi", "krc", "ks",
    "ksh", "ku", "kv", "kw", "ky", "la", "lad", "lb", "lbe", "lez",
    "lfn", "lg", "li", "lij", "lmo", "ln", "lo", "lrc", "lt", "ltg",
    "lv", "mai", "map-bms", "mdf", "mg", "mh", "mhr", "mi", "min", "mk",
    "ml", "mn", "mr", "mrj", "ms", "mt", "mus", "mwl", "my", "myv",
    "mzn", "na", "nah", "nap", "nds", "nds-nl", "ne", "new", "ng", "nl",
    "nn", "no", "nov", "nrm", "nso", "nv", "ny", "oc", "olo", "om",
    "or", "os", "pa", "pag", "pam", "pap", "pcd", "pdc", "pfl", "pi",
    "pih", "pl", "pms", "pnb", "pnt", "ps", "pt", "qu", "rm", "rmy",
    "rn", "ro", "roa-rup", "roa-tara", "ru", "rue", "rw", "sa", "sah", "sat",
    "sc", "scn", "sco", "sd", "se", "sg", "sh", "si", "simple", "sk",
    "sl", "sm", "sn", "so", "sq", "sr", "srn", "ss", "st", "stq",
    "su", "sv", "sw", "szl", "ta", "tcy", "te", "tet", "tg", "th",
    "ti", "tk", "tl", "tn", "to", "tpi", "tr", "ts", "tt", "tum",
    "tw", "ty", "tyv", "udm", "ug", "uk", "ur", "uz", "ve", "vec",
    "vep", "vi", "vls", "vo", "wa", "war", "wo", "wuu", "xal", "xh",
    "xmf", "yi", "yo", "za", "zea", "zh", "zh-classical", "zh-min-nan", "zh-yue", "zu",
]
_BASE_URL_TMPL = "https://dumps.wikimedia.org/{lang}wiki/{date}/"
_INFO_FILE = "dumpstatus.json"
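# Illustration: for language "en" and date "20200501", the template above resolves to
# https://dumps.wikimedia.org/enwiki/20200501/dumpstatus.json (availability of a given
# date depends on which dumps dumps.wikimedia.org still hosts).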

class WikipediaConfig(datasets.BuilderConfig):
    """BuilderConfig for Wikipedia."""

    def __init__(self, language=None, date=None, **kwargs):
        """BuilderConfig for Wikipedia.

        Args:
            language: string, the language code for the Wikipedia dump to use.
            date: string, date of the Wikipedia dump in YYYYMMDD format. A list of
                available dates can be found at https://dumps.wikimedia.org/enwiki/.
            **kwargs: keyword arguments forwarded to super.
        """
        super(WikipediaConfig, self).__init__(
            name="{0}.{1}".format(date, language),
            description="Wikipedia dataset for {0}, parsed from {1} dump.".format(language, date),
            **kwargs,
        )
        self.date = date
        self.language = language


_VERSION = datasets.Version("1.0.0", "")
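# For illustration: WikipediaConfig(language="fr", date="20200501", version=_VERSION)
# produces the config name "20200501.fr", matching the BUILDER_CONFIGS defined below.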

class Wikipedia(datasets.BeamBasedBuilder):
    """Wikipedia dataset."""

    # Use mirror (your.org) to avoid download caps.
    BUILDER_CONFIG_CLASS = WikipediaConfig
    BUILDER_CONFIGS = [
        WikipediaConfig(
            version=_VERSION,
            language=lang,
            date="20200501",
        )  # pylint:disable=g-complex-comprehension
        for lang in WIKIPEDIA_LANGUAGES
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({"title": datasets.Value("string"), "text": datasets.Value("string")}),
            # No default supervised_keys.
            supervised_keys=None,
            homepage="https://dumps.wikimedia.org",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager, pipeline):
        def _base_url(lang):
            return _BASE_URL_TMPL.format(lang=lang.replace("-", "_"), date=self.config.date)

        lang = self.config.language
        info_url = _base_url(lang) + _INFO_FILE
        # Use dictionary since testing mock always returns the same result.
        downloaded_files = dl_manager.download_and_extract({"info": info_url})

        xml_urls = []
        total_bytes = 0
        with open(downloaded_files["info"], encoding="utf-8") as f:
            dump_info = json.load(f)
            multistream_dump_info = dump_info["jobs"]["articlesmultistreamdump"]
            assert (
                multistream_dump_info["status"] == "done"
            ), "Specified dump (%s) multistream status is not 'done': %s" % (
                _base_url(lang),
                multistream_dump_info["status"],
            )

            for fname, info in multistream_dump_info["files"].items():
                if ".xml" not in fname:
                    continue
                total_bytes += info["size"]
                xml_urls.append(_base_url(lang) + fname)

        # Use dictionary since testing mock always returns the same result.
        downloaded_files = dl_manager.download({"xml": xml_urls})
        if not pipeline.is_local():
            downloaded_files = dl_manager.ship_files_with_pipeline(downloaded_files, pipeline)

        return [
            datasets.SplitGenerator(  # pylint:disable=g-complex-comprehension
                name=datasets.Split.TRAIN, gen_kwargs={"filepaths": downloaded_files["xml"], "language": lang}
            )
        ]

    def _build_pcollection(self, pipeline, filepaths, language):
        """Build PCollection of examples in the raw (text) form."""
        import apache_beam as beam
        import mwparserfromhell

        def _extract_content(filepath):
            """Extracts article content from a single WikiMedia XML file."""
            logger.info("generating examples from = %s", filepath)
            with beam.io.filesystems.FileSystems.open(filepath) as f:
                f = bz2.BZ2File(filename=f)
                # Workaround due to: https://github.com/tensorflow/tensorflow/issues/33563
                utf_f = codecs.getreader("utf-8")(f)
                context = etree.iterparse(utf_f, events=("end",))
                for unused_event, elem in context:
                    if not elem.tag.endswith("page"):
                        continue
                    namespace = elem.tag[:-4]
                    title = elem.find("./{0}title".format(namespace)).text
                    ns = elem.find("./{0}ns".format(namespace)).text
                    id_ = elem.find("./{0}id".format(namespace)).text

                    # Filter pages that are not in the "main" namespace.
                    if ns != "0":
                        elem.clear()
                        continue

                    raw_content = elem.find("./{0}revision/{0}text".format(namespace)).text
                    elem.clear()

                    # Filter redirects.
                    if raw_content is None or raw_content.lower().startswith("#redirect"):
                        beam.metrics.Metrics.counter(language, "filtered-redirects").inc()
                        continue

                    beam.metrics.Metrics.counter(language, "extracted-examples").inc()
                    yield (id_, title, raw_content)

        def _clean_content(inputs):
            """Cleans raw wikicode to extract text."""
            id_, title, raw_content = inputs
            try:
                text = _parse_and_clean_wikicode(raw_content, parser=mwparserfromhell)
            except mwparserfromhell.parser.ParserError as e:
                beam.metrics.Metrics.counter(language, "parser-error").inc()
                logger.error("mwparserfromhell ParseError: %s", e)
                return

            if not text:
                beam.metrics.Metrics.counter(language, "empty-clean-examples").inc()
                return

            beam.metrics.Metrics.counter(language, "cleaned-examples").inc()
            yield id_, {"title": title, "text": text}

        return (
            pipeline
            | "Initialize" >> beam.Create(filepaths)
            | "Extract content" >> beam.FlatMap(_extract_content)
            | "Distribute" >> beam.transforms.Reshuffle()
            | "Clean content" >> beam.FlatMap(_clean_content)
        )

def _parse_and_clean_wikicode(raw_content, parser):
    """Strips formatting and unwanted sections from raw page content."""
    wikicode = parser.parse(raw_content)

    # Filters for references, tables, and file/image links.
    re_rm_wikilink = re.compile("^(?:File|Image|Media):", flags=re.IGNORECASE | re.UNICODE)

    def rm_wikilink(obj):
        return bool(re_rm_wikilink.match(str(obj.title)))

    def rm_tag(obj):
        return str(obj.tag) in {"ref", "table"}

    def rm_template(obj):
        return obj.name.lower() in {"reflist", "notelist", "notelist-ua", "notelist-lr", "notelist-ur", "notelist-lg"}

    def try_remove_obj(obj, section):
        try:
            section.remove(obj)
        except ValueError:
            # For unknown reasons, objects are sometimes not found.
            pass

    section_text = []
    # Filter individual sections to clean.
    for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True):
        for obj in section.ifilter_wikilinks(matches=rm_wikilink, recursive=True):
            try_remove_obj(obj, section)
        for obj in section.ifilter_templates(matches=rm_template, recursive=True):
            try_remove_obj(obj, section)
        for obj in section.ifilter_tags(matches=rm_tag, recursive=True):
            try_remove_obj(obj, section)

        section_text.append(section.strip_code().strip())
    return "\n\n".join(section_text)
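

# Minimal usage sketch, under the assumption that this file serves as a Hugging Face
# `datasets` loading script with Apache Beam and mwparserfromhell installed; the exact
# `load_dataset` arguments (notably `beam_runner`) depend on the installed `datasets`
# version, so treat this as an illustration rather than a guaranteed invocation.
if __name__ == "__main__":
    # Config names follow the "<date>.<language>" pattern built in WikipediaConfig,
    # e.g. "20200501.en". DirectRunner executes the Beam pipeline locally.
    wiki = datasets.load_dataset("wikipedia", "20200501.en", beam_runner="DirectRunner")
    # Each example exposes the two features declared in _info(): "title" and "text".
    print(wiki["train"][0]["title"])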