You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

bookcorpus.py 3.4 kB

4 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. # Lint as: python3
  2. """The BookCorpus dataset based on Shawn Presser's work https://github.com/soskek/bookcorpus/issues/27 """
  3. import glob
  4. import os
  5. import pathlib
  6. import datasets
  7. _DESCRIPTION = """\
  8. Books are a rich source of both fine-grained information, how a character, \
  9. an object or a scene looks like, as well as high-level semantics, what \
  10. someone is thinking, feeling and how these states evolve through a story.\
  11. This version of bookcorpus has 17868 dataset items (books). Each item contains \
  12. two fields: title and text. The title is the name of the book (just the file name) \
  13. while text contains unprocessed book text. The bookcorpus has been prepared by \
  14. Shawn Presser and is generously hosted by The-Eye. The-Eye is a non-profit, community \
  15. driven platform dedicated to the archiving and long-term preservation of any and \
  16. all data including but by no means limited to... websites, books, games, software, \
  17. video, audio, other digital-obscura and ideas.
  18. """
  19. _CITATION = """\
  20. @InProceedings{Zhu_2015_ICCV,
  21. title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},
  22. author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
  23. booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
  24. month = {December},
  25. year = {2015}
  26. }
  27. """
  28. _PROJECT_URL = "https://github.com/soskek/bookcorpus/issues/27"
  29. # _DOWNLOAD_URL = "https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz"
  30. _DOWNLOAD_URL = "/home/xiaonan/develope/Athena/datasets/books_doc_format.tar.gz"
  31. class BookCorpusOpenConfig(datasets.BuilderConfig):
  32. """BuilderConfig for BookCorpus."""
  33. def __init__(self, **kwargs):
  34. """BuilderConfig for BookCorpus.
  35. Args:
  36. **kwargs: keyword arguments forwarded to super.
  37. """
  38. super(BookCorpusOpenConfig, self).__init__(
  39. version=datasets.Version("1.0.0", ""), **kwargs)
  40. class BookCorpusOpen(datasets.GeneratorBasedBuilder):
  41. """BookCorpus dataset."""
  42. BUILDER_CONFIGS = [
  43. BookCorpusOpenConfig(
  44. name="plain_text",
  45. description="Plain text",
  46. )
  47. ]
  48. def _info(self):
  49. return datasets.DatasetInfo(
  50. description=_DESCRIPTION,
  51. features=datasets.Features(
  52. {
  53. "title": datasets.Value("string"),
  54. "text": datasets.Value("string"),
  55. }
  56. ),
  57. supervised_keys=None,
  58. homepage=_PROJECT_URL,
  59. citation=_CITATION,
  60. )
  61. def _split_generators(self, dl_manager):
  62. arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
  63. return [
  64. datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={
  65. "directory": arch_path}),
  66. ]
  67. def _generate_examples(self, directory):
  68. glob_target = os.path.join(directory, "**/*.epub.txt")
  69. book_files = glob.glob(glob_target, recursive=True)
  70. book_files = sorted(book_files)
  71. _id = 0
  72. for book_file_path in book_files:
  73. path = pathlib.PurePath(book_file_path)
  74. with open(book_file_path, mode="r", encoding="utf-8") as f:
  75. yield _id, {"title": str(path.name), "text": f.read()},
  76. _id += 1