|
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- # Lint as: python3
- """The BookCorpus dataset based on Shawn Presser's work https://github.com/soskek/bookcorpus/issues/27 """
-
-
- import glob
- import os
- import pathlib
-
- import datasets
-
-
# Human-readable summary passed to ``datasets.DatasetInfo(description=...)``.
# Each physical line ends with a backslash continuation, so the separating
# space has to be written explicitly before the backslash.
_DESCRIPTION = """\
Books are a rich source of both fine-grained information, how a character, \
an object or a scene looks like, as well as high-level semantics, what \
someone is thinking, feeling and how these states evolve through a story. \
This version of bookcorpus has 17868 dataset items (books). Each item contains \
two fields: title and text. The title is the name of the book (just the file name) \
while text contains unprocessed book text. The bookcorpus has been prepared by \
Shawn Presser and is generously hosted by The-Eye. The-Eye is a non-profit, community \
driven platform dedicated to the archiving and long-term preservation of any and \
all data including but by no means limited to... websites, books, games, software, \
video, audio, other digital-obscura and ideas.
"""

# BibTeX entry passed to ``datasets.DatasetInfo(citation=...)``.
_CITATION = """\
@InProceedings{Zhu_2015_ICCV,
title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},
author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
month = {December},
year = {2015}
}
"""

# Project page, used as the dataset homepage.
_PROJECT_URL = "https://github.com/soskek/bookcorpus/issues/27"

# NOTE(review): the archive location below is an absolute path on one
# developer's machine, so the script only works where that file exists.
# The upstream archive this script was previously pointed at is:
#   https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz
# Confirm whether the local archive has the same layout before switching back.
_DOWNLOAD_URL = "/home/xiaonan/develope/Athena/datasets/books_doc_format.tar.gz"
-
-
class BookCorpusOpenConfig(datasets.BuilderConfig):
    """Builder configuration for the open BookCorpus dataset."""

    def __init__(self, **kwargs):
        """Initialise the config with the version fixed at 1.0.0.

        Args:
            **kwargs: keyword arguments passed through unchanged to the
                parent ``datasets.BuilderConfig`` constructor.
        """
        pinned_version = datasets.Version("1.0.0", "")
        super().__init__(version=pinned_version, **kwargs)
-
-
class BookCorpusOpen(datasets.GeneratorBasedBuilder):
    """Dataset builder that exposes every book text file as one example."""

    # Single configuration: the books as plain text.
    BUILDER_CONFIGS = [
        BookCorpusOpenConfig(
            name="plain_text",
            description="Plain text",
        )
    ]

    def _info(self):
        """Return the dataset metadata.

        Each example has two string features, ``title`` and ``text``; no
        supervised keys are declared.
        """
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "title": datasets.Value("string"),
                    "text": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_PROJECT_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Obtain the archive and declare a single TRAIN split.

        Args:
            dl_manager: download manager whose ``download_and_extract`` is
                called with ``_DOWNLOAD_URL``.

        Returns:
            A one-element list with the TRAIN ``SplitGenerator``; the
            extracted location is handed to ``_generate_examples`` as
            ``directory``.
        """
        arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"directory": arch_path},
            ),
        ]

    def _generate_examples(self, directory):
        """Yield ``(index, example)`` pairs, one per ``*.epub.txt`` file.

        Files are searched for recursively below ``directory`` and visited
        in sorted path order, so indices are stable for a given set of files.

        Args:
            directory: root directory of the extracted archive.

        Yields:
            Tuples ``(index, {"title": <file name>, "text": <file contents>})``
            with ``index`` counting up from 0.
        """
        pattern = os.path.join(directory, "**/*.epub.txt")
        book_files = sorted(glob.glob(pattern, recursive=True))
        for index, book_file_path in enumerate(book_files):
            title = pathlib.PurePath(book_file_path).name
            # Read and close the file before yielding so no handle stays
            # open while the generator is suspended.
            with open(book_file_path, mode="r", encoding="utf-8") as f:
                text = f.read()
            yield index, {"title": title, "text": text}
|