From 24092e311497b38ebdaaaaacf6ce6e959980b546 Mon Sep 17 00:00:00 2001 From: yhcc Date: Sat, 4 Jun 2022 15:22:22 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0pipe=E7=9B=B8=E5=85=B3?= =?UTF-8?q?=E7=9A=84=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/collators/padders/get_padder.py | 106 +- .../core/dataloaders/jittor_dataloader/fdl.py | 11 +- .../core/dataloaders/paddle_dataloader/fdl.py | 11 +- .../core/dataloaders/torch_dataloader/fdl.py | 9 +- fastNLP/io/loader/__init__.py | 5 - fastNLP/io/loader/conll.py | 2 +- fastNLP/io/loader/coreference.py | 64 -- fastNLP/io/pipe/__init__.py | 2 - fastNLP/io/pipe/classification.py | 14 +- fastNLP/io/pipe/conll.py | 6 - fastNLP/io/pipe/coreference.py | 186 --- fastNLP/io/pipe/cws.py | 2 - fastNLP/io/pipe/matching.py | 12 +- fastNLP/io/pipe/qa.py | 2 - fastNLP/io/pipe/summarization.py | 3 - fastNLP/modules/torch/__init__.py | 1 + fastNLP/modules/torch/decoder/__init__.py | 7 +- fastNLP/modules/torch/decoder/mlp.py | 97 ++ .../dataloaders/torch_dataloader/test_fdl.py | 9 +- tests/data_for_tests/conll_2003_example.txt | 442 +++++++ tests/data_for_tests/conll_example.txt | 15 + tests/data_for_tests/cws_pku_utf_8 | 56 + tests/data_for_tests/cws_test | 1018 +++++++++++++++++ tests/data_for_tests/cws_train | 1002 ++++++++++++++++ .../glove.6B.50d_test.txt | 6 + .../small_static_embedding/word2vec_test.txt | 7 + tests/data_for_tests/io/20ng/dev.csv | 6 + tests/data_for_tests/io/20ng/test.csv | 6 + tests/data_for_tests/io/20ng/train.csv | 6 + tests/data_for_tests/io/BQCorpus/dev.txt | 6 + tests/data_for_tests/io/BQCorpus/test.txt | 6 + tests/data_for_tests/io/BQCorpus/train.txt | 6 + tests/data_for_tests/io/ChnSentiCorp/dev.txt | 7 + tests/data_for_tests/io/ChnSentiCorp/test.txt | 7 + .../data_for_tests/io/ChnSentiCorp/train.txt | 7 + tests/data_for_tests/io/LCQMC/dev.txt | 6 + tests/data_for_tests/io/LCQMC/test.txt | 5 + 
tests/data_for_tests/io/LCQMC/train.txt | 6 + tests/data_for_tests/io/MNLI/dev_matched.tsv | 6 + .../data_for_tests/io/MNLI/dev_mismatched.tsv | 6 + tests/data_for_tests/io/MNLI/test_matched.tsv | 6 + .../io/MNLI/test_mismatched.tsv | 6 + tests/data_for_tests/io/MNLI/train.tsv | 7 + tests/data_for_tests/io/MSRA_NER/dev.conll | 38 + tests/data_for_tests/io/MSRA_NER/test.conll | 31 + tests/data_for_tests/io/MSRA_NER/train.conll | 60 + tests/data_for_tests/io/OntoNotes/dev.txt | 10 + tests/data_for_tests/io/OntoNotes/test.txt | 10 + tests/data_for_tests/io/OntoNotes/train.txt | 50 + tests/data_for_tests/io/QNLI/dev.tsv | 6 + tests/data_for_tests/io/QNLI/test.tsv | 6 + tests/data_for_tests/io/QNLI/train.tsv | 6 + tests/data_for_tests/io/Quora/dev.tsv | 2 + tests/data_for_tests/io/Quora/test.tsv | 2 + tests/data_for_tests/io/Quora/train.tsv | 2 + tests/data_for_tests/io/R52/dev.csv | 6 + tests/data_for_tests/io/R52/test.csv | 6 + tests/data_for_tests/io/R52/train.csv | 6 + tests/data_for_tests/io/R8/dev.csv | 6 + tests/data_for_tests/io/R8/test.csv | 6 + tests/data_for_tests/io/R8/train.csv | 6 + tests/data_for_tests/io/RTE/dev.tsv | 6 + tests/data_for_tests/io/RTE/test.tsv | 6 + tests/data_for_tests/io/RTE/train.tsv | 6 + .../data_for_tests/io/SNLI/snli_1.0_dev.jsonl | 5 + .../io/SNLI/snli_1.0_test.jsonl | 5 + .../io/SNLI/snli_1.0_train.jsonl | 5 + tests/data_for_tests/io/SST-2/dev.tsv | 6 + tests/data_for_tests/io/SST-2/test.tsv | 6 + tests/data_for_tests/io/SST-2/train.tsv | 6 + tests/data_for_tests/io/SST/dev.txt | 6 + tests/data_for_tests/io/SST/test.txt | 6 + tests/data_for_tests/io/SST/train.txt | 6 + tests/data_for_tests/io/THUCNews/dev.txt | 9 + tests/data_for_tests/io/THUCNews/test.txt | 9 + tests/data_for_tests/io/THUCNews/train.txt | 9 + .../data_for_tests/io/WeiboSenti100k/dev.txt | 7 + .../data_for_tests/io/WeiboSenti100k/test.txt | 8 + .../io/WeiboSenti100k/train.txt | 7 + tests/data_for_tests/io/XNLI/dev.txt | 7 + tests/data_for_tests/io/XNLI/test.txt | 
7 + tests/data_for_tests/io/XNLI/train.txt | 9 + tests/data_for_tests/io/ag/test.csv | 5 + tests/data_for_tests/io/ag/train.csv | 4 + tests/data_for_tests/io/cmrc/dev.json | 155 +++ tests/data_for_tests/io/cmrc/train.json | 161 +++ tests/data_for_tests/io/cnndm/dev.label.jsonl | 4 + .../data_for_tests/io/cnndm/test.label.jsonl | 4 + .../data_for_tests/io/cnndm/train.cnndm.jsonl | 10 + tests/data_for_tests/io/cnndm/vocab | 100 ++ tests/data_for_tests/io/conll2003/dev.txt | 49 + tests/data_for_tests/io/conll2003/test.txt | 51 + tests/data_for_tests/io/conll2003/train.txt | 48 + tests/data_for_tests/io/cws_as/dev.txt | 6 + tests/data_for_tests/io/cws_as/test.txt | 6 + tests/data_for_tests/io/cws_as/train.txt | 6 + tests/data_for_tests/io/cws_cityu/dev.txt | 6 + tests/data_for_tests/io/cws_cityu/test.txt | 6 + tests/data_for_tests/io/cws_cityu/train.txt | 6 + tests/data_for_tests/io/cws_msra/dev.txt | 2 + tests/data_for_tests/io/cws_msra/test.txt | 2 + tests/data_for_tests/io/cws_msra/train.txt | 3 + tests/data_for_tests/io/cws_pku/dev.txt | 6 + tests/data_for_tests/io/cws_pku/test.txt | 6 + tests/data_for_tests/io/cws_pku/train.txt | 9 + tests/data_for_tests/io/dbpedia/test.csv | 5 + tests/data_for_tests/io/dbpedia/train.csv | 14 + tests/data_for_tests/io/imdb/dev.txt | 6 + tests/data_for_tests/io/imdb/test.txt | 6 + tests/data_for_tests/io/imdb/train.txt | 6 + tests/data_for_tests/io/mr/dev.csv | 6 + tests/data_for_tests/io/mr/test.csv | 6 + tests/data_for_tests/io/mr/train.csv | 6 + tests/data_for_tests/io/ohsumed/dev.csv | 6 + tests/data_for_tests/io/ohsumed/test.csv | 6 + tests/data_for_tests/io/ohsumed/train.csv | 6 + tests/data_for_tests/io/peopledaily/dev.txt | 7 + tests/data_for_tests/io/peopledaily/test.txt | 41 + tests/data_for_tests/io/peopledaily/train.txt | 46 + tests/data_for_tests/io/weibo_NER/dev.conll | 21 + tests/data_for_tests/io/weibo_NER/test.conll | 17 + tests/data_for_tests/io/weibo_NER/train.conll | 69 ++ .../io/yelp_review_full/dev.csv | 6 + 
.../io/yelp_review_full/test.csv | 6 + .../io/yelp_review_full/train.csv | 6 + .../io/yelp_review_polarity/dev.csv | 6 + .../io/yelp_review_polarity/test.csv | 6 + .../io/yelp_review_polarity/train.csv | 6 + tests/data_for_tests/people.txt | 307 +++++ tests/data_for_tests/people_daily_raw.txt | 27 + tests/data_for_tests/sample_mnli.tsv | 12 + tests/data_for_tests/sample_snli.jsonl | 3 + tests/data_for_tests/text_classify.txt | 100 ++ tests/data_for_tests/zh_sample.conllx | 100 ++ tests/io/__init__.py | 0 tests/io/loader/test_classification_loader.py | 51 + tests/io/loader/test_conll_loader.py | 43 + tests/io/loader/test_cws_loader.py | 22 + tests/io/loader/test_matching_loader.py | 49 + tests/io/loader/test_qa_loader.py | 12 + tests/io/pipe/test_classification.py | 89 ++ tests/io/pipe/test_conll.py | 48 + tests/io/pipe/test_cws.py | 39 + tests/io/pipe/test_matching.py | 104 ++ tests/io/pipe/test_qa.py | 24 + tests/io/pipe/test_summary.py | 71 ++ tests/io/test_embed_loader.py | 38 + 147 files changed, 5339 insertions(+), 365 deletions(-) delete mode 100644 fastNLP/io/loader/coreference.py delete mode 100644 fastNLP/io/pipe/coreference.py create mode 100755 fastNLP/modules/torch/decoder/mlp.py create mode 100755 tests/data_for_tests/conll_2003_example.txt create mode 100755 tests/data_for_tests/conll_example.txt create mode 100755 tests/data_for_tests/cws_pku_utf_8 create mode 100755 tests/data_for_tests/cws_test create mode 100755 tests/data_for_tests/cws_train create mode 100755 tests/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt create mode 100755 tests/data_for_tests/embedding/small_static_embedding/word2vec_test.txt create mode 100755 tests/data_for_tests/io/20ng/dev.csv create mode 100755 tests/data_for_tests/io/20ng/test.csv create mode 100755 tests/data_for_tests/io/20ng/train.csv create mode 100755 tests/data_for_tests/io/BQCorpus/dev.txt create mode 100755 tests/data_for_tests/io/BQCorpus/test.txt create mode 100755 
tests/data_for_tests/io/BQCorpus/train.txt create mode 100755 tests/data_for_tests/io/ChnSentiCorp/dev.txt create mode 100755 tests/data_for_tests/io/ChnSentiCorp/test.txt create mode 100755 tests/data_for_tests/io/ChnSentiCorp/train.txt create mode 100755 tests/data_for_tests/io/LCQMC/dev.txt create mode 100755 tests/data_for_tests/io/LCQMC/test.txt create mode 100755 tests/data_for_tests/io/LCQMC/train.txt create mode 100755 tests/data_for_tests/io/MNLI/dev_matched.tsv create mode 100755 tests/data_for_tests/io/MNLI/dev_mismatched.tsv create mode 100755 tests/data_for_tests/io/MNLI/test_matched.tsv create mode 100755 tests/data_for_tests/io/MNLI/test_mismatched.tsv create mode 100755 tests/data_for_tests/io/MNLI/train.tsv create mode 100755 tests/data_for_tests/io/MSRA_NER/dev.conll create mode 100755 tests/data_for_tests/io/MSRA_NER/test.conll create mode 100755 tests/data_for_tests/io/MSRA_NER/train.conll create mode 100755 tests/data_for_tests/io/OntoNotes/dev.txt create mode 100755 tests/data_for_tests/io/OntoNotes/test.txt create mode 100755 tests/data_for_tests/io/OntoNotes/train.txt create mode 100755 tests/data_for_tests/io/QNLI/dev.tsv create mode 100755 tests/data_for_tests/io/QNLI/test.tsv create mode 100755 tests/data_for_tests/io/QNLI/train.tsv create mode 100755 tests/data_for_tests/io/Quora/dev.tsv create mode 100755 tests/data_for_tests/io/Quora/test.tsv create mode 100755 tests/data_for_tests/io/Quora/train.tsv create mode 100755 tests/data_for_tests/io/R52/dev.csv create mode 100755 tests/data_for_tests/io/R52/test.csv create mode 100755 tests/data_for_tests/io/R52/train.csv create mode 100755 tests/data_for_tests/io/R8/dev.csv create mode 100755 tests/data_for_tests/io/R8/test.csv create mode 100755 tests/data_for_tests/io/R8/train.csv create mode 100755 tests/data_for_tests/io/RTE/dev.tsv create mode 100755 tests/data_for_tests/io/RTE/test.tsv create mode 100755 tests/data_for_tests/io/RTE/train.tsv create mode 100755 
tests/data_for_tests/io/SNLI/snli_1.0_dev.jsonl create mode 100755 tests/data_for_tests/io/SNLI/snli_1.0_test.jsonl create mode 100755 tests/data_for_tests/io/SNLI/snli_1.0_train.jsonl create mode 100755 tests/data_for_tests/io/SST-2/dev.tsv create mode 100755 tests/data_for_tests/io/SST-2/test.tsv create mode 100755 tests/data_for_tests/io/SST-2/train.tsv create mode 100755 tests/data_for_tests/io/SST/dev.txt create mode 100755 tests/data_for_tests/io/SST/test.txt create mode 100755 tests/data_for_tests/io/SST/train.txt create mode 100755 tests/data_for_tests/io/THUCNews/dev.txt create mode 100755 tests/data_for_tests/io/THUCNews/test.txt create mode 100755 tests/data_for_tests/io/THUCNews/train.txt create mode 100755 tests/data_for_tests/io/WeiboSenti100k/dev.txt create mode 100755 tests/data_for_tests/io/WeiboSenti100k/test.txt create mode 100755 tests/data_for_tests/io/WeiboSenti100k/train.txt create mode 100755 tests/data_for_tests/io/XNLI/dev.txt create mode 100755 tests/data_for_tests/io/XNLI/test.txt create mode 100755 tests/data_for_tests/io/XNLI/train.txt create mode 100755 tests/data_for_tests/io/ag/test.csv create mode 100755 tests/data_for_tests/io/ag/train.csv create mode 100755 tests/data_for_tests/io/cmrc/dev.json create mode 100755 tests/data_for_tests/io/cmrc/train.json create mode 100755 tests/data_for_tests/io/cnndm/dev.label.jsonl create mode 100755 tests/data_for_tests/io/cnndm/test.label.jsonl create mode 100755 tests/data_for_tests/io/cnndm/train.cnndm.jsonl create mode 100755 tests/data_for_tests/io/cnndm/vocab create mode 100755 tests/data_for_tests/io/conll2003/dev.txt create mode 100755 tests/data_for_tests/io/conll2003/test.txt create mode 100755 tests/data_for_tests/io/conll2003/train.txt create mode 100755 tests/data_for_tests/io/cws_as/dev.txt create mode 100755 tests/data_for_tests/io/cws_as/test.txt create mode 100755 tests/data_for_tests/io/cws_as/train.txt create mode 100755 tests/data_for_tests/io/cws_cityu/dev.txt create mode 
100755 tests/data_for_tests/io/cws_cityu/test.txt create mode 100755 tests/data_for_tests/io/cws_cityu/train.txt create mode 100755 tests/data_for_tests/io/cws_msra/dev.txt create mode 100755 tests/data_for_tests/io/cws_msra/test.txt create mode 100755 tests/data_for_tests/io/cws_msra/train.txt create mode 100755 tests/data_for_tests/io/cws_pku/dev.txt create mode 100755 tests/data_for_tests/io/cws_pku/test.txt create mode 100755 tests/data_for_tests/io/cws_pku/train.txt create mode 100755 tests/data_for_tests/io/dbpedia/test.csv create mode 100755 tests/data_for_tests/io/dbpedia/train.csv create mode 100755 tests/data_for_tests/io/imdb/dev.txt create mode 100755 tests/data_for_tests/io/imdb/test.txt create mode 100755 tests/data_for_tests/io/imdb/train.txt create mode 100755 tests/data_for_tests/io/mr/dev.csv create mode 100755 tests/data_for_tests/io/mr/test.csv create mode 100755 tests/data_for_tests/io/mr/train.csv create mode 100755 tests/data_for_tests/io/ohsumed/dev.csv create mode 100755 tests/data_for_tests/io/ohsumed/test.csv create mode 100755 tests/data_for_tests/io/ohsumed/train.csv create mode 100755 tests/data_for_tests/io/peopledaily/dev.txt create mode 100755 tests/data_for_tests/io/peopledaily/test.txt create mode 100755 tests/data_for_tests/io/peopledaily/train.txt create mode 100755 tests/data_for_tests/io/weibo_NER/dev.conll create mode 100755 tests/data_for_tests/io/weibo_NER/test.conll create mode 100755 tests/data_for_tests/io/weibo_NER/train.conll create mode 100755 tests/data_for_tests/io/yelp_review_full/dev.csv create mode 100755 tests/data_for_tests/io/yelp_review_full/test.csv create mode 100755 tests/data_for_tests/io/yelp_review_full/train.csv create mode 100755 tests/data_for_tests/io/yelp_review_polarity/dev.csv create mode 100755 tests/data_for_tests/io/yelp_review_polarity/test.csv create mode 100755 tests/data_for_tests/io/yelp_review_polarity/train.csv create mode 100755 tests/data_for_tests/people.txt create mode 100755 
tests/data_for_tests/people_daily_raw.txt create mode 100755 tests/data_for_tests/sample_mnli.tsv create mode 100755 tests/data_for_tests/sample_snli.jsonl create mode 100755 tests/data_for_tests/text_classify.txt create mode 100755 tests/data_for_tests/zh_sample.conllx create mode 100755 tests/io/__init__.py create mode 100755 tests/io/loader/test_classification_loader.py create mode 100755 tests/io/loader/test_conll_loader.py create mode 100755 tests/io/loader/test_cws_loader.py create mode 100755 tests/io/loader/test_matching_loader.py create mode 100755 tests/io/loader/test_qa_loader.py create mode 100755 tests/io/pipe/test_classification.py create mode 100755 tests/io/pipe/test_conll.py create mode 100755 tests/io/pipe/test_cws.py create mode 100755 tests/io/pipe/test_matching.py create mode 100755 tests/io/pipe/test_qa.py create mode 100755 tests/io/pipe/test_summary.py create mode 100755 tests/io/test_embed_loader.py diff --git a/fastNLP/core/collators/padders/get_padder.py b/fastNLP/core/collators/padders/get_padder.py index dfc228a3..b0a82849 100644 --- a/fastNLP/core/collators/padders/get_padder.py +++ b/fastNLP/core/collators/padders/get_padder.py @@ -24,62 +24,62 @@ def get_padder(batch_field:Sequence[Any], pad_val, dtype, backend, field_name)-> :param field_name: 方便报错的。 :return: """ - assert len(batch_field)!=0, "Empty batch encountered." 
- logger.debug(f"The content in the field:`{field_name}` is:\n" + str(batch_field)) - if pad_val is None: - logger.debug(f"The pad_val for field:{field_name} is None, not padding this field.") - return NullPadder() - if backend is None: - logger.debug(f"The backend for field:{field_name} is None, not padding this field.") - return NullPadder() - - # 首先判断当前 field 是否是必须要 pad ,根据用户设置的 pad_val、dtype 等判断。 - must_pad = False - if pad_val != 0 or dtype is not None: - must_pad = True - - catalog = _get_element_shape_dtype(batch_field) # 首先获取数据的基本信息。 - - # 根据 catalog 来判定当前是否可以进行 pad 。 - # 首先检查是否所有的 key 是一样长的,表明深度是一致的 - depths = set(map(len, catalog.keys())) - num_depth = len(depths) - if num_depth != 1: - msg = f'Field:`{field_name}` cannot pad, since it has various depths({depths}) of data. To view more ' \ - f"information please set logger's level to DEBUG." - if must_pad: - raise InconsistencyError(msg) - raise NoProperPadderError(msg) - - # 再检查所有的元素 shape 是否一致? - shape_lens = set([len(v[0]) for v in catalog.values()]) - num_shape = len(shape_lens) - if num_shape != 1: - msg = f'Field:`{field_name}` cannot pad, since it has various shape length({shape_lens}) of data. To view more ' \ - f"information please set logger's level to DEBUG." - if must_pad: - raise InconsistencyError(msg) - raise NoProperPadderError(msg) - - # 再检查所有的元素 type 是否一致 try: - ele_dtypes = set([v[1] for v in catalog.values()]) - except TypeError: - ele_dtypes = set([str(v[1]) for v in catalog.values()]) - num_eletypes = len(ele_dtypes) - if num_eletypes != 1: - msg = f'Field:`{field_name}` cannot pad, since it has various types({ele_dtypes}) of data. To view more ' \ - f"information please set logger's level to DEBUG." - if must_pad: - raise InconsistencyError(msg) - raise NoProperPadderError(msg) + assert len(batch_field)!=0, "Empty batch encountered." 
+ logger.debug(f"The content in the field:`{field_name}` is:\n" + str(batch_field)) + if pad_val is None: + logger.debug(f"The pad_val for field:{field_name} is None, not padding this field.") + return NullPadder() + if backend is None: + logger.debug(f"The backend for field:{field_name} is None, not padding this field.") + return NullPadder() + + # 首先判断当前 field 是否是必须要 pad ,根据用户设置的 pad_val、dtype 等判断。 + must_pad = False + if pad_val != 0 or dtype is not None: + must_pad = True + + catalog = _get_element_shape_dtype(batch_field) # 首先获取数据的基本信息。 + + # 根据 catalog 来判定当前是否可以进行 pad 。 + # 首先检查是否所有的 key 是一样长的,表明深度是一致的 + depths = set(map(len, catalog.keys())) + num_depth = len(depths) + if num_depth != 1: + msg = f'Field:`{field_name}` cannot pad, since it has various depths({depths}) of data. To view more ' \ + f"information please set logger's level to DEBUG." + if must_pad: + raise InconsistencyError(msg) + raise NoProperPadderError(msg) - depth = depths.pop() - shape_len = shape_lens.pop() - ele_dtype = list(catalog.values())[0][1] # 因为上面有except的情况,所以这样处理了 + # 再检查所有的元素 shape 是否一致? + shape_lens = set([len(v[0]) for v in catalog.values()]) + num_shape = len(shape_lens) + if num_shape != 1: + msg = f'Field:`{field_name}` cannot pad, since it has various shape length({shape_lens}) of data. To view more ' \ + f"information please set logger's level to DEBUG." + if must_pad: + raise InconsistencyError(msg) + raise NoProperPadderError(msg) - # 需要由 padder 自己决定是否能够 pad 。 - try: + # 再检查所有的元素 type 是否一致 + try: + ele_dtypes = set([v[1] for v in catalog.values()]) + except TypeError: + ele_dtypes = set([str(v[1]) for v in catalog.values()]) + num_eletypes = len(ele_dtypes) + if num_eletypes != 1: + msg = f'Field:`{field_name}` cannot pad, since it has various types({ele_dtypes}) of data. To view more ' \ + f"information please set logger's level to DEBUG." 
+ if must_pad: + raise InconsistencyError(msg) + raise NoProperPadderError(msg) + + depth = depths.pop() + shape_len = shape_lens.pop() + ele_dtype = list(catalog.values())[0][1] # 因为上面有except的情况,所以这样处理了 + + # 需要由 padder 自己决定是否能够 pad 。 if depth == 1 and shape_len == 0: # 形如 [0, 1, 2] 或 [True, False, True] if backend == 'raw': return RawNumberPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) diff --git a/fastNLP/core/dataloaders/jittor_dataloader/fdl.py b/fastNLP/core/dataloaders/jittor_dataloader/fdl.py index 0c51c37b..83555f6e 100644 --- a/fastNLP/core/dataloaders/jittor_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/jittor_dataloader/fdl.py @@ -221,13 +221,12 @@ def prepare_jittor_dataloader(ds_or_db, batch_size: int = 16, shuffle: bool = Fa 其他 key 不包含 'train' 字符串的数据集则使用 non_train_size 和 non_train_sampler 作为参数。最终根据 ``key: JittorDataLoader`` 组成 ``Dict[key, JittorDataLoader]`` 的字典返回。 - :param ds_or_db: 实现 __getitem__() 和 __len__() 的对象;或这种对象的序列;或字典。其取值只能为 ``[DataSet, DataBundle, - Dict[str, DataSet]]``. 
+ :param ds_or_db: 可以有以下三种取值, + + * ds_or_db 为 :class:`~fastNLP.io.DataBundle`, 返回值为 ``Dict[str, JittorDataLoader]`` 的字典 + * ds_or_db 为 ``Dict[str, DataSet]`` 字典, 返回值为 ``Dict[str, JittorDataLoader]`` 的字典 + * ds_or_db 为实现了 __getitem__() 和 __len__() 的对象 ,返回值为 :class:`~fastNLP.JittorDataLoader` - * ds_or_db 为 :class:`~fastNLP.core.dataset.DataSet`,返回值为 :class:`~fastNLP.core.dataloaders.JittorDataLoader` - * ds_or_db 为 :class:`~fastNLP.io.DataBundle`, 返回值为 :class:`Dict[str, JittorDataLoader]` 的字典 - * ds_or_db 为 :class:`Dict[str, DataSet]` 字典, 返回值也为 :class:`Dict[str, JittorDataLoader]` 的字典 - :param non_train_batch_size: 如果传入的 ``ds_or_db`` 为 :class:`Dict` 或 :class:`~fastNLP.io.DataBundle` 对象,可以通过改参数 设置名称不为 `train` 的其他 ``dataset`` 的 ``batch_size``。 默认为 ``16``。 :param batch_size: 批次大小,默认为 ``16`` 且当 batch_sampler 为 None 有效。 diff --git a/fastNLP/core/dataloaders/paddle_dataloader/fdl.py b/fastNLP/core/dataloaders/paddle_dataloader/fdl.py index 68992e50..37130e3e 100644 --- a/fastNLP/core/dataloaders/paddle_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/paddle_dataloader/fdl.py @@ -258,7 +258,7 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, non_train_batch_size: int = None) \ -> Union[Dict[str, PaddleDataLoader], PaddleDataLoader]: """ - ``prepare_paddle_dataloader`` 的功能是将输入的单个或多个 dataset 同时转为 ``PaddleDataloader``对象, 详见 :class:`~fastNLP.core.dataloaders.PaddleDataLoader`。 + ``prepare_paddle_dataloader`` 的功能是将输入的单个或多个 dataset 同时转为 ``PaddleDataloader``对象, 详见 :class:`~fastNLP.PaddleDataLoader`。 根据 ds_or_db 的类型 ``[DataSet, DataBundle, Dict[name, Dataset]]`` 不同而有不同返回结果, 具体如下: * 当 ds_or_db 为 ``DataSet``时,``prepare_paddle_dataloader`` 会将使用的除了 non_train_batch_size 和 non_train_sampler 以外的参数来 @@ -272,12 +272,11 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, 其他 key 不包含 'train' 字符串的数据集则使用 non_train_size 和 non_train_sampler 作为参数。最终根据 ``key: PaddleDataLoader`` 组成 ``Dict[key, PaddleDataLoader]`` 的字典返回。 - ::param ds_or_db: 实现 __getitem__() 和 
__len__() 的对象;或这种对象的序列;或字典。其取值只能为 ``[DataSet, DataBundle, - Dict[str, DataSet]]``. + :param ds_or_db: 可以有以下三种取值, - * ds_or_db 为 :class:`~fastNLP.core.dataset.DataSet`,返回值为:class:`~fastNLP.core.dataloaders.PaddleDataLoader` - * ds_or_db 为 :class:`~fastNLP.io.DataBundle`, 返回值为 ``Dict[str, PaddleDataLoader]`` 的字典 - * ds_or_db 为 ``Dict[str, DataSet]`` 字典, 返回值也为 ``Dict[str, PaddleDataLoader]`` 的字典 + * ds_or_db 为 :class:`~fastNLP.io.DataBundle`, 返回值为 ``Dict[str, PaddleDataLoader]`` 的字典 + * ds_or_db 为 ``Dict[str, DataSet]`` 字典, 返回值为 ``Dict[str, PaddleDataLoader]`` 的字典 + * ds_or_db 为实现了 __getitem__() 和 __len__() 的对象 ,返回值为 :class:`~fastNLP.PaddleDataLoader` :param feed_list: (list(Tensor)|tuple(Tensor)): feed Tensor list. 这个张量能被 :code:`paddle.static.data()` 创建。 如果:attr:`return_list` 是 ``False``, 那么 :attr:`feed_list` diff --git a/fastNLP/core/dataloaders/torch_dataloader/fdl.py b/fastNLP/core/dataloaders/torch_dataloader/fdl.py index 707c54ca..09fa2ff6 100644 --- a/fastNLP/core/dataloaders/torch_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/torch_dataloader/fdl.py @@ -227,7 +227,7 @@ def prepare_torch_dataloader(ds_or_db, non_train_batch_size: int = None) \ -> Union[TorchDataLoader, Dict[str, TorchDataLoader]]: """ - ``prepare_torch_dataloader`` 的功能是将输入的单个或多个 dataset 同时转为 ``TorchDataloader``对象, 详见 :class:`~fastNLP.core.dataloaders.TorchDataLoader`。 + ``prepare_torch_dataloader`` 的功能是将输入的单个或多个 dataset 同时转为 ``TorchDataloader``对象, 详见 :class:`~fastNLP.TorchDataLoader`。 根据 ds_or_db 的类型 ``[DataSet, DataBundle, Dict[name, Dataset]]`` 不同而有不同返回结果, 具体如下: * 当 ds_or_db 为 ``DataSet``时,``prepare_torch_dataloader`` 会将使用的除了 non_train_batch_size 和 non_train_sampler 以外的参数来 @@ -241,12 +241,11 @@ def prepare_torch_dataloader(ds_or_db, 其他 key 不包含 'train' 字符串的数据集则使用 non_train_size 和 non_train_sampler 作为参数。最终根据 ``key: TorchDataLoader`` 组成 ``Dict[key, TorchDataLoader]`` 的字典返回。 - :param ds_or_db: 实现 __getitem__() 和 __len__() 的对象;或这种对象的序列;或字典。其取值只能为 ``[DataSet, DataBundle, - Dict[str, DataSet]]``. 
+ :param ds_or_db: 可以有以下三种取值, - * ds_or_db 为 :class:`~fastNLP.core.dataset.DataSet`,返回值为:class:`~fastNLP.core.dataloaders.TorchDataLoader` * ds_or_db 为 :class:`~fastNLP.io.DataBundle`, 返回值为 ``Dict[str, TorchDataLoader]`` 的字典 - * ds_or_db 为 ``Dict[str, DataSet]`` 字典, 返回值也为 ``Dict[str, TorchDataLoader]`` 的字典 + * ds_or_db 为 ``Dict[str, DataSet]`` 字典, 返回值为 ``Dict[str, TorchDataLoader]`` 的字典 + * ds_or_db 为实现了 __getitem__() 和 __len__() 的对象 ,返回值为:class:`~fastNLP.TorchDataLoader` :param batch_size: 批次大小,默认为 ``16`` 且当 batch_sampler 为 None 有效。 :param non_train_batch_size: 非 'train' 数据集的 ``TorchDataLoader`` 批次大小,默认为 ``16`` 且当 batch_sampler 为 None 有效。 diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 5ea9378b..ebd56330 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -84,8 +84,6 @@ __all__ = [ "BQCorpusLoader", "LCQMCLoader", - "CoReferenceLoader", - "CMRC2018Loader" ] @@ -95,7 +93,6 @@ from .classification import CLSBaseLoader, YelpFullLoader, YelpPolarityLoader, A MRLoader, R8Loader, R52Loader, OhsumedLoader, NG20Loader from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader -from .coreference import CoReferenceLoader from .csv import CSVLoader from .cws import CWSLoader from .json import JsonLoader @@ -103,5 +100,3 @@ from .loader import Loader from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, \ LCQMCLoader from .qa import CMRC2018Loader - - diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index b15d31f7..0b597398 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -56,7 +56,7 @@ class ConllLoader(Loader): r""" :param list headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 - :param list sep: 指定分隔符,默认为制表符 + :param str sep: 指定分隔符,默认为制表符 :param list indexes: 需要保留的数据列下标,从0开始。若为 
``None`` ,则所有列都保留。Default: ``None`` :param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` :param bool drophashtag: 是否忽略以 ``#`` 开头的句子。 diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py deleted file mode 100644 index 66a39749..00000000 --- a/fastNLP/io/loader/coreference.py +++ /dev/null @@ -1,64 +0,0 @@ -r"""undocumented""" - -__all__ = [ - "CoReferenceLoader", -] - -from ...core.dataset import DataSet -from ..file_reader import _read_json -from fastNLP.core.dataset import Instance -# from ...core.const import Const -from .json import JsonLoader - - -class CoReferenceLoader(JsonLoader): - r""" - 原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息,speakers包含每句话的说话者信息,cluster是指向现实中同一个事物的聚集,sentences是文本信息内容。 - - Example:: - - {"doc_key": "bc/cctv/00/cctv_0000_0", - "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], - "clusters": [[[70, 70], [485, 486], [500, 500], [73, 73], [55, 55], [153, 154], [366, 366]]], - "sentences": [["In", "the", "summer", "of", "2005", ",", "a", "picture", "that", "people", "have", "long", "been", "looking", "forward", "to", "started", "emerging", "with", "frequency", "in", "various", 
"major", "Hong", "Kong", "media", "."], ["With", "their", "unique", "charm", ",", "these", "well", "-", "known", "cartoon", "images", "once", "again", "caused", "Hong", "Kong", "to", "be", "a", "focus", "of", "worldwide", "attention", "."]] - } - - 读取预处理好的Conll2012数据,数据结构如下: - - .. csv-table:: - :header: "raw_words1", "raw_words2", "raw_words3", "raw_words4" - - "bc/cctv/00/cctv_0000_0", "[['Speaker#1', 'Speaker#1', 'Speaker#1...", "[[[70, 70], [485, 486], [500, 500], [7...", "[['In', 'the', 'summer', 'of', '2005',..." - "...", "...", "...", "..." - - """ - def __init__(self, fields=None, dropna=False): - super().__init__(fields, dropna) - self.fields = {"doc_key": "raw_words1", "speakers": "raw_words2", "clusters": "raw_words3", - "sentences": "raw_words4"} - - def _load(self, path): - r""" - 加载数据 - :param path: 数据文件路径,文件为json - - :return: - """ - dataset = DataSet() - for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): - if self.fields: - ins = {self.fields[k]: v for k, v in d.items()} - else: - ins = d - dataset.append(Instance(**ins)) - return dataset - - def download(self): - r""" - 由于版权限制,不能提供自动下载功能。可参考 - - https://www.aclweb.org/anthology/W12-4501 - - :return: - """ - raise RuntimeError("CoReference cannot be downloaded automatically.") diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 5d269cc5..05a82806 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -54,7 +54,6 @@ __all__ = [ "GranularizePipe", "MachingTruncatePipe", - "CoReferencePipe", "CMRC2018BertPipe", @@ -72,7 +71,6 @@ from .classification import CLSBasePipe, YelpFullPipe, YelpPolarityPipe, SSTPipe WeiboSenti100kPipe, AGsNewsPipe, DBPediaPipe, MRPipe, R8Pipe, R52Pipe, OhsumedPipe, NG20Pipe from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe from .conll import Conll2003Pipe, iob2, iob2bioes -from .coreference import CoReferencePipe from .cws import CWSPipe from .matching 
import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CNXNLIBertPipe, CNXNLIPipe, BQCorpusBertPipe, \ diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index cb328acc..98363fce 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -37,7 +37,7 @@ from fastNLP.core.log import logger class CLSBasePipe(Pipe): - def __init__(self, lower: bool = False, tokenizer: str = 'spacy', lang='en'): + def __init__(self, lower: bool = False, tokenizer: str = 'raw', lang='en'): super().__init__() self.lower = lower self.tokenizer = get_tokenizer(tokenizer, lang=lang) @@ -81,8 +81,6 @@ class CLSBasePipe(Pipe): for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len('words') - data_bundle.set_input('words', 'seq_len', 'target') - return data_bundle def process_from_file(self, paths) -> DataBundle: @@ -409,11 +407,11 @@ class SST2Pipe(CLSBasePipe): """ - def __init__(self, lower=False, tokenizer='spacy'): + def __init__(self, lower=False, tokenizer='raw'): r""" :param bool lower: 是否对输入进行小写化。 - :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。 """ super().__init__(lower=lower, tokenizer=tokenizer, lang='en') @@ -594,8 +592,6 @@ class ChnSentiCorpPipe(Pipe): for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len('chars') - data_bundle.set_input(*input_fields, *target_fields) - return data_bundle def process_from_file(self, paths=None): @@ -707,8 +703,6 @@ class THUCNewsPipe(CLSBasePipe): input_fields = ['target', 'seq_len'] + input_field_names target_fields = ['target'] - data_bundle.set_input(*input_fields, *target_fields) - return data_bundle def process_from_file(self, paths=None): @@ -809,8 +803,6 @@ class WeiboSenti100kPipe(CLSBasePipe): input_fields = ['target', 'seq_len'] + input_field_names target_fields = 
['target'] - - data_bundle.set_input(*input_fields, *target_fields) return data_bundle diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index bb3b1d51..43982363 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -72,8 +72,6 @@ class _NERPipe(Pipe): for name, dataset in data_bundle.iter_datasets(): dataset.add_seq_len('words') - - data_bundle.set_input(*input_fields, *target_fields) return data_bundle @@ -202,8 +200,6 @@ class Conll2003Pipe(Pipe): for name, dataset in data_bundle.iter_datasets(): dataset.add_seq_len('words') - - data_bundle.set_input(*input_fields, *target_fields) return data_bundle @@ -325,8 +321,6 @@ class _CNNERPipe(Pipe): for name, dataset in data_bundle.iter_datasets(): dataset.add_seq_len('chars') - - data_bundle.set_input(*input_fields, *target_fields) return data_bundle diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py deleted file mode 100644 index 6d35cd1b..00000000 --- a/fastNLP/io/pipe/coreference.py +++ /dev/null @@ -1,186 +0,0 @@ -r"""undocumented""" - -__all__ = [ - "CoReferencePipe" -] - -import collections - -import numpy as np - -from fastNLP.core.vocabulary import Vocabulary -from .pipe import Pipe -from ..data_bundle import DataBundle -from ..loader.coreference import CoReferenceLoader - - -# from ...core.const import Const - - -class CoReferencePipe(Pipe): - r""" - 对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。 - - 处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: - - .. 
csv-table:: - :header: "words1", "words2","words3","words4","chars","seq_len","target" - - "bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" - "[...]", "[...]","[...]","[...]","[...]","[...]","[...]" - - dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: - - +-------------+-----------+--------+-------+---------+ - | field_names | raw_chars | target | chars | seq_len | - +-------------+-----------+--------+-------+---------+ - | is_input | False | True | True | True | - | is_target | False | True | False | True | - | ignore_type | | False | False | False | - | pad_value | | 0 | 0 | 0 | - +-------------+-----------+--------+-------+---------+ - - """ - - def __init__(self, config): - super().__init__() - self.config = config - - def process(self, data_bundle: DataBundle): - r""" - 对load进来的数据进一步处理原始数据包含:raw_key,raw_speaker,raw_words,raw_clusters - - .. csv-table:: - :header: "raw_key", "raw_speaker","raw_words","raw_clusters" - - "bc/cctv/00/cctv_0000_0", "[[Speaker#1, Speaker#1],[]]","[['I','am'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" - "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" - "[...]", "[...]","[...]","[...]" - - - :param data_bundle: - :return: - """ - genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])} - vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='raw_words4') - vocab.build_vocab() - word2id = vocab.word2idx - data_bundle.set_vocab(vocab, 'words1') - if self.config.char_path: - char_dict = get_char_dict(self.config.char_path) - else: - char_set = set() - for i, w in enumerate(word2id): - if i < 2: - continue - for c in w: - char_set.add(c) - - char_dict = collections.defaultdict(int) - char_dict.update({c: i for i, c in enumerate(char_set)}) - - for name, ds in data_bundle.iter_datasets(): - # genre - ds.apply(lambda x: genres[x['raw_words1'][:2]], 
new_field_name='words1') - - # speaker_ids_np - ds.apply(lambda x: speaker2numpy(x['raw_words2'], self.config.max_sentences, is_train=name == 'train'), - new_field_name='words2') - - # sentences - ds.rename_field('raw_words4', 'words3') - - # doc_np - ds.apply(lambda x: doc2numpy(x['words3'], word2id, char_dict, max(self.config.filter), - self.config.max_sentences, is_train=name == 'train')[0], - new_field_name='words4') - # char_index - ds.apply(lambda x: doc2numpy(x['words3'], word2id, char_dict, max(self.config.filter), - self.config.max_sentences, is_train=name == 'train')[1], - new_field_name='chars') - # seq len - ds.apply(lambda x: doc2numpy(x['words3'], word2id, char_dict, max(self.config.filter), - self.config.max_sentences, is_train=name == 'train')[2], - new_field_name='seq_len') - - # clusters - ds.rename_field('raw_words3', 'target') - - ds.set_input('words1', 'words2', 'words3', 'words4', 'chars', 'seq_len', 'target') - - return data_bundle - - def process_from_file(self, paths): - bundle = CoReferenceLoader().load(paths) - return self.process(bundle) - - -# helper - -def doc2numpy(doc, word2id, chardict, max_filter, max_sentences, is_train): - docvec, char_index, length, max_len = _doc2vec(doc, word2id, chardict, max_filter, max_sentences, is_train) - assert max(length) == max_len - assert char_index.shape[0] == len(length) - assert char_index.shape[1] == max_len - doc_np = np.zeros((len(docvec), max_len), int) - for i in range(len(docvec)): - for j in range(len(docvec[i])): - doc_np[i][j] = docvec[i][j] - return doc_np, char_index, length - - -def _doc2vec(doc, word2id, char_dict, max_filter, max_sentences, is_train): - max_len = 0 - max_word_length = 0 - docvex = [] - length = [] - if is_train: - sent_num = min(max_sentences, len(doc)) - else: - sent_num = len(doc) - - for i in range(sent_num): - sent = doc[i] - length.append(len(sent)) - if (len(sent) > max_len): - max_len = len(sent) - sent_vec = [] - for j, word in enumerate(sent): - if 
len(word) > max_word_length: - max_word_length = len(word) - if word in word2id: - sent_vec.append(word2id[word]) - else: - sent_vec.append(word2id["UNK"]) - docvex.append(sent_vec) - - char_index = np.zeros((sent_num, max_len, max_word_length), dtype=int) - for i in range(sent_num): - sent = doc[i] - for j, word in enumerate(sent): - char_index[i, j, :len(word)] = [char_dict[c] for c in word] - - return docvex, char_index, length, max_len - - -def speaker2numpy(speakers_raw, max_sentences, is_train): - if is_train and len(speakers_raw) > max_sentences: - speakers_raw = speakers_raw[0:max_sentences] - speakers = flatten(speakers_raw) - speaker_dict = {s: i for i, s in enumerate(set(speakers))} - speaker_ids = np.array([speaker_dict[s] for s in speakers]) - return speaker_ids - - -# 展平 -def flatten(l): - return [item for sublist in l for item in sublist] - - -def get_char_dict(path): - vocab = [""] - with open(path) as f: - vocab.extend(c.strip() for c in f.readlines()) - char_dict = collections.defaultdict(int) - char_dict.update({c: i for i, c in enumerate(vocab)}) - return char_dict diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py index 9ef19097..5983201e 100644 --- a/fastNLP/io/pipe/cws.py +++ b/fastNLP/io/pipe/cws.py @@ -262,8 +262,6 @@ class CWSPipe(Pipe): target_fields = ['target', 'seq_len'] for name, dataset in data_bundle.iter_datasets(): dataset.add_seq_len('chars') - - data_bundle.set_input(*input_fields, *target_fields) return data_bundle diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 52446d9b..a89f2f2b 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -161,11 +161,7 @@ class MatchingBertPipe(Pipe): for name, dataset in data_bundle.iter_datasets(): dataset.add_seq_len('words') - dataset.set_input(*input_fields) - for fields in target_fields: - if dataset.has_field(fields): - dataset.set_input(fields) - + return data_bundle @@ -311,11 +307,7 @@ class MatchingPipe(Pipe): for name, 
dataset in data_bundle.datasets.items(): dataset.add_seq_len('words1', 'seq_len1') dataset.add_seq_len('words2', 'seq_len2') - dataset.set_input(*input_fields) - for fields in target_fields: - if dataset.has_field(fields): - dataset.set_input(fields) - + return data_bundle diff --git a/fastNLP/io/pipe/qa.py b/fastNLP/io/pipe/qa.py index 4e2a977c..23fe1367 100644 --- a/fastNLP/io/pipe/qa.py +++ b/fastNLP/io/pipe/qa.py @@ -135,8 +135,6 @@ class CMRC2018BertPipe(Pipe): src_vocab.index_dataset(*data_bundle.datasets.values(), field_name='raw_chars', new_field_name='chars') data_bundle.set_vocab(src_vocab, 'chars') - data_bundle.set_input('chars', 'raw_chars', 'answers', 'target_start', 'target_end', 'context_len') - return data_bundle def process_from_file(self, paths=None) -> DataBundle: diff --git a/fastNLP/io/pipe/summarization.py b/fastNLP/io/pipe/summarization.py index 5b38f344..359801c4 100644 --- a/fastNLP/io/pipe/summarization.py +++ b/fastNLP/io/pipe/summarization.py @@ -80,9 +80,6 @@ class ExtCNNDMPipe(Pipe): data_bundle = _drop_empty_instance(data_bundle, "label") - # set input and target - data_bundle.set_input('words', 'seq_len', 'target', 'seq_len') - # print("[INFO] Load existing vocab from %s!" 
% self.vocab_path) word_list = [] with open(self.vocab_path, 'r', encoding='utf8') as vocab_f: diff --git a/fastNLP/modules/torch/__init__.py b/fastNLP/modules/torch/__init__.py index da92ab9c..47e39bb5 100755 --- a/fastNLP/modules/torch/__init__.py +++ b/fastNLP/modules/torch/__init__.py @@ -5,6 +5,7 @@ __all__ = [ "Seq2SeqDecoder", "LSTMSeq2SeqDecoder", "TransformerSeq2SeqDecoder", + "MLP", "LSTM", "Seq2SeqEncoder", diff --git a/fastNLP/modules/torch/decoder/__init__.py b/fastNLP/modules/torch/decoder/__init__.py index 8181d271..601330d7 100755 --- a/fastNLP/modules/torch/decoder/__init__.py +++ b/fastNLP/modules/torch/decoder/__init__.py @@ -7,9 +7,12 @@ __all__ = [ "Seq2SeqDecoder", "LSTMSeq2SeqDecoder", - "TransformerSeq2SeqDecoder" + "TransformerSeq2SeqDecoder", + + "MLP" ] from .crf import ConditionalRandomField, allowed_transitions from .seq2seq_state import State -from .seq2seq_decoder import LSTMSeq2SeqDecoder, TransformerSeq2SeqDecoder, Seq2SeqDecoder \ No newline at end of file +from .seq2seq_decoder import LSTMSeq2SeqDecoder, TransformerSeq2SeqDecoder, Seq2SeqDecoder +from .mlp import MLP \ No newline at end of file diff --git a/fastNLP/modules/torch/decoder/mlp.py b/fastNLP/modules/torch/decoder/mlp.py new file mode 100755 index 00000000..b18a793d --- /dev/null +++ b/fastNLP/modules/torch/decoder/mlp.py @@ -0,0 +1,97 @@ +r"""undocumented""" + +__all__ = [ + "MLP" +] + +import torch +import torch.nn as nn + + +class MLP(nn.Module): + r""" + 多层感知器 + + + .. 
note:: + 隐藏层的激活函数通过activation定义。一个str/function或者一个str/function的list可以被传入activation。 + 如果只传入了一个str/function,那么所有隐藏层的激活函数都由这个str/function定义; + 如果传入了一个str/function的list,那么每一个隐藏层的激活函数由这个list中对应的元素定义,其中list的长度为隐藏层数。 + 输出层的激活函数由output_activation定义,默认值为None,此时输出层没有激活函数。 + + Examples:: + + >>> net1 = MLP([5, 10, 5]) + >>> net2 = MLP([5, 10, 5], 'tanh') + >>> net3 = MLP([5, 6, 7, 8, 5], 'tanh') + >>> net4 = MLP([5, 6, 7, 8, 5], 'relu', output_activation='tanh') + >>> net5 = MLP([5, 6, 7, 8, 5], ['tanh', 'relu', 'tanh'], 'tanh') + >>> for net in [net1, net2, net3, net4, net5]: + >>> x = torch.randn(5, 5) + >>> y = net(x) + >>> print(x) + >>> print(y) + """ + + def __init__(self, size_layer, activation='relu', output_activation=None, initial_method=None, dropout=0.0): + r""" + + :param List[int] size_layer: 一个int的列表,用来定义MLP的层数,列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1 + :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表,用来定义每一个隐层的激活函数,字符串包括relu,tanh和 + sigmoid,默认值为relu + :param Union[str,func] output_activation: 字符串或者函数,用来定义输出层的激活函数,默认值为None,表示输出层没有激活函数 + :param str initial_method: 参数初始化方式 + :param float dropout: dropout概率,默认值为0 + """ + super(MLP, self).__init__() + self.hiddens = nn.ModuleList() + self.output = None + self.output_activation = output_activation + for i in range(1, len(size_layer)): + if i + 1 == len(size_layer): + self.output = nn.Linear(size_layer[i - 1], size_layer[i]) + else: + self.hiddens.append(nn.Linear(size_layer[i - 1], size_layer[i])) + + self.dropout = nn.Dropout(p=dropout) + + actives = { + 'relu': nn.ReLU(), + 'tanh': nn.Tanh(), + 'sigmoid': nn.Sigmoid(), + } + if not isinstance(activation, list): + activation = [activation] * (len(size_layer) - 2) + elif len(activation) == len(size_layer) - 2: + pass + else: + raise ValueError( + f"the length of activation function list except {len(size_layer) - 2} but got {len(activation)}!") + self.hidden_active = [] + for func in activation: + if callable(func): + self.hidden_active.append(func) 
+ elif func.lower() in actives: + self.hidden_active.append(actives[func]) + else: + raise ValueError("should set activation correctly: {}".format(activation)) + if self.output_activation is not None: + if callable(self.output_activation): + pass + elif self.output_activation.lower() in actives: + self.output_activation = actives[self.output_activation] + else: + raise ValueError("should set activation correctly: {}".format(activation)) + + def forward(self, x): + r""" + :param torch.Tensor x: MLP接受的输入 + :return: torch.Tensor : MLP的输出结果 + """ + for layer, func in zip(self.hiddens, self.hidden_active): + x = self.dropout(func(layer(x))) + x = self.output(x) + if self.output_activation is not None: + x = self.output_activation(x) + x = self.dropout(x) + return x diff --git a/tests/core/dataloaders/torch_dataloader/test_fdl.py b/tests/core/dataloaders/torch_dataloader/test_fdl.py index 1be34a1a..54bf5390 100644 --- a/tests/core/dataloaders/torch_dataloader/test_fdl.py +++ b/tests/core/dataloaders/torch_dataloader/test_fdl.py @@ -4,10 +4,10 @@ from fastNLP.core.dataloaders.torch_dataloader import TorchDataLoader, prepare_t from fastNLP.core.dataset import DataSet from fastNLP.io.data_bundle import DataBundle from fastNLP.envs.imports import _NEED_IMPORT_TORCH -from fastNLP.core import Trainer from pkg_resources import parse_version from tests.helpers.utils import Capturing, recover_logger from fastNLP import logger +import numpy as np if _NEED_IMPORT_TORCH: import torch @@ -141,6 +141,13 @@ class TestFdl: dl_dict1 = prepare_torch_dataloader(ds_dict1) assert isinstance(dl_dict1['train_1'], TorchDataLoader) assert isinstance(dl_dict1['val'], TorchDataLoader) + + ds = [[1, [1]], [2, [2, 2]]] + dl = prepare_torch_dataloader(ds, batch_size=2) + for batch in dl: + assert (batch[0] == torch.LongTensor([1, 2])).sum()==2 + assert (batch[1] == torch.LongTensor([[1, 0], [2, 2]])).sum()==4 + # sequence = [ds, ds1] # seq_ds = prepare_torch_dataloader(sequence) # assert 
isinstance(seq_ds[0], TorchDataLoader) diff --git a/tests/data_for_tests/conll_2003_example.txt b/tests/data_for_tests/conll_2003_example.txt new file mode 100755 index 00000000..d11a8264 --- /dev/null +++ b/tests/data_for_tests/conll_2003_example.txt @@ -0,0 +1,442 @@ +-DOCSTART- -X- -X- O + +SOCCER NN B-NP O +- : O O +JAPAN NNP B-NP B-LOC +GET VB B-VP O +LUCKY NNP B-NP O +WIN NNP I-NP O +, , O O +CHINA NNP B-NP B-PER +IN IN B-PP O +SURPRISE DT B-NP O +DEFEAT NN I-NP O +. . O O + +Nadim NNP B-NP B-PER +Ladki NNP I-NP I-PER + +AL-AIN NNP B-NP B-LOC +, , O O +United NNP B-NP B-LOC +Arab NNP I-NP I-LOC +Emirates NNPS I-NP I-LOC +1996-12-06 CD I-NP O + +Japan NNP B-NP B-LOC +began VBD B-VP O +the DT B-NP O +defence NN I-NP O +of IN B-PP O +their PRP$ B-NP O +Asian JJ I-NP B-MISC +Cup NNP I-NP I-MISC +title NN I-NP O +with IN B-PP O +a DT B-NP O +lucky JJ I-NP O +2-1 CD I-NP O +win VBP B-VP O +against IN B-PP O +Syria NNP B-NP B-LOC +in IN B-PP O +a DT B-NP O +Group NNP I-NP O +C NNP I-NP O +championship NN I-NP O +match NN I-NP O +on IN B-PP O +Friday NNP B-NP O +. . O O + +But CC O O +China NNP B-NP B-LOC +saw VBD B-VP O +their PRP$ B-NP O +luck NN I-NP O +desert VB B-VP O +them PRP B-NP O +in IN B-PP O +the DT B-NP O +second NN I-NP O +match NN I-NP O +of IN B-PP O +the DT B-NP O +group NN I-NP O +, , O O +crashing VBG B-VP O +to TO B-PP O +a DT B-NP O +surprise NN I-NP O +2-0 CD I-NP O +defeat NN I-NP O +to TO B-PP O +newcomers NNS B-NP O +Uzbekistan NNP I-NP B-LOC +. . 
O O + +China NNP B-NP B-LOC +controlled VBD B-VP O +most JJS B-NP O +of IN B-PP O +the DT B-NP O +match NN I-NP O +and CC O O +saw VBD B-VP O +several JJ B-NP O +chances NNS I-NP O +missed VBD B-VP O +until IN B-SBAR O +the DT B-NP O +78th JJ I-NP O +minute NN I-NP O +when WRB B-ADVP O +Uzbek NNP B-NP B-MISC +striker NN I-NP O +Igor JJ B-NP B-PER +Shkvyrin NNP I-NP I-PER +took VBD B-VP O +advantage NN B-NP O +of IN B-PP O +a DT B-NP O +misdirected JJ I-NP O +defensive JJ I-NP O +header NN I-NP O +to TO B-VP O +lob VB I-VP O +the DT B-NP O +ball NN I-NP O +over IN B-PP O +the DT B-NP O +advancing VBG I-NP O +Chinese JJ I-NP B-MISC +keeper NN I-NP O +and CC O O +into IN B-PP O +an DT B-NP O +empty JJ I-NP O +net NN I-NP O +. . O O + +Oleg NNP B-NP B-PER +Shatskiku NNP I-NP I-PER +made VBD B-VP O +sure JJ B-ADJP O +of IN B-PP O +the DT B-NP O +win VBP B-VP O +in IN B-PP O +injury NN B-NP O +time NN I-NP O +, , O O +hitting VBG B-VP O +an DT B-NP O +unstoppable JJ I-NP O +left VBD B-VP O +foot NN B-NP O +shot NN I-NP O +from IN B-PP O +just RB B-NP O +outside IN B-PP O +the DT B-NP O +area NN I-NP O +. . O O + +The DT B-NP O +former JJ I-NP O +Soviet JJ I-NP B-MISC +republic NN I-NP O +was VBD B-VP O +playing VBG I-VP O +in IN B-PP O +an DT B-NP O +Asian NNP I-NP B-MISC +Cup NNP I-NP I-MISC +finals NNS I-NP O +tie NN I-NP O +for IN B-PP O +the DT B-NP O +first JJ I-NP O +time NN I-NP O +. . O O + +Despite IN B-PP O +winning VBG B-VP O +the DT B-NP O +Asian JJ I-NP B-MISC +Games NNPS I-NP I-MISC +title NN I-NP O +two CD B-NP O +years NNS I-NP O +ago RB B-ADVP O +, , O O +Uzbekistan NNP B-NP B-LOC +are VBP B-VP O +in IN B-PP O +the DT B-NP O +finals NNS I-NP O +as IN B-SBAR O +outsiders NNS B-NP O +. . 
O O + +Two CD B-NP O +goals NNS I-NP O +from IN B-PP O +defensive JJ B-NP O +errors NNS I-NP O +in IN B-PP O +the DT B-NP O +last JJ I-NP O +six CD I-NP O +minutes NNS I-NP O +allowed VBD B-VP O +Japan NNP B-NP B-LOC +to TO B-VP O +come VB I-VP O +from IN B-PP O +behind NN B-NP O +and CC O O +collect VB B-VP O +all DT B-NP O +three CD I-NP O +points NNS I-NP O +from IN B-PP O +their PRP$ B-NP O +opening NN I-NP O +meeting NN I-NP O +against IN B-PP O +Syria NNP B-NP B-LOC +. . O O + +Takuya NNP B-NP B-PER +Takagi NNP I-NP I-PER +scored VBD B-VP O +the DT B-NP O +winner NN I-NP O +in IN B-PP O +the DT B-NP O +88th JJ I-NP O +minute NN I-NP O +, , O O +rising VBG B-VP O +to TO I-VP O +head VB I-VP O +a DT B-NP O +Hiroshige NNP I-NP B-PER +Yanagimoto NNP I-NP I-PER +cross VB B-VP O +towards IN B-PP O +the DT B-NP O +Syrian JJ I-NP B-MISC +goal NN I-NP O +which WDT B-NP O +goalkeeper VBD B-VP O +Salem NNP B-NP B-PER +Bitar NNP I-NP I-PER +appeared VBD B-VP O +to TO I-VP O +have VB I-VP O +covered VBN I-VP O +but CC O O +then RB B-VP O +allowed VBN I-VP O +to TO I-VP O +slip VB I-VP O +into IN B-PP O +the DT B-NP O +net NN I-NP O +. . O O + +It PRP B-NP O +was VBD B-VP O +the DT B-NP O +second JJ I-NP O +costly JJ I-NP O +blunder NN I-NP O +by IN B-PP O +Syria NNP B-NP B-LOC +in IN B-PP O +four CD B-NP O +minutes NNS I-NP O +. . O O + +Defender NNP B-NP O +Hassan NNP I-NP B-PER +Abbas NNP I-NP I-PER +rose VBD B-VP O +to TO I-VP O +intercept VB I-VP O +a DT B-NP O +long JJ I-NP O +ball NN I-NP O +into IN B-PP O +the DT B-NP O +area NN I-NP O +in IN B-PP O +the DT B-NP O +84th JJ I-NP O +minute NN I-NP O +but CC O O +only RB B-ADVP O +managed VBD B-VP O +to TO I-VP O +divert VB I-VP O +it PRP B-NP O +into IN B-PP O +the DT B-NP O +top JJ I-NP O +corner NN I-NP O +of IN B-PP O +Bitar NN B-NP B-PER +'s POS B-NP O +goal NN I-NP O +. . 
O O + +Nader NNP B-NP B-PER +Jokhadar NNP I-NP I-PER +had VBD B-VP O +given VBN I-VP O +Syria NNP B-NP B-LOC +the DT B-NP O +lead NN I-NP O +with IN B-PP O +a DT B-NP O +well-struck NN I-NP O +header NN I-NP O +in IN B-PP O +the DT B-NP O +seventh JJ I-NP O +minute NN I-NP O +. . O O + +Japan NNP B-NP B-LOC +then RB B-ADVP O +laid VBD B-VP O +siege NN B-NP O +to TO B-PP O +the DT B-NP O +Syrian JJ I-NP B-MISC +penalty NN I-NP O +area NN I-NP O +for IN B-PP O +most JJS B-NP O +of IN B-PP O +the DT B-NP O +game NN I-NP O +but CC O O +rarely RB B-VP O +breached VBD I-VP O +the DT B-NP O +Syrian JJ I-NP B-MISC +defence NN I-NP O +. . O O + +Bitar NN B-NP B-PER +pulled VBD B-VP O +off RP B-PRT O +fine JJ B-NP O +saves VBZ B-VP O +whenever WRB B-ADVP O +they PRP B-NP O +did VBD B-VP O +. . O O + +Japan NNP B-NP B-LOC +coach NN I-NP O +Shu NNP I-NP B-PER +Kamo NNP I-NP I-PER +said VBD B-VP O +: : O O +' '' O O +' POS B-NP O +The DT I-NP O +Syrian JJ I-NP B-MISC +own JJ I-NP O +goal NN I-NP O +proved VBD B-VP O +lucky JJ B-ADJP O +for IN B-PP O +us PRP B-NP O +. . O O + +The DT B-NP O +Syrians NNPS I-NP B-MISC +scored VBD B-VP O +early JJ B-NP O +and CC O O +then RB B-VP O +played VBN I-VP O +defensively RB B-ADVP O +and CC O O +adopted VBD B-VP O +long RB I-VP O +balls VBZ I-VP O +which WDT B-NP O +made VBD B-VP O +it PRP B-NP O +hard JJ B-ADJP O +for IN B-PP O +us PRP B-NP O +. . O O +' '' O O + +' '' O O + +Japan NNP B-NP B-LOC +, , O O +co-hosts VBZ B-VP O +of IN B-PP O +the DT B-NP O +World NNP I-NP B-MISC +Cup NNP I-NP I-MISC +in IN B-PP O +2002 CD B-NP O +and CC O O +ranked VBD B-VP O +20th JJ B-NP O +in IN B-PP O +the DT B-NP O +world NN I-NP O +by IN B-PP O +FIFA NNP B-NP B-ORG +, , O O +are VBP B-VP O +favourites JJ B-ADJP O +to TO B-VP O +regain VB I-VP O +their PRP$ B-NP O +title NN I-NP O +here RB B-ADVP O +. . 
O O + +Hosts NNPS B-NP O +UAE NNP I-NP B-LOC +play NN I-NP O +Kuwait NNP I-NP B-LOC +and CC O O +South NNP B-NP B-LOC +Korea NNP I-NP I-LOC +take VBP B-VP O +on IN B-PP O +Indonesia NNP B-NP B-LOC +on IN B-PP O +Saturday NNP B-NP O +in IN B-PP O +Group NNP B-NP O +A NNP I-NP O +matches VBZ B-VP O +. . O O + +All DT B-NP O +four CD I-NP O +teams NNS I-NP O +are VBP B-VP O +level NN B-NP O +with IN B-PP O +one CD B-NP O +point NN I-NP O +each DT B-NP O +from IN B-PP O +one CD B-NP O +game NN I-NP O +. . O O \ No newline at end of file diff --git a/tests/data_for_tests/conll_example.txt b/tests/data_for_tests/conll_example.txt new file mode 100755 index 00000000..14fac0ad --- /dev/null +++ b/tests/data_for_tests/conll_example.txt @@ -0,0 +1,15 @@ +1 I _ PRP PRP _ 2 SUB +2 solved _ VBD VBD _ 0 ROOT +3 the _ DT DT _ 4 NMOD +4 problem _ NN NN _ 2 OBJ +5 with _ IN IN _ 2 VMOD +6 statistics _ NNS NNS _ 5 PMOD +7 . _ . . _ 2 P + +1 I _ PRP PRP _ 2 SUB +2 solved _ VBD VBD _ 0 ROOT +3 the _ DT DT _ 4 NMOD +4 problem _ NN NN _ 2 OBJ +5 with _ IN IN _ 2 VMOD +6 statistics _ NNS NNS _ 5 PMOD +7 . _ . . _ 2 P diff --git a/tests/data_for_tests/cws_pku_utf_8 b/tests/data_for_tests/cws_pku_utf_8 new file mode 100755 index 00000000..fa75a4a1 --- /dev/null +++ b/tests/data_for_tests/cws_pku_utf_8 @@ -0,0 +1,56 @@ +迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) +中共中央 总书记 、 国家 主席 江 泽民 +( 一九九七年 十二月 三十一日 ) +12月 31日 , 中共中央 总书记 、 国家 主席 江 泽民 发表 1998年 新年 讲话 《 迈向 充满 希望 的 新 世纪 》 。 ( 新华社 记者 兰 红光 摄 ) +同胞 们 、 朋友 们 、 女士 们 、 先生 们 : +在 1998年 来临 之际 , 我 十分 高兴 地 通过 中央 人民 广播 电台 、 中国 国际 广播 电台 和 中央 电视台 , 向 全国 各族 人民 , 向 香港 特别 行政区 同胞 、 澳门 和 台湾 同胞 、 海外 侨胞 , 向 世界 各国 的 朋友 们 , 致以 诚挚 的 问候 和 良好 的 祝愿 ! 
+1997年 , 是 中国 发展 历史 上 非常 重要 的 很 不 平凡 的 一 年 。 中国 人民 决心 继承 邓 小平 同志 的 遗志 , 继续 把 建设 有 中国 特色 社会主义 事业 推向 前进 。 中国 政府 顺利 恢复 对 香港 行使 主权 , 并 按照 “ 一国两制 ” 、 “ 港人治港 ” 、 高度 自治 的 方针 保持 香港 的 繁荣 稳定 。 中国 共产党 成功 地 召开 了 第十五 次 全国 代表大会 , 高举 邓小平理论 伟大 旗帜 , 总结 百年 历史 , 展望 新 的 世纪 , 制定 了 中国 跨 世纪 发展 的 行动 纲领 。 +在 这 一 年 中 , 中国 的 改革 开放 和 现代化 建设 继续 向前 迈进 。 国民经济 保持 了 “ 高 增长 、 低 通胀 ” 的 良好 发展 态势 。 农业 生产 再次 获得 好 的 收成 , 企业 改革 继续 深化 , 人民 生活 进一步 改善 。 对外 经济 技术 合作 与 交流 不断 扩大 。 民主 法制 建设 、 精神文明 建设 和 其他 各项 事业 都 有 新 的 进展 。 我们 十分 关注 最近 一个 时期 一些 国家 和 地区 发生 的 金融 风波 , 我们 相信 通过 这些 国家 和 地区 的 努力 以及 有关 的 国际 合作 , 情况 会 逐步 得到 缓解 。 总的来说 , 中国 改革 和 发展 的 全局 继续 保持 了 稳定 。 +在 这 一 年 中 , 中国 的 外交 工作 取得 了 重要 成果 。 通过 高层 互访 , 中国 与 美国 、 俄罗斯 、 法国 、 日本 等 大国 确定 了 双方 关系 未来 发展 的 目标 和 指导 方针 。 中国 与 周边 国家 和 广大 发展中国家 的 友好 合作 进一步 加强 。 中国 积极 参与 亚太经合 组织 的 活动 , 参加 了 东盟 — 中 日 韩 和 中国 — 东盟 首脑 非正式 会晤 。 这些 外交 活动 , 符合 和平 与 发展 的 时代 主题 , 顺应 世界 走向 多极化 的 趋势 , 对于 促进 国际 社会 的 友好 合作 和 共同 发展 作出 了 积极 的 贡献 。 +1998年 , 中国 人民 将 满怀信心 地 开创 新 的 业绩 。 尽管 我们 在 经济社会 发展 中 还 面临 不少 困难 , 但 我们 有 邓小平理论 的 指引 , 有 改革 开放 近 20 年 来 取得 的 伟大 成就 和 积累 的 丰富 经验 , 还有 其他 的 各种 有利 条件 , 我们 一定 能够 克服 这些 困难 , 继续 稳步前进 。 只要 我们 进一步 解放思想 , 实事求是 , 抓住 机遇 , 开拓进取 , 建设 有 中国 特色 社会主义 的 道路 就 会 越 走 越 宽广 。 +实现 祖国 的 完全 统一 , 是 海内外 全体 中国 人 的 共同 心愿 。 通过 中 葡 双方 的 合作 和 努力 , 按照 “ 一国两制 ” 方针 和 澳门 《 基本法 》 , 1999年 12月 澳门 的 回归 一定 能够 顺利 实现 。 +台湾 是 中国 领土 不可分割 的 一 部分 。 完成 祖国 统一 , 是 大势所趋 , 民心所向 。 任何 企图 制造 “ 两 个 中国 ” 、 “ 一中一台 ” 、 “ 台湾 独立 ” 的 图谋 , 都 注定 要 更 失败 。 希望 台湾 当局 以 民族 大义 为重 , 拿 出 诚意 , 采取 实际 的 行动 , 推动 两岸 经济 文化 交流 和 人员 往来 , 促进 两岸 直接 通邮 、 通航 、 通商 的 早日 实现 , 并 尽早 回应 我们 发出 的 在 一个 中国 的 原则 下 两岸 进行 谈判 的 郑重 呼吁 。 +环顾 全球 , 日益 密切 的 世界 经济 联系 , 日新月异 的 科技 进步 , 正在 为 各国 经济 的 发展 提供 历史 机遇 。 但是 , 世界 还 不 安宁 。 南北 之间 的 贫富 差距 继续 扩大 ; 局部 冲突 时有发生 ; 不 公正 不 合理 的 旧 的 国际 政治经济 秩序 还 没有 根本 改变 ; 发展中国家 在 激烈 的 国际 经济 竞争 中 仍 处于 弱势 地位 ; 人类 的 生存 与 发展 还 面临 种种 威胁 和 挑战 。 和平 与 发展 的 前景 是 光明 的 , 21 世纪 将 是 充满 希望 的 世纪 。 但 前进 的 道路 不 会 也 不 可能 一帆风顺 , 关键 是 世界 各国 人民 要 进一步 团结 起来 , 共同 推动 早日 建立 公正 合理 的 国际 政治经济 新 秩序 。 +中国 政府 将 继续 坚持 奉行 独立自主 的 和平 外交 政策 , 在 和平共处 五 项 原则 的 基础 上 努力 发展 
同 世界 各国 的 友好 关系 。 中国 愿意 加强 同 联合国 和 其他 国际 组织 的 协调 , 促进 在 扩大 经贸 科技 交流 、 保护 环境 、 消除 贫困 、 打击 国际 犯罪 等 方面 的 国际 合作 。 中国 永远 是 维护 世界 和平 与 稳定 的 重要 力量 。 中国 人民 愿 与 世界 各国 人民 一道 , 为 开创 持久 和平 、 共同 发展 的 新 世纪 而 不懈努力 ! +在 这 辞旧迎新 的 美好 时刻 , 我 祝 大家 新年 快乐 , 家庭 幸福 ! +谢谢 ! ( 新华社 北京 12月 31日 电 ) +在 十五大 精神 指引 下 胜利 前进 —— 元旦 献辞 +我们 即将 以 丰收 的 喜悦 送 走 牛年 , 以 昂扬 的 斗志 迎来 虎年 。 我们 伟大 祖国 在 新 的 一 年 , 将 是 充满 生机 、 充满 希望 的 一 年 。 +刚刚 过去 的 一 年 , 大气磅礴 , 波澜壮阔 。 在 这 一 年 , 以 江 泽民 同志 为 核心 的 党中央 , 继承 邓 小平 同志 的 遗志 , 高举 邓小平理论 的 伟大 旗帜 , 领导 全党 和 全国 各族 人民 坚定不移 地 沿着 建设 有 中国 特色 社会主义 道路 阔步 前进 , 写 下 了 改革 开放 和 社会主义 现代化 建设 的 辉煌 篇章 。 顺利 地 恢复 对 香港 行使 主权 , 胜利 地 召开 党 的 第十五 次 全国 代表大会 ——— 两 件 大事 办 得 圆满 成功 。 国民经济 稳中求进 , 国家 经济 实力 进一步 增强 , 人民 生活 继续 改善 , 对外 经济 技术 交流 日益 扩大 。 在 国际 金融 危机 的 风浪 波及 许多 国家 的 情况 下 , 我国 保持 了 金融 形势 和 整个 经济 形势 的 稳定 发展 。 社会主义 精神文明 建设 和 民主 法制 建设 取得 新 的 成绩 , 各项 社会 事业 全面 进步 。 外交 工作 取得 可喜 的 突破 , 我国 的 国际 地位 和 国际 威望 进一步 提高 。 实践 使 亿万 人民 对 邓小平理论 更加 信仰 , 对 以 江 泽民 同志 为 核心 的 党中央 更加 信赖 , 对 伟大 祖国 的 光辉 前景 更加 充满 信心 。 +1998年 , 是 全面 贯彻 落实 党 的 十五大 提 出 的 任务 的 第一 年 , 各 条 战线 改革 和 发展 的 任务 都 十分 繁重 , 有 许多 深 层次 的 矛盾 和 问题 有待 克服 和 解决 , 特别 是 国有 企业 改革 已经 进入 攻坚 阶段 。 我们 必须 进一步 深入 学习 和 掌握 党 的 十五大 精神 , 统揽全局 , 精心 部署 , 狠抓 落实 , 团结 一致 , 艰苦奋斗 , 开拓 前进 , 为 夺取 今年 改革 开放 和 社会主义 现代化 建设 的 新 胜利 而 奋斗 。 +今年 是 党 的 十一 届 三中全会 召开 20 周年 , 是 我们 党 和 国家 实现 伟大 的 历史 转折 、 进入 改革 开放 历史 新 时期 的 20 周年 。 在 新 的 一 年 里 , 大力 发扬 十一 届 三中全会 以来 我们 党 所 恢复 的 优良 传统 和 在 新 的 历史 条件 下 形成 的 优良 作风 , 对于 完成 好 今年 的 各项 任务 具有 十分 重要 的 意义 。 +我们 要 更 好 地 坚持 解放思想 、 实事求是 的 思想 路线 。 解放思想 、 实事求是 , 是 邓小平理论 的 精髓 。 实践 证明 , 只有 解放思想 、 实事求是 , 才 能 冲破 各种 不 切合 实际 的 或者 过时 的 观念 的 束缚 , 真正 做到 尊重 、 认识 和 掌握 客观 规律 , 勇于 突破 , 勇于 创新 , 不断 开创 社会主义 现代化 建设 的 新 局面 。 党 的 十五大 是 我们 党 解放思想 、 实事求是 的 新 的 里程碑 。 进一步 认真 学习 和 掌握 十五大 精神 , 解放思想 、 实事求是 , 我们 的 各项 事业 就 能 结 出 更加 丰硕 的 成果 。 +我们 要 更 好 地 坚持 以 经济 建设 为 中心 。 各项 工作 必须 以 经济 建设 为 中心 , 是 邓小平理论 的 基本 观点 , 是 党 的 基本 路线 的 核心 内容 , 近 20 年 来 的 实践 证明 , 坚持 这个 中心 , 是 完全 正确 的 。 今后 , 我们 能否 把 建设 有 中国 特色 社会主义 伟大 事业 全面 推向 21 世纪 , 关键 仍然 要 看 能否 把 经济 工作 搞 上去 。 各级 领导 干部 要 切实 把 精力 集中 到 贯彻 落实 好 中央 关于 今年 经济 工作 的 
总体 要求 和 各项 重要 任务 上 来 , 不断 提高 领导 经济 建设 的 能力 和 水平 。 +我们 要 更 好 地 坚持 “ 两手抓 、 两手 都 要 硬 ” 的 方针 。 在 坚持 以 经济 建设 为 中心 的 同时 , 积极 推进 社会主义 精神文明 建设 和 民主 法制 建设 , 是 建设 富强 、 民主 、 文明 的 社会主义 现代化 国家 的 重要 内容 。 实践 证明 , 经济 建设 的 顺利 进行 , 离 不 开 精神文明 建设 和 民主 法制 建设 的 保证 。 党 的 十五大 依据 邓小平理论 和 党 的 基本 路线 提 出 的 党 在 社会主义 初级阶段 经济 、 政治 、 文化 的 基本 纲领 , 为 “ 两手抓 、 两手 都 要 硬 ” 提供 了 新 的 理论 根据 , 提 出 了 更 高 要求 , 现在 的 关键 是 认真 抓好 落实 。 +我们 要 更 好 地 发扬 求真务实 、 密切 联系 群众 的 作风 。 这 是 把 党 的 方针 、 政策 落到实处 , 使 改革 和 建设 取得 胜利 的 重要 保证 。 在 当前 改革 进一步 深化 , 经济 不断 发展 , 同时 又 出现 一些 新 情况 、 新 问题 和 新 困难 的 形势 下 , 更 要 发扬 这样 的 好 作风 。 要 尊重 群众 的 意愿 , 重视 群众 的 首创 精神 , 关心 群众 的 生活 疾苦 。 江 泽民 同志 最近 强调 指出 , 要 大力 倡导 说实话 、 办 实事 、 鼓 实劲 、 讲 实效 的 作风 , 坚决 制止 追求 表面文章 , 搞 花架子 等 形式主义 , 坚决 杜绝 脱离 群众 、 脱离 实际 、 浮躁 虚夸 等 官僚主义 。 这 是 非常 重要 的 。 因此 , 各级 领导 干部 务必 牢记 全心全意 为 人民 服务 的 宗旨 , 在 勤政廉政 、 艰苦奋斗 方面 以身作则 , 当 好 表率 。 +1998 , 瞩目 中华 。 新 的 机遇 和 挑战 , 催 人 进取 ; 新 的 目标 和 征途 , 催 人 奋发 。 英雄 的 中国 人民 在 以 江 泽民 同志 为 核心 的 党中央 坚强 领导 和 党 的 十五大 精神 指引 下 , 更 高 地 举起 邓小平理论 的 伟大 旗帜 , 团结 一致 , 扎实 工作 , 奋勇前进 , 一定 能够 创造 出 更加 辉煌 的 业绩 ! 
+北京 举行 新年 音乐会 +江 泽民 李 鹏 乔 石 朱 镕基 李 瑞环 刘 华清 尉 健行 李 岚清 与 万 名 首都 各界 群众 和 劳动模范 代表 一起 辞旧迎新 ( 附 图片 1 张 ) +党 和 国家 领导人 江 泽民 、 李 鹏 、 乔 石 、 朱 镕基 、 李 瑞环 、 刘 华清 、 尉 健行 、 李 岚清 等 与 万 名 首都 各界 群众 和 劳动模范 代表 一起 欣赏 了 ’98 北京 新年 音乐会 的 精彩 节目 。 这 是 江 泽民 等 在 演出 结束 后 同 演出 人员 合影 。 +( 新华社 记者 樊 如钧 摄 ) +本报 北京 12月 31日 讯 新华社 记者 陈 雁 、 本报 记者 何 加正 报道 : 在 度过 了 非凡 而 辉煌 的 1997年 , 迈向 充满 希望 的 1998年 之际 , ’98 北京 新年 音乐会 今晚 在 人民 大会堂 举行 。 党 和 国家 领导人 江 泽民 、 李 鹏 、 乔 石 、 朱 镕基 、 李 瑞环 、 刘 华清 、 尉 健行 、 李 岚清 与 万 名 首都 各界 群众 和 劳动模范 代表 一起 , 在 激昂 奋进 的 音乐声 中 辞旧迎新 。 +今晚 的 长安街 流光溢彩 , 火树银花 ; 人民 大会堂 里 灯火辉煌 , 充满 欢乐 祥和 的 喜庆 气氛 。 在 这 场 由 中共 北京 市委 宣传部 、 市政府 办公厅 等 单位 主办 的 题 为 “ 世纪 携手 、 共 奏 华章 ” 的 新年 音乐会 上 , 中国 三 个 著名 交响乐团 ——— 中国 交响乐团 、 上海 交响乐团 、 北京 交响乐团 首 次 联袂 演出 。 著名 指挥家 陈 佐湟 、 陈 燮阳 、 谭 利华 分别 指挥 演奏 了 一 批 中外 名曲 , 京 沪 两地 200 多 位 音乐家 组成 的 大型 乐队 以 饱满 的 激情 和 精湛 的 技艺 为 观众 奉献 了 一 台 高 水准 的 交响音乐会 。 +音乐会 在 雄壮 的 管弦乐 《 红旗 颂 》 中 拉开 帷幕 , 舒展 、 优美 的 乐曲声 使 人们 仿佛 看到 : 五星红旗 在 天安门 城楼 上 冉冉 升起 ; 仿佛 听到 : 在 红旗 的 指引 下 中国 人民 向 现代化 新 征程 迈进 的 脚步声 。 钢琴 与 管弦乐队 作品 《 东方 之 珠 》 , 把 广大 听众 耳熟能详 的 歌曲 改编 为 器乐曲 , 以 其 优美 感人 的 旋律 抒发 了 洗雪 百年 耻辱 的 香港 明天 会 更 好 的 情感 。 专程 回国 参加 音乐会 的 著名 女高音 歌唱家 迪里拜尔 演唱 的 《 春 之 声 》 , 把 人们 带 到 了 万象更新 的 田野 和 山谷 ; 享誉 国际 乐坛 的 男高音 歌唱家 莫 华伦 演唱 了 著名 歌剧 《 图兰朵 》 选段 “ 今夜 无 人 入睡 ” , 把 人们 带入 迷人 的 艺术 境地 。 音乐会 上 还 演奏 了 小提琴 协奏曲 《 梁 山伯 与 祝 英台 》 、 柴可夫斯基 的 《 第四 交响曲 ——— 第四 乐章 》 、 交响诗 《 罗马 的 松树 》 等 中外 著名 交响曲 。 +万 人 大会堂 今晚 座无虚席 , 观众 被 艺术家 们 精湛 的 表演 深深 打动 , 不断 报 以 经久不息 的 热烈 掌声 。 艺术家 们 频频 谢幕 , 指挥家 依次 指挥 演出 返 场 曲目 , 最后 音乐会 在 《 红色 娘子军 》 选曲 、 《 白毛女 》 选曲 、 《 北京 喜讯 到 边寨 》 等 乐曲声 中 达到 高潮 。 +演出 结束 后 , 江 泽民 等 党 和 国家 领导人 走 上 舞台 , 亲切 会见 了 参加 演出 的 全体 人员 , 祝贺 演出 成功 , 并 与 他们 合影 留念 。 +李 铁映 、 贾 庆林 、 曾 庆红 等 领导 同志 也 出席 了 今晚 音乐会 。 +李 鹏 在 北京 考察 企业 +向 广大 职工 祝贺 新年 , 对 节日 坚守 岗位 的 同志 们 表示 慰问 +新华社 北京 十二月 三十一日 电 ( 中央 人民 广播 电台 记者 刘 振英 、 新华社 记者 张 宿堂 ) 今天 是 一九九七年 的 最后 一 天 。 辞旧迎新 之际 , 国务院 总理 李 鹏 今天 上午 来到 北京 石景山 发电 总厂 考察 , 向 广大 企业 职工 表示 节日 的 祝贺 , 向 将要 在 节日 期间 坚守 工作 岗位 的 同志 们 表示 慰问 。 +上午 九时 二十分 , 李 鹏 总理 在 北京 市委 书记 、 市长 贾 庆林 的 陪同 下 , 来到 位于 北京 西郊 的 北京 石景山 发电 总厂 。 始建 于 一九一九年 的 北京 石景山 发电 总厂 是 华北 电力 
集团公司 骨干 发电 企业 , 承担 着 向 首都 供电 、 供热 任务 , 装机 总 容量 一百一十六点六万 千瓦 。 总厂 年发电量 四十五亿 千瓦时 , 供热 能力 八百 百万大卡/小时 , 现 供热 面积 已 达 八百 多 万 平方米 。 早 在 担任 华北 电管局 领导 时 , 李 鹏 就 曾 多次 到 发电 总厂 检查 指导 工作 。 +在 总厂 所 属 的 石景山 热电厂 , 李 鹏 首先 向 华北 电管局 、 电厂 负责人 详细 询问 了 目前 电厂 生产 、 职工 生活 和 华北 电网 向 首都 供电 、 供热 的 有关 情况 。 随后 , 他 又 实地 察看 了 发电机组 的 运行 情况 和 电厂 一号机 、 二号机 控制室 。 在 控制室 , 李 鹏 与 职工 们 一一 握手 , 向 大家 表示 慰问 。 他 说 , 在 一九九八年 即将 到来之际 , 有 机会 再次 回到 石景山 发电 总厂 , 感到 十分 高兴 。 李 鹏 亲切 地 说 : 『 今天 我 看到 了 许多 新 的 、 年轻 的 面孔 , 这 说明 在 老 同志 们 作出 贡献 退 下来 后 , 新 一代 的 年轻人 成长 起来 了 、 成熟 起来 了 , 我 感到 十分 欣慰 。 』 +( A 、 B ) +李 鹏 说 : “ 作为 首都 的 电力 工作者 , 你们 为 首都 的 各项 重大 活动 的 顺利 进行 , 为 保障 人民 群众 的 工作 、 生活 和 学习 , 为 促进 首都 经济 的 发展 作出 了 自己 的 贡献 。 明天 就 是 元旦 , 你们 还有 许多 同志 要 坚守 岗位 , 我 向 你们 、 向 全体 电力 工作者 表示 感谢 。 现在 , 我们 的 首都 已经 结束 了 拉 闸 限 电 的 历史 , 希望 依靠 大家 , 使 拉 闸 限 电 的 历史 永远 不再 重演 。 同时 , 也 希望 你们 安全 生产 、 经济 调度 , 实现 经济 增长 方式 的 转变 。 ” 李 鹏 最后 向 电业 职工 , 向 全 北京市 的 人民 拜年 , 向 大家 致以 新春 的 问候 , 祝愿 电力 事业 取得 新 的 成绩 , 祝愿 北京市 在 改革 、 发展 和 稳定 的 各项 工作 中 取得 新 的 成就 。 +参观 工厂 结束 后 , 李 鹏 又 来到 工厂 退休 职工 郭 树范 和 闫 戌麟 家 看望 慰问 , 向 他们 拜年 。 曾经 是 高级 工程师 的 郭 树范 退休 前 一直 在 发电厂 从事 土建工程 建设 , 退休 后 , 与 老伴 一起 抚养 着 身体 欠佳 的 孙子 。 李 鹏 对 他们 倾心 照顾 下 一 代 表示 肯定 。 他 说 : “ 人 老 了 , 照顾 照顾 后代 也 是 一 件 可以 带来 快乐 的 事 , 当然 , 对 孩子 们 不 能 溺爱 , 要 让 他们 健康 成长 。 ” 在 老工人 闫 戌麟 家 , 当 李 鹏 了解 到 老闫 退休 前 一直 都 是 厂里 的 先进 工作者 、 曾经 被 评为 北京市 “ 五好 职工 ” , 退休 后 仍然 为 改善 职工 的 住房 而 奔波 时 , 十分 高兴 , 对 他 为 工厂 建设 作出 的 贡献 表示 感谢 。 在 郭 家 和 闫 家 , 李 鹏 都 具体 地 了解 了 他们 退休 后 的 生活 保障 问题 , 并 与 一些 老 职工 一起 回忆 起 了 当年 建设 电厂 的 情景 。 李 鹏 说 : “ 当年 搞 建设 , 条件 比 现在 差 多 了 , 大家 也 很 少 计较 什么 , 只是 一心 想 着 把 电厂 建 好 。 现在 条件 好 了 , 但 艰苦奋斗 、 无私奉献 的 精神 可 不 能 丢 。 ” 李 鹏 最后 祝 他们 新春 快乐 , 身体 健康 , 家庭 幸福 。 +陪同 考察 企业 并 看望 慰问 职工 的 国务院 有关 部门 和 北京市 负责人 还有 : 史 大桢 、 高 严 、 石 秀诗 、 阳 安江 等 。 +挂 起 红灯 迎 新年 ( 图片 ) +元旦 来临 , 安徽省 合肥市 长江路 悬挂 起 3300 盏 大 红灯笼 , 为 节日 营造 出 “ 千 盏 灯笼 凌空 舞 , 十 里 长街 别样 红 ” 的 欢乐 祥和 气氛 。 ( 新华社 记者 戴 浩 摄 ) +( 传真 照片 ) +全总 致 全国 各族 职工 慰问信 +勉励 广大 职工 发挥 工人阶级 主力军 作用 , 为 企业 改革 发展 建功立业 +本报 北京 1月 1日 讯 中华 全国 总工会 今日 发出 《 致 全国 各族 职工 慰问信 》 , 向 全国 各族 职工 祝贺 新年 。 +慰问信 说 , 实现 党 的 十五大 提 出 的 
宏伟 目标 , 必须 依靠 工人阶级 和 全体 人民 的 长期 奋斗 。 工人阶级 是 我们 国家 的 领导 阶级 , 是 先进 生产力 和 生产关系 的 代表 , 是 两 个 文明 建设 的 主力军 , 是 维护 社会 安定团结 的 中坚 力量 。 党 的 十五大 再次 强调 要 坚持 全心全意 依靠 工人阶级 的 方针 , 具有 重大 的 意义 。 广大 职工 要 以 邓小平理论 和 党 的 基本 路线 为 指导 , 坚持 党 的 基本 纲领 和 各项 方针 政策 , 积极 投身 于 改革 和 建设 事业 。 要 坚持 站 在 改革 的 前列 , 转变 思想 观念 , 增强 市场 意识 、 竞争 意识 和 效益 意识 , 以 实际 行动 促进 改革 的 不断 深化 。 要 发扬 工人阶级 的 首创 精神 , 不断 为 企业 转机建制 、 调整 结构 、 加强 管理 、 提高 效益 献计献策 。 要 大力 开展 劳动 竞赛 、 合理化 建议 、 技术 革新 、 技术 协作 和 发明 创造 等 活动 , 努力 提高 产品 质量 和 经济效益 , 推动 企业 加快 技术 进步 , 实现 增长 方式 的 根本 转变 , 再 创 国有 企业 的 辉煌 。 要 正确 对待 企业 改革 和 发展 中 的 困难 和 问题 , 树立 起 战胜 困难 的 勇气 和 信心 , 锲而不舍 , 迎难而上 , 为 企业 的 改革 和 发展 建功立业 。 +慰问信 指出 , 广大 职工 要 以 主人翁 的 姿态 , 积极 行使 当家作主 的 权利 。 要 不断 提高 自身 素质 , 发扬 爱国 奉献 、 爱厂如家 、 爱岗敬业 的 精神 , 学习 掌握 先进 科学 文化 知识 , 成为 本职工作 的 行家里手 , 迎接 新 世纪 面临 的 挑战 。 +慰问信 最后 说 , 让 我们 在 邓小平理论 和 党 的 基本 路线 指导 下 , 更加 紧密 地 团结 在 以 江 泽民 同志 为 核心 的 党中央 周围 , 统揽全局 , 精心 部署 , 狠抓 落实 , 团结 一致 , 艰苦奋斗 , 开拓 前进 , 在 两 个 文明 建设 中 充分 发挥 工人阶级 主力军 作用 , 为 实现 跨 世纪 宏伟 目标 作出 新 的 更 大 的 贡献 。 +忠诚 的 共产主义 战士 , 久经考验 的 无产阶级 革命家 刘 澜涛 同志 逝世 +( 附 图片 1 张 ) \ No newline at end of file diff --git a/tests/data_for_tests/cws_test b/tests/data_for_tests/cws_test new file mode 100755 index 00000000..cec07ac3 --- /dev/null +++ b/tests/data_for_tests/cws_test @@ -0,0 +1,1018 @@ +然#然#B_N +而#而#E_N +,#,#S_N +這#這#B_N +樣#樣#E_N +的#的#S_N +處#處#B_N +理#理#E_N +也#也#S_N +衍#衍#B_N +生#生#E_N +了#了#S_N +一#一#B_N +些#些#E_N +問#問#B_N +題#題#E_N +.#.#S_N + +自#自#B_N +從#從#E_N +2004##S_N +年#年#S_N +提#提#B_N +出#出#E_N +了#了#S_N +興#興#B_N +建#建#E_N +人#人#B_N +文#文#E_N +大#大#B_N +樓#樓#E_N +的#的#S_N +構#構#B_N +想#想#E_N +,#,#S_N +企#企#B_N +業#業#E_N +界#界#S_N +陸#陸#B_N +續#續#E_N +有#有#S_N +人#人#S_N +提#提#B_N +供#供#E_N +捐#捐#B_N +款#款#E_N +.#.#S_N + +杜#杜#B_N +鵑#鵑#M_N +花#花#E_N +為#為#S_N +溫#溫#B_N +帶#帶#E_N +植#植#B_N +物#物#E_N +,#,#S_N +台#台#B_N +北#北#E_N +雖#雖#B_N +然#然#E_N +在#在#S_N +亞#亞#S_N +熱#熱#B_N +帶#帶#E_N +,#,#S_N +但#但#S_N +冬#冬#B_N +季#季#E_N +的#的#S_N +東#東#B_N +北#北#E_N +季#季#B_N +風#風#E_N +卻#卻#S_N +使#使#B_N +得#得#E_N +杜#杜#B_N +鵑#鵑#M_N +花#花#E_N +在#在#S_N +臺#臺#B_N +大#大#E_N +宜#宜#B_N +然#然#M_N 
+自#自#M_N +得#得#E_N +.#.#S_N + +臺#臺#B_N +大#大#E_N +醫#醫#B_N +學#學#E_N +人#人#B_N +文#文#E_N +博#博#B_N +物#物#E_N +館#館#S_N +是#是#S_N +一#一#S_N +棟#棟#S_N +兩#兩#S_N +層#層#S_N +樓#樓#S_N +的#的#S_N +建#建#B_N +築#築#E_N +,#,#S_N +沿#沿#S_N +中#中#B_N +山#山#E_N +南#南#S_N +路#路#S_N +與#與#S_N +仁#仁#B_N +愛#愛#E_N +路#路#S_N +成#成#S_N +L#L#S_N +型#型#S_N +.#.#S_N + +樓#樓#B_N +頂#頂#E_N +有#有#S_N +天#天#B_N +文#文#E_N +台#台#S_N +,#,#S_N +現#現#S_N +為#為#S_N +天#天#B_N +文#文#E_N +社#社#S_N +使#使#B_N +用#用#E_N +.#.#S_N + +國#國#B_N +際#際#E_N +北#北#B_N +極#極#E_N +研#研#B_N +究#究#E_N +中#中#B_N +心#心#E_N +的#的#S_N +主#主#B_N +要#要#E_N +夥#夥#B_N +伴#伴#E_N +是#是#S_N +日#日#B_N +本#本#E_N +和#和#S_N +美#美#B_N +國#國#E_N +,#,#S_N +參#參#B_N +與#與#E_N +會#會#B_N +務#務#E_N +的#的#S_N +還#還#S_N +有#有#S_N +來#來#S_N +自#自#S_N +加#加#B_N +拿#拿#M_N +大#大#E_N +、#、#S_N +中#中#B_N +國#國#E_N +、#、#S_N +丹#丹#B_N +麥#麥#E_N +、#、#S_N +德#德#B_N +國#國#E_N +、#、#S_N +日#日#B_N +本#本#E_N +、#、#S_N +挪#挪#B_N +威#威#E_N +、#、#S_N +俄#俄#B_N +羅#羅#M_N +斯#斯#E_N +、#、#S_N +英#英#B_N +國#國#E_N +和#和#S_N +美#美#B_N +國#國#E_N +的#的#S_N +代#代#B_N +表#表#E_N +.#.#S_N + +其#其#B_N +中#中#E_N +參#參#B_N +賽#賽#E_N +者#者#S_N +年#年#B_N +齡#齡#E_N +不#不#B_N +可#可#E_N +超#超#B_N +過#過#E_N +18##S_N +歲#歲#S_N +(#(#S_N +以#以#S_N +當#當#B_N +年#年#E_N +7##S_N +月#月#S_N +1##S_N +日#日#S_N +為#為#S_N +準#準#S_N +)#)#S_N +,#,#S_N +且#且#S_N +必#必#B_N +須#須#E_N +就#就#B_N +讀#讀#E_N +於#於#S_N +中#中#S_N +學#學#B_N +校#校#E_N +(#(#S_N +S#S#B_N +e#e#M_N +c#c#M_N +o#o#M_N +n#n#M_N +d#d#M_N +a#a#M_N +r#r#M_N +y#y#E_N +S#S#B_N +c#c#M_N +h#h#M_N +o#o#M_N +o#o#M_N +l#l#E_N +)#)#S_N +.#.#S_N + +同#同#B_N +年#年#E_N +9##S_N +月#月#S_N +7##S_N +日#日#S_N +,#,#S_N +亞#亞#B_N +奧#奧#E_N +理#理#B_N +事#事#E_N +會#會#S_N +主#主#B_N +席#席#E_N +薩#薩#B_N +巴#巴#M_N +赫#赫#E_N +親#親#B_N +王#王#E_N +為#為#S_N +國#國#B_N +際#際#E_N +射#射#B_N +擊#擊#E_N +中#中#B_N +心#心#E_N +主#主#B_N +持#持#E_N +銅#銅#B_N +像#像#E_N +揭#揭#B_N +幕#幕#E_N +儀#儀#B_N +式#式#E_N +.#.#S_N + +這#這#B_N +些#些#E_N +電#電#B_N +話#話#E_N +經#經#S_N +交#交#B_N +換#換#E_N +機#機#S_N +處#處#B_N +理#理#E_N +,#,#S_N +使#使#B_N +用#用#E_N +的#的#S_N +媒#媒#B_N +介#介#E_N +包#包#B_N +括#括#E_N +海#海#B_N +底#底#E_N +電#電#B_N +纜#纜#E_N +、#、#S_N 
+人#人#S_N +造#造#S_N +衛#衛#B_N +星#星#E_N +、#、#S_N +無#無#B_N +線#線#E_N +電#電#S_N +、#、#S_N +光#光#B_N +纖#纖#E_N +及#及#S_N +I#I#B_N +P#P#E_N +電#電#B_N +話#話#E_N +(#(#S_N +V#V#B_N +O#O#M_N +I#I#M_N +P#P#E_N +)#)#S_N +.#.#S_N + +《#《#S_N +圓#圓#S_N +月#月#S_N +彎#彎#S_N +刀#刀#S_N +》#》#S_N +為#為#S_N +古#古#B_N +龍#龍#E_N +晚#晚#B_N +期#期#E_N +作#作#B_N +品#品#E_N +,#,#S_N +1976##S_N +年#年#S_N +6##S_N +月#月#S_N +至#至#S_N +1978##S_N +年#年#S_N +5##S_N +月#月#S_N +,#,#S_N +香#香#B_N +港#港#E_N +〈#〈#S_N +武#武#B_N +俠#俠#E_N +春#春#B_N +秋#秋#E_N +〉#〉#S_N +282##S_N +至#至#S_N +348##S_N +期#期#S_N +斷#斷#B_N +續#續#E_N +連#連#B_N +載#載#E_N +,#,#S_N +原#原#B_N +名#名#E_N +《#《#S_N +刀#刀#B_N +神#神#E_N +》#》#S_N +,#,#S_N +1978##S_N +年#年#S_N +漢#漢#B_N +麟#麟#E_N +出#出#B_N +版#版#E_N +改#改#B_N +名#名#E_N +《#《#S_N +圓#圓#S_N +月#月#S_N +彎#彎#S_N +刀#刀#S_N +》#》#S_N +.#.#S_N + +圓#圓#B_N +齒#齒#M_N +龍#龍#E_N +(#(#S_N +G#G#B_N +l#l#M_N +o#o#M_N +b#b#M_N +i#i#M_N +d#d#M_N +e#e#M_N +n#n#M_N +s#s#E_N +)#)#S_N +意#意#S_N +為#為#S_N +「#「#S_N +球#球#B_N +狀#狀#E_N +牙#牙#B_N +齒#齒#E_N +」#」#S_N +,#,#S_N +是#是#S_N +滄#滄#B_N +龍#龍#E_N +科#科#S_N +的#的#S_N +一#一#S_N +個#個#S_N +屬#屬#S_N +.#.#S_N + +圖#圖#B_N +波#波#M_N +列#列#M_N +夫#夫#E_N +設#設#B_N +計#計#E_N +局#局#S_N +在#在#S_N +1960##S_N +年#年#B_N +代#代#E_N +末#末#B_N +期#期#E_N +推#推#B_N +出#出#E_N +圖#圖#S_N +-#-#E_N +154##S_N +客#客#B_N +機#機#E_N +後#後#S_N +,#,#S_N +圖#圖#B_N +波#波#M_N +列#列#M_N +夫#夫#E_N +便#便#S_N +成#成#S_N +為#為#S_N +社#社#B_N +會#會#E_N +主#主#B_N +義#義#E_N +國#國#B_N +家#家#E_N +民#民#B_N +航#航#E_N +飛#飛#B_N +機#機#E_N +的#的#S_N +主#主#B_N +要#要#E_N +供#供#B_N +應#應#E_N +商#商#S_N +.#.#S_N + +事#事#B_N +實#實#E_N +上#上#S_N +,#,#S_N +團#團#B_N +購#購#E_N +網#網#B_N +站#站#E_N +的#的#S_N +產#產#B_N +品#品#E_N +原#原#B_N +價#價#E_N +和#和#S_N +購#購#B_N +買#買#E_N +數#數#B_N +量#量#E_N +經#經#B_N +常#常#E_N +被#被#S_N +「#「#S_N +注#注#B_N +水#水#E_N +」#」#S_N +,#,#S_N +而#而#S_N +產#產#B_N +品#品#E_N +和#和#S_N +服#服#B_N +務#務#E_N +的#的#S_N +品#品#B_N +質#質#E_N +則#則#S_N +經#經#B_N +常#常#E_N +「#「#S_N +縮#縮#B_N +水#水#E_N +」#」#S_N +.#.#S_N + +團#團#B_N +購#購#E_N +網#網#B_N +站#站#E_N +的#的#S_N +主#主#B_N +要#要#E_N +產#產#B_N +品#品#E_N +分#分#S_N +為#為#S_N +家#家#B_N +居#居#E_N 
+類#類#S_N +、#、#S_N +日#日#B_N +用#用#E_N +品#品#S_N +類#類#S_N +、#、#S_N +旅#旅#B_N +遊#遊#E_N +優#優#B_N +惠#惠#E_N +、#、#S_N +機#機#B_N +票#票#E_N +、#、#S_N +酒#酒#B_N +店#店#E_N +及#及#S_N +郵#郵#B_N +輪#輪#E_N +等#等#S_N +.#.#S_N + +可#可#B_N +是#是#E_N +,#,#S_N +魔#魔#B_N +牛#牛#M_N +肝#肝#M_N +菌#菌#E_N +的#的#S_N +菌#菌#B_N +肉#肉#E_N +在#在#S_N +被#被#S_N +切#切#B_N +割#割#E_N +或#或#S_N +撞#撞#B_N +傷#傷#E_N +後#後#S_N +會#會#S_N +變#變#B_N +成#成#E_N +藍#藍#B_N +色#色#E_N +的#的#S_N +,#,#S_N +而#而#S_N +這#這#S_N +種#種#S_N +菌#菌#S_N +的#的#S_N +菌#菌#B_N +肉#肉#E_N +無#無#B_N +論#論#M_N +如#如#M_N +何#何#E_N +都#都#B_N +是#是#E_N +白#白#B_N +色#色#E_N +的#的#S_N +,#,#S_N +因#因#S_N +此#此#S_N +透#透#B_N +過#過#E_N +切#切#B_N +割#割#E_N +菌#菌#B_N +肉#肉#E_N +便#便#S_N +能#能#S_N +分#分#B_N +辨#辨#E_N +二#二#S_N +者#者#S_N +.#.#S_N + +站#站#B_N +牌#牌#E_N +上#上#S_N +標#標#B_N +示#示#E_N +著#著#S_N +「#「#S_N +浪#浪#B_N +漫#漫#E_N +和#和#S_N +傳#傳#B_N +奇#奇#E_N +的#的#S_N +入#入#B_N +野#野#M_N +松#松#M_N +原#原#E_N +」#」#S_N +.#.#S_N + +自#自#B_N +然#然#E_N +界#界#S_N +的#的#S_N +土#土#S_N +是#是#S_N +由#由#S_N +岩#岩#B_N +石#石#E_N +經#經#S_N +風#風#B_N +化#化#E_N +、#、#S_N +搬#搬#B_N +運#運#E_N +、#、#S_N +堆#堆#B_N +積#積#E_N +而#而#S_N +形#形#B_N +成#成#E_N +的#的#S_N +.#.#S_N + +毛#毛#S_N +澤#澤#B_N +東#東#E_N +早#早#S_N +在#在#S_N +1949##S_N +年#年#S_N +3##S_N +月#月#S_N +中#中#B_N +共#共#E_N +七#七#S_N +屆#屆#S_N +二#二#B_N +中#中#M_N +全#全#M_N +會#會#E_N +的#的#S_N +報#報#B_N +告#告#E_N +中#中#S_N +就#就#S_N +明#明#B_N +確#確#E_N +地#地#S_N +說#說#S_N +:#:#S_N +「#「#S_N +占#占#S_N +國#國#B_N +民#民#E_N +經#經#B_N +濟#濟#E_N +總#總#S_N +產#產#B_N +值#值#E_N +90%##S_N +的#的#S_N +分#分#B_N +散#散#E_N +的#的#S_N +個#個#B_N +體#體#E_N +的#的#S_N +農#農#B_N +業#業#E_N +經#經#B_N +濟#濟#E_N +和#和#S_N +手#手#B_N +工#工#E_N +業#業#S_N +經#經#B_N +濟#濟#E_N +,#,#S_N +是#是#S_N +可#可#B_N +能#能#E_N +和#和#S_N +必#必#B_N +須#須#E_N +謹#謹#B_N +慎#慎#E_N +地#地#S_N +、#、#S_N +逐#逐#B_N +步#步#E_N +地#地#S_N +而#而#S_N +又#又#S_N +積#積#B_N +極#極#E_N +地#地#S_N +引#引#B_N +導#導#E_N +它#它#B_N +們#們#E_N +向#向#S_N +著#著#S_N +現#現#B_N +代#代#E_N +化#化#S_N +和#和#S_N +集#集#B_N +體#體#E_N +化#化#S_N +的#的#S_N +方#方#B_N +向#向#E_N +發#發#B_N +展#展#E_N +的#的#S_N +,#,#S_N +任#任#B_N +其#其#M_N +自#自#M_N +流#流#E_N +的#的#S_N +觀#觀#B_N +點#點#E_N 
+是#是#S_N +錯#錯#B_N +誤#誤#E_N +的#的#S_N +.#.#S_N +」#」#S_N + +它#它#S_N +主#主#B_N +要#要#E_N +分#分#B_N +布#布#E_N +在#在#S_N +表#表#B_N +土#土#E_N +層#層#S_N +或#或#S_N +耕#耕#B_N +層#層#E_N +中#中#S_N +,#,#S_N +深#深#B_N +受#受#E_N +耕#耕#B_N +作#作#E_N +施#施#B_N +肥#肥#E_N +等#等#S_N +人#人#B_N +為#為#E_N +因#因#B_N +素#素#E_N +的#的#S_N +影#影#B_N +響#響#E_N +而#而#S_N +極#極#S_N +不#不#S_N +穩#穩#B_N +定#定#E_N +.#.#S_N + +土#土#B_N +壤#壤#E_N +是#是#S_N +重#重#B_N +要#要#E_N +的#的#S_N +自#自#B_N +然#然#E_N +資#資#B_N +源#源#E_N +和#和#S_N +生#生#B_N +產#產#E_N +資#資#B_N +料#料#E_N +,#,#S_N +土#土#B_N +壤#壤#E_N +的#的#S_N +植#植#B_N +物#物#E_N +生#生#B_N +產#產#E_N +能#能#B_N +力#力#E_N +是#是#S_N +衡#衡#B_N +量#量#E_N +土#土#B_N +壤#壤#E_N +資#資#B_N +源#源#E_N +質#質#B_N +量#量#E_N +的#的#S_N +標#標#B_N +誌#誌#E_N +.#.#S_N + +一#一#B_N +些#些#E_N +土#土#B_N +星#星#E_N +1##S_N +號#號#S_N +儀#儀#B_N +錶#錶#E_N +組#組#S_N +中#中#S_N +的#的#S_N +部#部#B_N +分#分#E_N +也#也#S_N +被#被#S_N +用#用#S_N +在#在#S_N +土#土#B_N +星#星#E_N +1B##S_N +中#中#S_N +了#了#S_N +.#.#S_N + +它#它#S_N +在#在#S_N +位#位#S_N +於#於#S_N +亨#亨#B_N +茨#茨#M_N +維#維#M_N +爾#爾#E_N +的#的#S_N +空#空#B_N +間#間#E_N +系#系#B_N +統#統#E_N +中#中#B_N +心#心#E_N +建#建#B_N +造#造#E_N +.#.#S_N + +這#這#S_N +個#個#S_N +計#計#B_N +算#算#E_N +機#機#S_N +控#控#B_N +制#制#E_N +了#了#S_N +火#火#B_N +箭#箭#E_N +從#從#S_N +起#起#B_N +飛#飛#E_N +前#前#S_N +一#一#B_N +直#直#E_N +到#到#S_N +拋#拋#B_N +棄#棄#E_N +S#S#B_N +-#-#M_N +I#I#M_N +V#V#M_N +B#B#E_N +推#推#B_N +進#進#E_N +器#器#S_N +的#的#S_N +操#操#B_N +作#作#E_N +過#過#B_N +程#程#E_N +.#.#S_N + +2007##S_N +年#年#S_N +7##S_N +月#月#S_N +6##S_N +日#日#S_N +,#,#S_N +聖#聖#B_N +喬#喬#M_N +治#治#E_N +教#教#B_N +堂#堂#E_N +被#被#S_N +馬#馬#B_N +來#來#M_N +西#西#M_N +亞#亞#E_N +政#政#B_N +府#府#E_N +列#列#S_N +為#為#S_N +50##S_N +個#個#S_N +馬#馬#B_N +來#來#M_N +西#西#M_N +亞#亞#E_N +國#國#B_N +家#家#E_N +寶#寶#B_N +藏#藏#E_N +之#之#S_N +一#一#S_N +.#.#S_N + +聖#聖#B_N +伯#伯#M_N +多#多#M_N +祿#祿#E_N +堂#堂#S_N +(#(#S_N +I#I#B_N +g#g#M_N +l#l#M_N +e#e#M_N +s#s#M_N +i#i#M_N +a#a#E_N +d#d#B_N +e#e#E_N +S#S#B_N +a#a#M_N +n#n#E_N +P#P#B_N +e#e#M_N +d#d#M_N +r#r#M_N +o#o#E_N +)#)#S_N +是#是#S_N +西#西#B_N +班#班#M_N +牙#牙#E_N +南#南#B_N +部#部#E_N +城#城#B_N +市#市#E_N +科#科#B_N +爾#爾#M_N 
+多#多#M_N +瓦#瓦#E_N +的#的#S_N +一#一#S_N +座#座#S_N +羅#羅#B_N +馬#馬#E_N +天#天#B_N +主#主#E_N +教#教#S_N +教#教#B_N +堂#堂#E_N +,#,#S_N +供#供#B_N +奉#奉#E_N +聖#聖#B_N +伯#伯#M_N +多#多#M_N +祿#祿#E_N +,#,#S_N +位#位#S_N +於#於#S_N +同#同#B_N +名#名#E_N +的#的#S_N +廣#廣#B_N +場#場#E_N +上#上#S_N +.#.#S_N diff --git a/tests/data_for_tests/cws_train b/tests/data_for_tests/cws_train new file mode 100755 index 00000000..085eb912 --- /dev/null +++ b/tests/data_for_tests/cws_train @@ -0,0 +1,1002 @@ +看#看#B_N +似#似#E_N +簡#簡#B_N +單#單#E_N +,#,#S_N +只#只#S_N +是#是#S_N +二#二#S_N +選#選#S_N +一#一#S_N +做#做#S_N +決#決#B_N +擇#擇#E_N +,#,#S_N +但#但#S_N +其#其#B_N +實#實#E_N +他#他#B_N +們#們#E_N +代#代#B_N +表#表#E_N +的#的#S_N +是#是#S_N +你#你#S_N +周#周#B_N +遭#遭#E_N +的#的#S_N +親#親#B_N +朋#朋#E_N +好#好#B_N +友#友#E_N +,#,#S_N +試#試#S_N +著#著#S_N +給#給#S_N +你#你#S_N +不#不#B_N +同#同#E_N +的#的#S_N +意#意#B_N +見#見#E_N +,#,#S_N +但#但#S_N +追#追#B_N +根#根#M_N +究#究#M_N +底#底#E_N +,#,#S_N +最#最#B_N +後#後#E_N +決#決#B_N +定#定#E_N +的#的#S_N +還#還#B_N +是#是#E_N +自#自#B_N +己#己#E_N +.#.#S_N + +其#其#S_N +便#便#B_N +當#當#E_N +都#都#B_N +是#是#E_N +買#買#B_N +來#來#E_N +的#的#S_N +,#,#S_N +就#就#B_N +算#算#E_N +加#加#B_N +熱#熱#E_N +也#也#B_N +是#是#E_N +由#由#S_N +媽#媽#B_N +媽#媽#E_N +負#負#B_N +責#責#E_N +(#(#S_N +後#後#B_N +來#來#E_N +揭#揭#B_N +曉#曉#E_N +其#其#B_N +實#實#E_N +是#是#S_N +避#避#B_N +免#免#E_N +帶#帶#B_N +來#來#E_N +厄#厄#B_N +運#運#E_N +)#)#S_N +,#,#S_N +父#父#B_N +親#親#E_N +則#則#S_N +在#在#S_N +電#電#B_N +視#視#E_N +台#台#S_N +上#上#B_N +班#班#E_N +.#.#S_N + +這#這#S_N +次#次#S_N +遊#遊#B_N +行#行#E_N +最#最#B_N +大#大#E_N +的#的#S_N +特#特#B_N +色#色#E_N +,#,#S_N +在#在#S_N +於#於#S_N +越#越#B_N +來#來#M_N +越#越#E_N +多#多#S_N +年#年#B_N +輕#輕#E_N +人#人#S_N +上#上#B_N +街#街#E_N +遊#遊#B_N +行#行#E_N +,#,#S_N +而#而#B_N +且#且#E_N +當#當#B_N +中#中#E_N +不#不#B_N +乏#乏#E_N +行#行#B_N +動#動#E_N +激#激#B_N +烈#烈#E_N +的#的#S_N +躁#躁#S_N +少#少#B_N +年#年#E_N +.#.#S_N + +懷#懷#B_N +孕#孕#E_N +期#期#S_N +為#為#S_N +421##S_N +至#至#S_N +457##S_N +日#日#S_N +.#.#S_N + +婷#婷#B_N +婷#婷#E_N +向#向#S_N +昏#昏#B_N +迷#迷#E_N +中#中#S_N +的#的#S_N +婆#婆#B_N +婆#婆#E_N +訴#訴#B_N +說#說#E_N +,#,#S_N +為#為#S_N +什#什#B_N +麼#麼#E_N +生#生#B_N +活#活#E_N +會#會#S_N 
+與#與#S_N +她#她#S_N +想#想#B_N +像#像#E_N +的#的#S_N +不#不#S_N +一#一#B_N +樣#樣#E_N +.#.#S_N + +就#就#B_N +算#算#E_N +數#數#B_N +論#論#E_N +的#的#S_N +應#應#B_N +用#用#E_N +被#被#S_N +找#找#B_N +到#到#E_N +了#了#S_N +,#,#S_N +也#也#S_N +不#不#B_N +會#會#E_N +有#有#S_N +人#人#S_N +會#會#S_N +因#因#S_N +此#此#S_N +罷#罷#B_N +黜#黜#E_N +這#這#S_N +一#一#S_N +數#數#B_N +學#學#E_N +的#的#S_N +皇#皇#B_N +后#后#E_N +.#.#S_N + +一#一#B_N +中#中#E_N +商#商#B_N +圈#圈#E_N +另#另#S_N +一#一#S_N +特#特#B_N +色#色#E_N +為#為#S_N +同#同#S_N +類#類#B_N +型#型#E_N +商#商#B_N +店#店#E_N +會#會#S_N +聚#聚#B_N +集#集#E_N +,#,#S_N +短#短#B_N +短#短#E_N +的#的#S_N +育#育#B_N +才#才#E_N +街#街#S_N +聚#聚#B_N +集#集#E_N +了#了#S_N +十#十#B_N +數#數#E_N +家#家#S_N +知#知#B_N +名#名#E_N +眼#眼#B_N +鏡#鏡#E_N +連#連#B_N +鎖#鎖#E_N +店#店#S_N +,#,#S_N +而#而#S_N +體#體#B_N +育#育#E_N +用#用#B_N +品#品#E_N +店#店#S_N +沿#沿#B_N +著#著#E_N +太#太#B_N +平#平#E_N +路#路#S_N +連#連#B_N +成#成#E_N +一#一#S_N +線#線#S_N +,#,#S_N +在#在#S_N +激#激#B_N +烈#烈#E_N +競#競#B_N +爭#爭#E_N +下#下#S_N +價#價#B_N +格#格#E_N +比#比#S_N +外#外#B_N +地#地#E_N +便#便#B_N +宜#宜#E_N +不#不#B_N +少#少#E_N +,#,#S_N +貨#貨#S_N +比#比#S_N +三#三#S_N +家#家#S_N +更#更#S_N +增#增#B_N +加#加#E_N +購#購#B_N +物#物#E_N +樂#樂#B_N +趣#趣#E_N +.#.#S_N + +《#《#S_N +一#一#S_N +代#代#S_N +女#女#B_N +皇#皇#E_N +》#》#S_N +開#開#B_N +錄#錄#E_N +當#當#B_N +日#日#E_N +掌#掌#B_N +鏡#鏡#E_N +者#者#S_N +是#是#S_N +導#導#B_N +播#播#E_N +出#出#B_N +身#身#E_N +的#的#S_N +當#當#B_N +時#時#E_N +中#中#B_N +視#視#E_N +節#節#B_N +目#目#E_N +部#部#S_N +經#經#B_N +理#理#E_N +王#王#S_N +世#世#B_N +綱#綱#E_N +.#.#S_N + +我#我#B_N +們#們#E_N +只#只#S_N +希#希#B_N +望#望#E_N +,#,#S_N +藉#藉#B_N +著#著#E_N +這#這#S_N +個#個#S_N +歷#歷#B_N +史#史#E_N +上#上#S_N +真#真#B_N +實#實#E_N +人#人#B_N +物#物#E_N +的#的#S_N +一#一#S_N +生#生#S_N +,#,#S_N +利#利#B_N +用#用#E_N +一#一#B_N +些#些#E_N +稗#稗#B_N +官#官#M_N +野#野#M_N +史#史#E_N +的#的#S_N +片#片#B_N +段#段#E_N +資#資#B_N +料#料#E_N +,#,#S_N +再#再#S_N +加#加#B_N +上#上#E_N +一#一#B_N +些#些#E_N +善#善#B_N +意#意#E_N +改#改#B_N +編#編#E_N +的#的#S_N +部#部#B_N +分#分#E_N +情#情#B_N +節#節#E_N +,#,#S_N +而#而#S_N +能#能#S_N +帶#帶#B_N +給#給#E_N +觀#觀#B_N +眾#眾#E_N +一#一#B_N +些#些#E_N +啟#啟#B_N +示#示#E_N +.#.#S_N +」#」#S_N + +當#當#B_N +時#時#E_N +外#外#B_N +界#界#E_N +傳#傳#B_N +聞#聞#E_N 
+樊#樊#S_N +日#日#B_N +行#行#E_N +是#是#S_N +在#在#S_N +中#中#B_N +視#視#E_N +主#主#B_N +管#管#E_N +授#授#B_N +意#意#E_N +下#下#S_N +裝#裝#B_N +病#病#E_N +,#,#S_N +樊#樊#S_N +日#日#B_N +行#行#E_N +否#否#B_N +認#認#E_N +:#:#S_N +「#「#S_N +人#人#S_N +都#都#B_N +是#是#E_N +吃#吃#S_N +五#五#S_N +穀#穀#S_N +雜#雜#B_N +糧#糧#E_N +長#長#B_N +大#大#E_N +,#,#S_N +本#本#B_N +來#來#E_N +就#就#S_N +會#會#S_N +生#生#B_N +病#病#E_N +;#;#S_N +而#而#B_N +且#且#E_N +裝#裝#B_N +病#病#E_N +萬#萬#B_N +一#一#E_N +被#被#S_N +拆#拆#B_N +穿#穿#E_N +了#了#S_N +,#,#S_N +豈#豈#S_N +不#不#B_N +是#是#E_N +無#無#B_N +法#法#E_N +對#對#S_N +廣#廣#B_N +大#大#E_N +的#的#S_N +觀#觀#B_N +眾#眾#E_N +交#交#B_N +代#代#E_N +?#?#S_N + +該#該#B_N +劇#劇#E_N +兩#兩#S_N +位#位#S_N +編#編#B_N +劇#劇#E_N +獨#獨#B_N +孤#孤#E_N +紅#紅#S_N +、#、#S_N +羅#羅#S_N +文#文#B_N +忠#忠#E_N +在#在#S_N +寫#寫#S_N +劇#劇#B_N +本#本#E_N +時#時#S_N +,#,#S_N +幾#幾#B_N +乎#乎#E_N +參#參#B_N +考#考#E_N +了#了#S_N +所#所#B_N +有#有#E_N +有#有#B_N +關#關#E_N +武#武#S_N +則#則#B_N +天#天#E_N +的#的#S_N +資#資#B_N +料#料#E_N +如#如#S_N +林#林#S_N +語#語#B_N +堂#堂#E_N +《#《#S_N +武#武#S_N +則#則#B_N +天#天#E_N +傳#傳#S_N +》#》#S_N +、#、#S_N +陳#陳#S_N +虹#虹#S_N +《#《#S_N +武#武#S_N +則#則#B_N +天#天#E_N +與#與#S_N +狄#狄#S_N +仁#仁#B_N +傑#傑#E_N +》#》#S_N +、#、#S_N +《#《#S_N +歷#歷#B_N +代#代#E_N +通#通#B_N +俗#俗#E_N +演#演#B_N +義#義#E_N +》#》#S_N +、#、#S_N +童#童#S_N +煦#煦#S_N +《#《#S_N +中#中#B_N +國#國#E_N +后#后#B_N +妃#妃#E_N +列#列#B_N +傳#傳#E_N +》#》#S_N +、#、#S_N +黃#黃#S_N +柏#柏#B_N +松#松#E_N +《#《#S_N +中#中#B_N +國#國#E_N +艷#艷#B_N +姬#姬#E_N +》#》#S_N +.#.#B_N +.#.#M_N +.#.#M_N +.#.#M_N +.#.#M_N +.#.#E_N +等#等#S_N +.#.#S_N + +電#電#B_N +視#視#E_N +台#台#S_N +把#把#S_N +一#一#S_N +個#個#S_N +跋#跋#B_N +扈#扈#E_N +專#專#B_N +橫#橫#E_N +、#、#S_N +亂#亂#B_N +倫#倫#E_N +篡#篡#B_N +國#國#E_N +的#的#S_N +武#武#S_N +則#則#B_N +天#天#E_N +演#演#B_N +成#成#E_N +柔#柔#B_N +弱#弱#E_N +堪#堪#B_N +憐#憐#E_N +的#的#S_N +女#女#B_N +子#子#E_N +,#,#S_N +這#這#S_N +是#是#S_N +我#我#B_N +們#們#E_N +所#所#S_N +不#不#B_N +能#能#E_N +容#容#B_N +忍#忍#E_N +的#的#S_N +.#.#S_N + +U#U#B_N +M#M#M_N +L#L#M_N +S#S#E_N +可#可#S_N +用#用#S_N +於#於#S_N +設#設#B_N +計#計#E_N +信#信#B_N +息#息#E_N +檢#檢#B_N +索#索#E_N +或#或#S_N +病#病#B_N +歷#歷#E_N +系#系#B_N +統#統#E_N +,#,#S_N +促#促#B_N +進#進#E_N +不#不#B_N 
+同#同#E_N +系#系#B_N +統#統#E_N +之#之#B_N +間#間#E_N +的#的#S_N +通#通#B_N +訊#訊#E_N +交#交#B_N +流#流#E_N +,#,#S_N +或#或#B_N +者#者#E_N +用#用#S_N +於#於#S_N +開#開#B_N +發#發#E_N +能#能#B_N +夠#夠#E_N +解#解#B_N +析#析#E_N +生#生#B_N +物#物#E_N +醫#醫#B_N +學#學#E_N +文#文#B_N +獻#獻#E_N +的#的#S_N +系#系#B_N +統#統#E_N +.#.#S_N + +一#一#S_N +支#支#S_N +由#由#S_N +白#白#B_N +人#人#E_N +軍#軍#B_N +官#官#E_N +帶#帶#B_N +領#領#E_N +的#的#S_N +黑#黑#B_N +人#人#E_N +民#民#B_N +兵#兵#E_N +洗#洗#B_N +劫#劫#E_N +了#了#S_N +卡#卡#B_N +梅#梅#M_N +隆#隆#E_N +莊#莊#B_N +園#園#E_N +,#,#S_N +並#並#B_N +且#且#E_N +企#企#B_N +圖#圖#E_N +強#強#B_N +暴#暴#E_N +卡#卡#B_N +梅#梅#M_N +隆#隆#E_N +家#家#S_N +的#的#S_N +每#每#S_N +一#一#S_N +個#個#S_N +女#女#B_N +眷#眷#E_N +,#,#S_N +但#但#S_N +邦#邦#B_N +聯#聯#E_N +軍#軍#B_N +隊#隊#E_N +擊#擊#B_N +潰#潰#E_N +這#這#B_N +些#些#E_N +民#民#B_N +兵#兵#E_N +,#,#S_N +拯#拯#B_N +救#救#E_N +了#了#S_N +她#她#B_N +們#們#E_N +.#.#S_N + +2006##S_N +年#年#S_N +剩#剩#B_N +下#下#E_N +的#的#S_N +時#時#B_N +光#光#E_N +,#,#S_N +樂#樂#B_N +團#團#E_N +都#都#S_N +用#用#S_N +來#來#S_N +編#編#B_N +寫#寫#E_N +與#與#S_N +錄#錄#B_N +製#製#E_N +樂#樂#B_N +團#團#E_N +的#的#S_N +第#第#B_N +三#三#E_N +張#張#S_N +專#專#B_N +輯#輯#E_N +閒#閒#B_N +言#言#M_N +閒#閒#M_N +語#語#E_N +就#就#B_N +是#是#E_N +不#不#B_N +道#道#E_N +自#自#B_N +己#己#E_N +,#,#S_N +該#該#S_N +專#專#B_N +輯#輯#E_N +並#並#S_N +在#在#S_N +2007##S_N +年#年#S_N +5##S_N +月#月#S_N +發#發#B_N +行#行#E_N +.#.#S_N + +當#當#B_N +時#時#E_N +他#他#S_N +對#對#S_N +巴#巴#B_N +黎#黎#E_N +這#這#S_N +個#個#S_N +法#法#B_N +國#國#E_N +首#首#B_N +都#都#E_N +興#興#B_N +奮#奮#E_N +不#不#B_N +已#已#E_N +,#,#S_N +並#並#S_N +決#決#B_N +定#定#E_N +將#將#S_N +各#各#B_N +種#種#E_N +印#印#B_N +象#象#E_N +和#和#S_N +想#想#B_N +像#像#E_N +化#化#B_N +成#成#E_N +音#音#B_N +樂#樂#E_N +作#作#S_N +為#為#S_N +表#表#B_N +達#達#E_N +;#;#S_N +他#他#S_N +在#在#S_N +巴#巴#B_N +黎#黎#E_N +譜#譜#B_N +寫#寫#E_N +此#此#S_N +樂#樂#B_N +曲#曲#E_N +,#,#S_N +在#在#S_N +一#一#S_N +次#次#S_N +前#前#B_N +往#往#E_N +維#維#B_N +也#也#M_N +納#納#E_N +行#行#B_N +程#程#E_N +的#的#S_N +途#途#B_N +中#中#E_N +進#進#B_N +行#行#E_N +配#配#B_N +器#器#E_N +工#工#B_N +作#作#E_N +,#,#S_N +首#首#B_N +演#演#E_N +則#則#S_N +在#在#S_N +紐#紐#B_N +約#約#E_N +,#,#S_N +於#於#S_N +1928##S_N +年#年#S_N +12##S_N +月#月#S_N +31##S_N +日#日#S_N +由#由#S_N +瓦#瓦#B_N 
+德#德#E_N +·#·#S_N +丹#丹#B_N +路#路#M_N +殊#殊#E_N +(#(#S_N +W#W#B_N +a#a#M_N +l#l#M_N +t#t#M_N +e#e#M_N +r#r#E_N +D#D#B_N +a#a#M_N +m#m#M_N +r#r#M_N +o#o#M_N +s#s#M_N +c#c#M_N +h#h#E_N +)#)#S_N +指#指#B_N +揮#揮#E_N +紐#紐#B_N +約#約#E_N +愛#愛#B_N +樂#樂#E_N +演#演#B_N +出#出#E_N +.#.#S_N + +而#而#S_N +雪#雪#B_N +莉#莉#E_N +的#的#S_N +卧#卧#B_N +底#底#E_N +身#身#B_N +份#份#E_N +也#也#S_N +被#被#S_N +拆#拆#B_N +穿#穿#E_N +,#,#S_N +原#原#B_N +來#來#E_N +雪#雪#B_N +莉#莉#E_N +的#的#S_N +真#真#B_N +正#正#E_N +身#身#B_N +份#份#E_N +是#是#S_N +雲#雲#B_N +想#想#E_N +集#集#B_N +團#團#E_N +董#董#B_N +事#事#E_N +長#長#S_N +高#高#S_N +超#超#S_N +之#之#S_N +女#女#S_N +高#高#S_N +蓓#蓓#S_N +,#,#S_N +受#受#S_N +後#後#B_N +母#母#E_N +品#品#S_N +緹#緹#S_N +(#(#S_N +王#王#S_N +琳#琳#S_N +飾#飾#S_N +)#)#S_N +指#指#B_N +使#使#E_N +,#,#S_N +化#化#B_N +名#名#E_N +雪#雪#B_N +莉#莉#E_N +潛#潛#B_N +入#入#E_N +天#天#B_N +衣#衣#E_N +集#集#B_N +團#團#E_N +搞#搞#B_N +垮#垮#E_N +天#天#B_N +衣#衣#E_N +.#.#S_N + +靜#靜#B_N +華#華#E_N +後#後#B_N +來#來#E_N +發#發#B_N +現#現#E_N +葉#葉#S_N +廣#廣#B_N +義#義#E_N +不#不#B_N +愛#愛#E_N +自#自#B_N +己#己#E_N +便#便#S_N +偷#偷#B_N +取#取#E_N +美#美#B_N +麗#麗#E_N +的#的#S_N +設#設#B_N +計#計#E_N +創#創#B_N +意#意#E_N +投#投#B_N +靠#靠#E_N +雲#雲#B_N +想#想#E_N +集#集#B_N +團#團#E_N +.#.#S_N + +吳#吳#S_N +湯#湯#B_N +興#興#E_N +將#將#S_N +髮#髮#B_N +辮#辮#E_N +交#交#B_N +給#給#E_N +妻#妻#B_N +子#子#E_N +,#,#S_N +並#並#B_N +且#且#E_N +囑#囑#B_N +咐#咐#E_N +若#若#S_N +不#不#B_N +幸#幸#E_N +戰#戰#B_N +死#死#E_N +,#,#S_N +以#以#S_N +髮#髮#S_N +為#為#S_N +塚#塚#S_N +.#.#S_N diff --git a/tests/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt b/tests/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt new file mode 100755 index 00000000..707e48e8 --- /dev/null +++ b/tests/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt @@ -0,0 +1,6 @@ +the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 
0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581 +of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375 +to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044 +and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 -0.51332 -0.47368 -0.33075 -0.13834 0.2702 0.30938 -0.45012 -0.4127 -0.09932 0.038085 0.029749 0.10076 -0.25058 -0.51818 0.34558 0.44922 0.48791 -0.080866 -0.10121 -1.3777 -0.10866 -0.23201 0.012839 -0.46508 3.8463 0.31362 0.13643 -0.52244 0.3302 0.33707 -0.35601 0.32431 0.12041 0.3512 -0.069043 0.36885 0.25168 -0.24517 0.25381 0.1367 -0.31178 -0.6321 -0.25028 -0.38097 +in 0.33042 0.24995 -0.60874 0.10923 0.036372 0.151 -0.55083 -0.074239 -0.092307 -0.32821 0.09598 -0.82269 -0.36717 -0.67009 0.42909 0.016496 -0.23573 0.12864 -1.0953 0.43334 0.57067 -0.1036 0.20422 0.078308 -0.42795 -1.7984 -0.27865 0.11954 -0.12689 0.031744 3.8631 -0.17786 -0.082434 -0.62698 0.26497 -0.057185 -0.073521 0.46103 0.30862 0.12498 -0.48609 -0.0080272 0.031184 -0.36576 -0.42699 0.42164 -0.11666 -0.50703 -0.027273 -0.53285 +a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 -0.24909 -0.44185 
-0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796 \ No newline at end of file diff --git a/tests/data_for_tests/embedding/small_static_embedding/word2vec_test.txt b/tests/data_for_tests/embedding/small_static_embedding/word2vec_test.txt new file mode 100755 index 00000000..c16170f2 --- /dev/null +++ b/tests/data_for_tests/embedding/small_static_embedding/word2vec_test.txt @@ -0,0 +1,7 @@ +5 50 +the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581 +of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375 +to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044 +and 
0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 -0.51332 -0.47368 -0.33075 -0.13834 0.2702 0.30938 -0.45012 -0.4127 -0.09932 0.038085 0.029749 0.10076 -0.25058 -0.51818 0.34558 0.44922 0.48791 -0.080866 -0.10121 -1.3777 -0.10866 -0.23201 0.012839 -0.46508 3.8463 0.31362 0.13643 -0.52244 0.3302 0.33707 -0.35601 0.32431 0.12041 0.3512 -0.069043 0.36885 0.25168 -0.24517 0.25381 0.1367 -0.31178 -0.6321 -0.25028 -0.38097 +in 0.33042 0.24995 -0.60874 0.10923 0.036372 0.151 -0.55083 -0.074239 -0.092307 -0.32821 0.09598 -0.82269 -0.36717 -0.67009 0.42909 0.016496 -0.23573 0.12864 -1.0953 0.43334 0.57067 -0.1036 0.20422 0.078308 -0.42795 -1.7984 -0.27865 0.11954 -0.12689 0.031744 3.8631 -0.17786 -0.082434 -0.62698 0.26497 -0.057185 -0.073521 0.46103 0.30862 0.12498 -0.48609 -0.0080272 0.031184 -0.36576 -0.42699 0.42164 -0.11666 -0.50703 -0.027273 -0.53285 +a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 -0.24909 -0.44185 -0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796 \ No newline at end of file diff --git a/tests/data_for_tests/io/20ng/dev.csv b/tests/data_for_tests/io/20ng/dev.csv new file mode 100755 index 00000000..1cfb7c56 --- /dev/null +++ b/tests/data_for_tests/io/20ng/dev.csv @@ -0,0 +1,6 @@ +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject clarification organization cookamunga tourist bureau lines 14 sorry , san jose based rosicrucian order called r c , n't remember time stand r c ordo rosae crucis , words latin order rose cross sigh , seems loosing long term memory otherwise headquarters san jose pretty decent metaphysical bookstore , interested books son loves run around egyptian museum cheers , kent sandvik newton apple com alink 
ksand private activities net" +talk.religion.misc,"subject catholic lit nunnally acs harding edu \( john nunnally \) distribution world organization harding university , , ar nntp posting host acs harding edu x news reader vms news 1 reply dlphknob camelot bradley edu 's message 16 apr 93 18 57 20 gmtlines 45 lines 45 dlphknob camelot dlphknob camelot bradley edu writes 1993apr14 476 mtechca maintech com foster mtechca maintech com writes surprised saddened would expect kind behavior evangelical born gospel thumping face 're true christian protestants , always thought catholics behaved better please stoop level e b g f w c protestants , think best way witness strident , intrusive , loud , insulting self righteous \( pleading mode \) please ! i'm begging ! quit confusing religious groups , stop making generalizations ! i'm protestant ! i'm evangelical ! n't believe way way ! i'm creation scientist ! n't think homosexuals hung ! want discuss bible thumpers , would better singling \( making generalizations \) fundamentalists compared actions methodists southern baptists , would think different religions ! sarcasm sure pick correct groups bible thumpers , fundamentalists , southern baptists deserve hasty generalizations prejudicial statements n't pick methodists ! sarcasm please , prejudice thinking people group , please n't write protestants evangelicals ! 
\( pleading mode \) god wish could get ahold thomas stories n , n tha gb , gb n yvan sasha david cole iv chief research dlphknob camelot bradley edu" +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject alt sex stories literary critical analysis \) organization cookamunga tourist bureau lines 16 article h7v agate berkeley edu , dzkriz ocf berkeley edu \( dennis kriz \) wrote i'm going try something , perhaps many would thought even possible want begin process initiating literary critical study pornography posted alt sex stories , identify major themes motifs present stories posted opening possibility objective moral evaluation material present dennis , i'm astounded n't know interested even study filth alt sex stories provide cheers , kent sandvik newton apple com alink ksand private activities net" +talk.religion.misc,"anthony landreneau ozonehole com \( anthony landreneau \) subject abortion distribution world organization ozone online operations , inc , dba ozone hole bbs reply anthony landreneau ozonehole com \( anthony landreneau \) lines 21 margoli watson ibm com \( larry margolis \) anthony landreneau ozonehole com lm rape passed , nothing ever take away lm true forcing remain pregnant continues violation lm body another 9 months see unbelievably cruel life violation cruel , killing living solely friend right cold anthony slmr 2 1 's difference orange \? ozone hole bbs private bulletin board service \( 504 \) 891 3142 3 full service nodes usrobotics 16 8k bps 10 gigs 100 , 000 files skydive new orleans ! rime network mail hub 500 usenet newsgroups please route questions inquiries postmaster ozonehole com" +talk.religion.misc,"kevin rotag mi org \( kevin darcy \) subject 2000 years , say christian morality organization , \? \? \? 
lines 15 article pww spac at1 59 rice edu pww spacsun rice edu \( peter walker \) writes article 1993apr18 rotag mi org , kevin rotag mi org \( kevin darcy \) wrote , one , considered intentionality primary ontological stuff built perceptions , consciousness , thoughts , etc frank means alone seeing intentionality \( values , puts \) underlying human experience , even called objective experiences , measurements natural world , output des chip others us see intellectual masturbation 'll defer greater firsthand knowledge matters kevin" +talk.religion.misc,"bil okcforum osrhe edu \( bill conner \) subject 2000 years , say christian morality nntp posting host okcforum osrhe edu organization okcforum unix users group x newsreader tin version 1 1 pl9 lines 54 mind , say science basis values bit reach science basis observable fact 'd say one chooses observe observation interpreted significance 's given depends great deal values observer science human activity , subject potential distortion human activity myth scientists moral influence ethical concern , knowledge whole pure nature biases scientist , nonsense bill one argue objective values \( moral sense \) one must first start demonstrating morality objective considering meaning word objective doubt ever happen , back original question objective morality \? 
may unfortunate choice words , almost self contradictory objective sense used means something immutable absolute morality describes behavior group people first term inclusive , second specific concept supposedly described may meaning however god described christians \( instance \) , existence apart independent humankind existence outside frame reference \( reality \) declares thing , necessarily since defined omnipotent , claims believed , least omnipotent relative us god intrinsically self defined reality whatever says objective sense god determines standard conduct , standard objective human beings held accountable conformance standard permitted ignore , substitute relative morality mode conduct , giving term morality nebulous , meaningless sense argued pretending misunderstand standard objective conduct required meet standard therefore objectively determined convenient pretend term morality infinitely , n't mean objective standard n't exist morality come mean little cultural norm , preferred conduct decent people , making seem subjective , derived absolute , objective , standard ironically , objective standard perfect accord true nature \( according christianity least \) , yet condemned contrary human , oppressive severe may due bill much amoral standard , like , 's x" diff --git a/tests/data_for_tests/io/20ng/test.csv b/tests/data_for_tests/io/20ng/test.csv new file mode 100755 index 00000000..b636bc65 --- /dev/null +++ b/tests/data_for_tests/io/20ng/test.csv @@ -0,0 +1,6 @@ +talk.religion.misc,"halat pooh bears \( jim halat \) subject 2000 years , say christian morality reply halat pooh bears \( jim halat \) lines 43 article 1993apr15 wam umd edu , wam umd edu \( jay stein objectively subjective \) writes horus ap mchp sni de frank d012s658 uucp \( frank o'dwyer \) discussion christianity objective morals question effective difference objective values exist , disagreement values subjective \? 
n't see difference saying absolute truth exists , people think lie truth relative \? think examples , first statement fundamental disagreement least two people second statement agreed upon put another way , someone says objective values exist agree values subjective jim halat" +talk.religion.misc,"halat pooh bears \( jim halat \) subject 2000 years , say christian morality reply halat pooh bears \( jim halat \) lines 17 article na4 horus ap mchp sni de , frank d012s658 uucp \( frank o'dwyer \) writes really \? n't know objective value \? offered people u , collectively , 1 land america , would sound like good deal \? happens subjective example people us would happen agree continue move price point people would accept probably would accept high enough number endpoints subjective scale given homes objective viewpoints jim halat" +talk.religion.misc,"halat pooh bears \( jim halat \) subject 2000 years , say christian morality reply halat pooh bears \( jim halat \) lines 34 article horus ap mchp sni de , frank d012s658 uucp \( frank o'dwyer \) writes firstly , science basis values , way round better explain objective atoms , get subjective values , go atoms objective n't even real scientists call atom nothing mathematical model describes certain physical , observable properties surroundings subjective objective , though , approach scientist takes discussing model observations objective science objective approach subjectively selected scientist objective case means specified , unchanging set rules colleagues use discuss science contrast objective morality may objective approach subjectively discuss beliefs morality exists objective morality also , science deals discuss observations physical world around us method discussion objective \( science discussion \) science makes claims know even sometimes observe simply gives us way discuss surroundings meaningful , consistent way think bohr said \( paraphrase \) science say physical world jim halat" 
+talk.religion.misc,"mwilson ncratl atlantaga ncr com \( mark wilson \) subject message mr president know happened \? organization ncr engineering manufacturing atlanta atlanta , ga lines 58 noose ecn purdue edu tbrent bank ecn purdue edu \( timothy j brent \) writes probably , n't pack heavy weaponry intent use please cite evidence intending use n't really think allowed keep stuff \? \? , tell live sure steer well clear check sig public also rights , placed individual society rights individuals rights go ahead , call commie , ok , commie 'd singing different tune exercised right rape daughter think right rape anyone \? wonder n't care others broke law , please indicate law feel koresh broke , convicted said crime threat society , feel owning guns makes threat society ou going start going knives baseball bats well feel someone spouts unpopular ideas definition threat society job simple simple think job assualt civilians support first , second , fourth , fifth , sixth , eighth amendment rights , lest taken away fbi davidians think 'll support \( except 2 \) words n't support mob rule n't prettier merely mob calls government ai n't charity using someone else 's money wilson 's theory relativity go back far enough , 're related mark wilson atlantaga ncr com" +talk.religion.misc,"alizard tweekco uucp \( lizard \) subject 14 apr 93 god 's promise 1 john 1 7 organization com systems bbs , , ca \( 510 \) 631 lines 20 starowl rahul net \( michael adams \) writes anyone netland process devising new religion , use lamb bull , already reserved please choose another animal , preferably one endangered species list washed blood barney dinosaur \? \) judging postings 've read usenet non usenet bbs conferences , barney definitely endangered species especially runs dark alley lizard lizard internet addresses alizard tweekco boo pacbell com \( preferred \) pacbell com ! boo ! tweekco ! 
alizard \( bang path \) alizard com \( backup \) pgp2 2 public key available request" +talk.religion.misc,"alizard tweekco uucp \( lizard \) subject oto , ancient order oriental templars organization com systems bbs , , ca \( 510 \) 631 lines 18 thyagi cup portal com \( thyagi morgoth nagasiva \) writes organization known present time ancient order oriental templars ordo templi orientis otherwise hermetic brotherhood light organization official e mail address days \? \( address sf bay area lodges , e g would \) 93 lizard lizard internet addresses alizard tweekco boo pacbell com \( preferred \) pacbell com ! boo ! tweekco ! alizard \( bang path \) alizard com \( backup \) pgp2 2 public key available request" diff --git a/tests/data_for_tests/io/20ng/train.csv b/tests/data_for_tests/io/20ng/train.csv new file mode 100755 index 00000000..55307ad6 --- /dev/null +++ b/tests/data_for_tests/io/20ng/train.csv @@ -0,0 +1,6 @@ +talk.religion.misc,"deane binah cc brandeis edu \( david matthew deane \) subject flaming nazis reply deane binah cc brandeis edu organization brandeis university lines 106 okay , 'll bite probably leave alone , heck article 1993apr14 422 sun0 urz uni heidelberg de , gsmith lauren iwr uni heidelberg de \( gene w smith \) writes article brewich hou tx us popec brewich hou tx us \( pope charles \) writes name guy responsible much uniforms , props used early nazis rallies name roehm , hitler claim came swastika business n't credit actual flag design party member dentist \? 
believe gives credit mein kampf killed early nazi purge many associates flaming homosexuals well know also trying find actual evidence common assertion recently postings groups soc history soc culture german uncovered net experts could provide well , i'm expert , histories nazi germany assert make reference several scandals occurred long night long knives impression got homosexuality portions sa common knowledge also , book \( homosexual author whose name escapes moment \) called homosexuals history asserts roehm heines homosexuals , well others roehm 's sa circle books say roehm associate , edmund heines , homosexual able find nothing beyond , suspect sort historical urban legend well , 're one germany n't believe history books , look primary sources us outside germany access seems plenty documented instances several scandals , fact knight long knives several sa members \( including heines \) found sleeping together , etc also believe people complaining sa 's homosexual activities \( young boys , etc \) histories 've read make convincing case none sounds like urban legend \( irving , notoriously unreliable historian , says funk , nazi finance minister , homosexual gives sources \) know next nothing irving nothing funk precisely know , would contradict history books read concerning existence homosexual nazis \? trying say historians taking part anti homosexual smear \? homosexual writers agree official history \? n't think would found truth roehm heines homosexuals \? would think would want homosexuality nazism one use connection two bash homosexuals case challenge anyone document claim going challenge historians point \( irving \) , burden proof track references find stories originate , one germany , close archival material people net found great deal evidence many flaming heterosexuals among nazis seems include worst ones hitler , himmler , goebbels , goering , , eichmann , many eh \? agenda \? prove nazis heterosexuals , bash heterosexuals \? 
bother nazis might homosexuals \? make homosexuals bad true \? course bisexuals \? half nazis \? n't know would difficult believe nazis homosexuals german officer corps ww1 , instance , notorious homosexuality numerous scandals rocked german govt late 19th early 20th century many kaiser 's friends prosecuted kaiser homosexual , germany army long tradition homosexuality , going far back prussian history back frederick great least , homosexual roehm product prussian officer tradition , old german army \( like english public school system \) , well known center homosexuality , would quite willing overlook roehm 's homosexuality addition , nazis complained homosexuality hitler youth hitler youth swallowed pre nazi youth groups , various pre war , bund , youth groups known promote ideals friendship , many cases , homosexuality seems unlikely plenty homosexual nazis , regardless official nazi dogmas concerning evils homosexuality suprise anyone \? homosexuality always existed , societies would unusual nazis exception , n't sources , think kind proof accept would citations archival material , access intend reread every book nazis modern homosexuality ever read n't time nothing stopping , however , chasing sources prove otherwise , though , stick established histories david matthew deane \( deane binah cc brandeis edu \) eternal bleak wind let gods speak softly us days hereafter \( ezra pound \)" +talk.religion.misc,"psyrobtw ubvmsd cc buffalo edu \( robert weiss \) subject 18 apr 93 god 's promise philippians 4 9 organization university buffalo lines 8 news software vax vms vnews 1 41 nntp posting host ubvmsd cc buffalo edu things , ye learned , received , heard , seen , god peace shall philippians 4 9" +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject 14 apr 93 god 's promise 1 john 1 7 organization cookamunga tourist bureau lines 14 article tweekco uucp , alizard tweekco uucp \( lizard \) wrote judging postings 've read usenet non usenet bbs 
conferences , barney definitely endangered species especially runs dark alley please , please n't make barney modern martyr saviour mythical figure , humans create religion name , life unbearable \) cheers , kent sandvik newton apple com alink ksand private activities net" +talk.religion.misc,"sandvik newton apple com \( kent sandvik \) subject disillusioned protestant finds christ organization cookamunga tourist bureau lines 23 article boi hp com , jburrill boi hp com \( jim burrill \) wrote jesus never taught concept trinity , deal following mat 28 18 jesus came said , authority heaven earth given mat 28 19 therefore go make disciples nations , baptizing name father son holy spirit , mat 28 20 teaching obey everything commanded surely always , end age jim , please , 's lame explanation trinity jesus provides baptizing people name three things ! trinity case , i'm wrong , assumed trinity implies god three entities , yet cheers , kent sandvik newton apple com alink ksand private activities net" +talk.religion.misc,"cutter gloster via mind org \( cutter \) subject biblical backing koresh 's 3 02 tape \( cites enclosed \) distribution world organization gordian knot , gloster , ga lines 22 netd susie sbc com \( \) writes article 20apr199301460499 utarlg uta edu b645zaw utarlg uta edu \( stephen think david koresh n't solid structure , sound biblical backing hour long tape broadcast , n't think anyone really cares solid structure sermon 's deaths 's responsible concern people think ought hold christ followers died hand romans also fault believing god , society reminds roman empire every day guess 'll log go watch american cutter gloster via mind org \( chris \) jobs easy person n't holt 's law" +talk.religion.misc,"subject albert sabin rfox charlie usd edu \( rich fox , univ south dakota \) reply rfox charlie usd edu organization university south dakota computer science dept nntp posting host charlie lines 91 article 1993apr15 nntpd2 cxo dec com , sharpe enet dec com 
\( system privileged account \) writes article 885 sunfish usd edu , rfox charlie usd edu \( rich fox , univ south dakota \) writes article 1993apr10 rambo atlanta dg com , wpr atlanta dg com \( bill rawlins \) writes earlier dialogue deleted perhaps read stop advancing bible evidence relating questions science jesus exist \? g wells great fallacy statement question origins based science alone nope , fallacy yep , science best determining religions handle rich , curious others award custody baby theists religion \? hope n't award custody , rich purposely used handle order avoid e , happens religions \( course like scientific creationism \) used best part indicate science currently time , domains mostly ignored also attempted brief , doubt confused matter aside , science written nobody seems argue theists , theologians better investigate magicians , , , athiests agnostics seems answer would vary individual individual i'm trying evasive , societal perspective , religion works hand , sometimes abused misused , many suffer , know net result seems positive , anthropological perspective human affairs might call neo insofar think masses ca n't get along without religion generally incapable n't , myriad reasons , main one seems promise immortality , immortality therefore seems theologians better equipped others mention answers suggest holds regardless truth answers simply people believe end , spiritual beliefs real scientific facts explanation \( caution take context \) suggest forever closed scientific investigation \? fact , n't think closed , least individuals n't group theoretical physicists argue matter created nothing big bang singularity \? approach might absence , except seems could argued something responsible nothing \? 
maybe something n't supernatural , maybe 's tough one people today grasp case , theory without empirical data explanation , question require data words , agree theorizing \( within scientific parameters \) scientific explaining answer , closed scientists , sense science currently inadequate data necessary improvement , seems long way , ever pretty convoluted hope 've made sense seems 200 years ago , question origin life earth considered open scientific agree generally prefer put way questions , , open inquiry enlightenment , reason questioning theological answers , , part , science thus born curiosity , eventually away largely leaving behind ignorant , selfish , intolerant , arrogant , course , still claim authority four domains rich fox , anthro , usouthdakota like discussion around , figure original post \) much obliged funny facts tend things , n't \? well , sure plenty scientific creationist somewhere , even created nothing record , , modern humans best regards \) , rich fox , anthro , usouthdakota" diff --git a/tests/data_for_tests/io/BQCorpus/dev.txt b/tests/data_for_tests/io/BQCorpus/dev.txt new file mode 100755 index 00000000..2bd7414e --- /dev/null +++ b/tests/data_for_tests/io/BQCorpus/dev.txt @@ -0,0 +1,6 @@ +sentence1,sentence2,label +综合评分不足什么原因,综合评估的依据,0 +什么时候我能使用微粒贷,你就赶快给我开通就行了,0 +如何修改每个月的还款日期,可以申请延期还款日吗?,0 +没什么问的,不能登陆就是我最大的问题了,登录不上,1 +你的意思是不能取现,借到的钱可不可以提出来,1 diff --git a/tests/data_for_tests/io/BQCorpus/test.txt b/tests/data_for_tests/io/BQCorpus/test.txt new file mode 100755 index 00000000..949583ad --- /dev/null +++ b/tests/data_for_tests/io/BQCorpus/test.txt @@ -0,0 +1,6 @@ +sentence1,sentence2,label +你电话号码多少,你们的客服电话是多少?,1 +10000块日利息是多少,0.05%就是借2000块,利息为1块钱一天,1 +17号还款了,我现在想提前几天还,怎么弄,一直按时还款,提前还款,怎么会评估不足,0 +我昨晚申请的,现在钱没到,也没有人联系我,审核多久才会打电话,1 +假如我贷四万还款怎么,18号还款日可以不凌晨扣款,我18日下午还款可以吗,0 diff --git a/tests/data_for_tests/io/BQCorpus/train.txt b/tests/data_for_tests/io/BQCorpus/train.txt new file mode 100755 index 00000000..f2ac4e84 --- /dev/null +++ 
b/tests/data_for_tests/io/BQCorpus/train.txt @@ -0,0 +1,6 @@ +sentence1,sentence2,label +一天了还是不能登录,你好,用app干嘛但是无法登入,1 +为什么我的钱包点开,没显示微粒贷呀,点击我进入钱包,没有,借款的,提示呀!,1 +什么要求,借款没有,0 +微信注册的手机号停机了,还可以办理吗,没有邀请可以注册嘛,0 +开通微粒贷,开通微粒贷!强烈要求,1 diff --git a/tests/data_for_tests/io/ChnSentiCorp/dev.txt b/tests/data_for_tests/io/ChnSentiCorp/dev.txt new file mode 100755 index 00000000..9387b569 --- /dev/null +++ b/tests/data_for_tests/io/ChnSentiCorp/dev.txt @@ -0,0 +1,7 @@ +label text_a +1 基金痛所有投资项目一样,必须先要有所了解,才能把握分寸,不至于跟风而造成损失。此本基金入门的书是一个不错的选择,不像一般的书一样偏重概念,虽然也涉及到概念,但作者用自己的方式解读,使第一次接触基金的人能更好的理解。内容以非常容易理解的语言象大众普及了基金的很多观念,对于普通基民来说,要想有所收获,必须了解基金界的很多情况,在关键的时候才不会盲目跟风。对于新手,强烈推荐。 +1 系统很好装,LED屏是不错,就是16比9的比例看起来比较长,是14.0的屏。外观比较酷,适合年轻人,键盘模仿SONY的,还不错。 +1 这书的装帧很好的,既适合家庭收藏亦适合阅读了解。了解一个人,通过他的书信,而且是家书,再好不过了,而了解这个人也更了解些那个时代,那个社会,给我们现代人些许启发吧。而我从中也知道了他的学习习惯、方法以及教子方面。比较有收获。软精装的封面,封面要是每个唐老师那个照片就更好了,分上下册便于阅读。内里字体有分别:信是用的启功老师的手写字体,评点是宋体。 +0 屏幕没有坏点和暗点,这个比较不错。配置性价比较高,目前使用已有半个月,基本正常。 +0 典型的国营酒店,管理层缺乏责任心,管理混乱。房间里的大灯镜灯台灯都是坏的,只有一盏床头灯可用,不知道酒店是怎么维护的。最可气的是结帐时竟然要求客人赔偿房间里已损坏很久的鞋盒,简直是讹诈。 +0 普通游客旅馆 还三星 让我伤心 店名好大 奇差无比 补充点评 2006年12月8日 : 还说有地下车库 谁敢下去 晕 狭小 黑暗 要卡壳儿的 CTRIP上怎么让它这么忽悠顾客的 ?!!!!!!! diff --git a/tests/data_for_tests/io/ChnSentiCorp/test.txt b/tests/data_for_tests/io/ChnSentiCorp/test.txt new file mode 100755 index 00000000..35f7d2c5 --- /dev/null +++ b/tests/data_for_tests/io/ChnSentiCorp/test.txt @@ -0,0 +1,7 @@ +label text_a +0 v系统和XP系统能做到二选一就更好了,毕竟大部分人还是更偏爱XP系统。 +0 自带的Linix系统上上网还可以,想玩其他的功能毫无疑问得换XP.偶在京东订的时候为了装XP方便,一起买了阿帕奇的USB光驱。到货后,发现该USB光驱无法引导系统光盘启动,已验证过该光驱读写功能正常。 +1 非常不错的酒店,依山傍水,里面大片森林,散散步很不错,坐在湖边也休息也是不错的选择;房间很幽静,房间的设施很好,服务员态度也很好。 +0 5月8日付款成功,当当网显示5月10日发货,可是至今还没看到货物,也没收到任何通知,简不知怎么说好!!! +1 收到书,还未打开就被封面的鲜艳色彩及版样吸引,迫不急待的打开,书内的设计及彩图也不错,色泽及印刷质量都称的上好,没有味道,贴图也从简入深。价格也不贵。拿回家,小宝贝也很喜欢,我家宝宝只有2岁5个月对于她贴片不太好撕,大一些的贴片要我来帮她撕。不过,今天再玩时已经比昨天撕的好很多了,可以锻炼她的小手呢。等这几本用完了,我想我还会再给她买一些类似的书。 +0 挺失望的,还不如买一本张爱玲文集呢,以<色戒>命名,可这篇文章仅仅10多页,且无头无尾的,完全比不上里面的任意一篇其它文章. 
diff --git a/tests/data_for_tests/io/ChnSentiCorp/train.txt b/tests/data_for_tests/io/ChnSentiCorp/train.txt new file mode 100755 index 00000000..9e53f1bd --- /dev/null +++ b/tests/data_for_tests/io/ChnSentiCorp/train.txt @@ -0,0 +1,7 @@ +label text_a +1 很好的酒店,很规范植得一住.餐厅一般不应该的,不知道为什么. 宾馆反馈 2008年4月17日 : 餐厅现已重新装修,用餐环境较以前要好的多。谢谢您的宝贵意见! +0 这是我看过文字写得很糟糕的书,因为买了,还是耐着性子看完了,但是总体来说不好,文字、内容、结构都不好 +1 拿房时没大床房了,给我们免费升成套房,这点还蛮满意的。酒店大致不错,有国内五星水准。比国际品牌的要差一点。酒店有点年纪了,维修要加强,比如我们浴室的下水就堵塞不通,这些在客人入住前就该发觉修好。其它都还可以。 +1 开始看了2005年的几位朋友的评价,都不敢去入住。没想到现在改观了很多,房间虽小,但很整洁。下次再来的话,还会选择这个酒店。只是希望宽带能一直免费! +0 本机预装的Vista跟瑞星杀软不兼容,蓝屏,不能进入系统,不能自行卸载!!千万小心别装,用卡巴可以。 +0 跟心灵鸡汤没什么本质区别嘛,至少我不喜欢这样读经典,把经典都解读成这样有点去中国化的味道了 diff --git a/tests/data_for_tests/io/LCQMC/dev.txt b/tests/data_for_tests/io/LCQMC/dev.txt new file mode 100755 index 00000000..3e253c93 --- /dev/null +++ b/tests/data_for_tests/io/LCQMC/dev.txt @@ -0,0 +1,6 @@ +开初婚未育证明怎么弄? 初婚未育情况证明怎么开? 1 +脚气怎么治疗 醋怎么治疗脚气 0 +世界是先有男人还是先有女人 世界上是先有男人还是先有女人 1 +有什么小说软件好用的 那个看小说的阅读器较好 1 +网上兼职是做什么的,手机可以做吗 手机可以做什么网上兼职,拍单子是什么 0 +郑州有什么好玩的地方? 郑州有什么好玩的地方啊 1 diff --git a/tests/data_for_tests/io/LCQMC/test.txt b/tests/data_for_tests/io/LCQMC/test.txt new file mode 100755 index 00000000..bc694d3a --- /dev/null +++ b/tests/data_for_tests/io/LCQMC/test.txt @@ -0,0 +1,5 @@ +谁有狂三这张高清的 这张高清图,谁有 0 +淘宝模特叫什么?急 淘宝的模特她叫什么 1 +不要嘛用韩语怎么说 韩语的请不要走怎么说 0 +倒瓜子脸适合什么发型 额头高又是瓜子脸的女生适合什么刘海 0 +淘宝流量怎么买 刚淘宝店如何才能有流量 0 diff --git a/tests/data_for_tests/io/LCQMC/train.txt b/tests/data_for_tests/io/LCQMC/train.txt new file mode 100755 index 00000000..9f6d4924 --- /dev/null +++ b/tests/data_for_tests/io/LCQMC/train.txt @@ -0,0 +1,6 @@ +喜欢打篮球的男生喜欢什么样的女生 爱打篮球的男生喜欢什么样的女生 1 +你帮我设计小说的封面吧 谁能帮我给小说设计个封面? 0 +移动手机卡刷砖 关于移动手机卡 0 +有什么好听的短信铃声啊 有什么好听的韩剧短信铃声 0 +人生的三大事是什么 人生三大事是什么? 
1 +您好是后8位的 您提供后8位即可, 1 diff --git a/tests/data_for_tests/io/MNLI/dev_matched.tsv b/tests/data_for_tests/io/MNLI/dev_matched.tsv new file mode 100755 index 00000000..ace2dd27 --- /dev/null +++ b/tests/data_for_tests/io/MNLI/dev_matched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 label2 label3 label4 label5 gold_label +0 63735 63735n slate ( ( The ( new rights ) ) ( are ( nice enough ) ) ) ( Everyone ( really ( likes ( the ( newest benefits ) ) ) ) ) (ROOT (S (NP (DT The) (JJ new) (NNS rights)) (VP (VBP are) (ADJP (JJ nice) (RB enough))))) (ROOT (S (NP (NN Everyone)) (VP (ADVP (RB really)) (VBZ likes) (NP (DT the) (JJS newest) (NNS benefits))))) The new rights are nice enough Everyone really likes the newest benefits neutral entailment neutral neutral neutral neutral +1 91383 91383c government ( ( This site ) ( ( includes ( ( ( ( a list ) ( of ( all ( award winners ) ) ) ) and ) ( ( a ( searchable database ) ) ( of ( Government ( Executive articles ) ) ) ) ) ) . ) ) ( ( ( The ( Government ( Executive articles ) ) ) ( housed ( on ( the website ) ) ) ) ( ( ( are not ) ( able ( to ( be searched ) ) ) ) . ) ) (ROOT (S (NP (DT This) (NN site)) (VP (VBZ includes) (NP (NP (NP (DT a) (NN list)) (PP (IN of) (NP (DT all) (NN award) (NNS winners)))) (CC and) (NP (NP (DT a) (JJ searchable) (NN database)) (PP (IN of) (NP (NNP Government) (NNP Executive) (NNS articles)))))) (. .))) (ROOT (S (NP (NP (DT The) (NNP Government) (NNP Executive) (NNS articles)) (VP (VBN housed) (PP (IN on) (NP (DT the) (NN website))))) (VP (VBP are) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB be) (ADJP (JJ searched))))))) (. .))) This site includes a list of all award winners and a searchable database of Government Executive articles. The Government Executive articles housed on the website are not able to be searched. 
contradiction contradiction contradiction contradiction contradiction contradiction +2 755 755e telephone ( ( ( ( uh ( i ( ( do n't ) ( know ( ( i i ) ( have ( ( mixed emotions ) ( about ( him ( ( uh sometimes ) ( i ( like him ) ) ) ) ) ) ) ) ) ) ) ) but ) ( ( at ( the ( same times ) ) ) ( i ( love ( to ( see somebody ) ) ) ) ) ) ( beat him ) ) ( I ( ( ( ( ( ( like him ) ( for ( the ( most part ) ) ) ) , ) but ) ( ( would still ) ( enjoy ( seeing ( someone ( beat him ) ) ) ) ) ) . ) ) (ROOT (SINV (S (S (INTJ (UH uh)) (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP have) (VP (VBN mixed) (NP (NNS emotions)) (PP (IN about) (S (NP (PRP him)) (VP (VBG uh) (ADVP (RB sometimes)) (NP (NP (FW i)) (PP (IN like) (NP (PRP him))))))))))))))) (CC but) (S (PP (IN at) (NP (DT the) (JJ same) (NNS times))) (NP (FW i)) (VP (VBP love) (S (VP (TO to) (VP (VB see) (NP (NN somebody)))))))) (VP (VBD beat)) (NP (PRP him)))) (ROOT (S (NP (PRP I)) (VP (VP (VBP like) (NP (PRP him)) (PP (IN for) (NP (DT the) (JJS most) (NN part)))) (, ,) (CC but) (VP (MD would) (ADVP (RB still)) (VP (VB enjoy) (S (VP (VBG seeing) (S (NP (NN someone)) (VP (VB beat) (NP (PRP him))))))))) (. .))) uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him I like him for the most part, but would still enjoy seeing someone beat him. entailment entailment entailment entailment entailment entailment +3 78013 78013c telephone ( yeah ( ( i i ) ( think ( ( my ( favorite restaurant ) ) ( ( is always ) ( been ( ( the ( one closest ) ) ( you ( ( know ( the closest ) ) ( ( as long ) ( as ( it ( 's ( it ( meets ( ( the ( minimum criteria ) ) ( you ( know ( of ( good food ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( My ( favorite restaurants ) ) ( ( ( ( are always ) ( ( ( ( ( at least ) a ) hundred ) miles ) away ) ) ( from ( my house ) ) ) . 
) ) (ROOT (S (VP (VB yeah) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP think) (SBAR (S (NP (PRP$ my) (JJ favorite) (NN restaurant)) (VP (VBZ is) (ADVP (RB always)) (VP (VBN been) (NP (NP (DT the) (CD one) (JJS closest)) (SBAR (S (NP (PRP you)) (VP (VBP know) (NP (DT the) (JJS closest)) (ADVP (ADVP (RB as) (RB long)) (SBAR (IN as) (S (NP (PRP it)) (VP (VBZ 's) (SBAR (S (NP (PRP it)) (VP (VBZ meets) (NP (NP (DT the) (JJ minimum) (NNS criteria)) (SBAR (S (NP (PRP you)) (VP (VBP know) (PP (IN of) (NP (JJ good) (NN food))))))))))))))))))))))))))))) (ROOT (S (NP (PRP$ My) (JJ favorite) (NNS restaurants)) (VP (VBP are) (ADVP (RB always)) (ADVP (NP (QP (IN at) (JJS least) (DT a) (CD hundred)) (NNS miles)) (RB away)) (PP (IN from) (NP (PRP$ my) (NN house)))) (. .))) yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food My favorite restaurants are always at least a hundred miles away from my house. contradiction contradiction contradiction contradiction contradiction contradiction +4 96377 96377c telephone ( i ( ( do n't ) ( know ( um ( do ( you ( do ( ( a lot ) ( of camping ) ) ) ) ) ) ) ) ) ( I ( ( know exactly ) . ) ) (ROOT (S (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (SBAR (S (NP (NN um)) (VP (VBP do) (SBAR (S (NP (PRP you)) (VP (VBP do) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN camping)))))))))))))) (ROOT (S (NP (PRP I)) (VP (VBP know) (ADVP (RB exactly))) (. .))) i don't know um do you do a lot of camping I know exactly. 
contradiction contradiction contradiction contradiction contradiction contradiction diff --git a/tests/data_for_tests/io/MNLI/dev_mismatched.tsv b/tests/data_for_tests/io/MNLI/dev_mismatched.tsv new file mode 100755 index 00000000..a1da8897 --- /dev/null +++ b/tests/data_for_tests/io/MNLI/dev_mismatched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 label2 label3 label4 label5 gold_label +0 75290 75290c letters ( ( Your contribution ) ( ( helped ( make ( it ( possible ( for ( us ( to ( ( provide ( our students ) ) ( with ( a ( quality education ) ) ) ) ) ) ) ) ) ) ) . ) ) ( ( Your contributions ) ( ( were ( of ( ( no help ) ( with ( ( our ( students ' ) ) education ) ) ) ) ) . ) ) (ROOT (S (NP (PRP$ Your) (NN contribution)) (VP (VBD helped) (VP (VB make) (S (NP (PRP it)) (ADJP (JJ possible)) (SBAR (IN for) (S (NP (PRP us)) (VP (TO to) (VP (VB provide) (NP (PRP$ our) (NNS students)) (PP (IN with) (NP (DT a) (NN quality) (NN education)))))))))) (. .))) (ROOT (S (NP (PRP$ Your) (NNS contributions)) (VP (VBD were) (PP (IN of) (NP (NP (DT no) (NN help)) (PP (IN with) (NP (NP (PRP$ our) (NNS students) (POS ')) (NN education)))))) (. .))) Your contribution helped make it possible for us to provide our students with a quality education. Your contributions were of no help with our students' education. 
contradiction contradiction contradiction contradiction contradiction contradiction +1 133794 133794c verbatim ( ( ( ( ( ( The answer ) ( ( ( ( has nothing ) ( to ( do ( with ( their cause ) ) ) ) ) , ) however ) ) , ) but ) ( ( with ( ( ( ( ( ( ( ( the ( simple fact ) ) ( that ( dictionaries ( ( are not ) ( exercises ( in ( bi-unique substitutability ) ) ) ) ) ) ) ; ) ( in ( ( ( other words ) , ) ( if ( ( one ( of ( ( the senses ) ( of run ) ) ) ) ( ( is ` ) ( ( ( ( operate ' ) -LRB- ) ( as ( in ( She ( runs ( an ( engine factory ) ) ) ) ) ) ) -RRB- ) ) ) ) ) ) ) , ) ( that ( ( does not ) ( ( make it ) ( ( valid ( to ( assume ( that ( one ( can ( substitute ( ( operate ( for run ) ) ( in ( We ( ( run ( in ( ( the marathon ) ( every year ) ) ) ) . ) ) ) ) ) ) ) ) ) ) ) ( Although ( ( ( ( recognizing this ) ( as ( ( a shortcoming ) ( of dictionaries ) ) ) ) and ) ( ( ( assigning it ) arbitrarily ) ( to ( what ( , ( ( for ( lack ( of ( a ( better term ) ) ) ) ) ( , ( we ( might ( call ( ( the genius ) ( of ( the language ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) , ) ( might ( seem ( trivial ( to ( the ( casual observer ) ) ) ) ) ) ) ) ( , ( it ( is ( ( a ( valid matter ) ) ( for ( concern ( in ( ( the realm ) ( of lexicology ) ) ) ) ) ) ) ) ) ) ) . ) ( Dictionaries ( ( ( are indeed ) ( exercises ( in ( bi-unique substitutability ) ) ) ) . 
) ) (ROOT (S (S (NP (DT The) (NN answer)) (VP (VBZ has) (ADVP (NN nothing)) (S (VP (TO to) (VP (VB do) (PP (IN with) (NP (PRP$ their) (NN cause)))))) (, ,) (ADVP (RB however)))) (, ,) (CC but) (S (SBAR (IN with) (S (NP (NP (DT the) (JJ simple) (NN fact)) (SBAR (IN that) (S (NP (NNS dictionaries)) (VP (VBP are) (RB not) (NP (NP (NNS exercises)) (PP (IN in) (NP (JJ bi-unique) (NN substitutability))))))) (: ;) (PP (IN in) (NP (NP (JJ other) (NNS words)) (, ,) (SBAR (IN if) (S (NP (NP (CD one)) (PP (IN of) (NP (NP (DT the) (NNS senses)) (PP (IN of) (NP (NN run)))))) (VP (VBZ is) (`` `) (VP (VB operate) ('' ') (-LRB- -LRB-) (SBAR (RB as) (IN in) (S (NP (PRP She)) (VP (VBZ runs) (NP (DT an) (NN engine) (NN factory))))) (-RRB- -RRB-))))))) (, ,) (SBAR (WHNP (WDT that)) (S (VP (VBZ does) (RB not) (VP (VB make) (NP (PRP it)) (S (ADJP (JJ valid) (S (VP (TO to) (VP (VB assume) (SBAR (IN that) (S (NP (PRP one)) (VP (MD can) (VP (VB substitute) (VP (VB operate) (PP (IN for) (NP (NN run))) (SBAR (IN in) (S (NP (PRP We)) (VP (VB run) (PP (IN in) (NP (NP (DT the) (NN marathon)) (NP (DT every) (NN year)))) (. .))))))))))))) (SBAR (IN Although) (S (S (VP (VBG recognizing) (NP (DT this)) (PP (IN as) (NP (NP (DT a) (NN shortcoming)) (PP (IN of) (NP (NNS dictionaries))))))) (CC and) (S (VP (VBG assigning) (NP (PRP it)) (ADVP (RB arbitrarily)) (PP (TO to) (SBAR (WHNP (WP what)) (S (, ,) (PP (IN for) (NP (NP (NN lack)) (PP (IN of) (NP (DT a) (JJR better) (NN term))))) (, ,) (NP (PRP we)) (VP (MD might) (VP (VB call) (NP (NP (DT the) (NN genius)) (PP (IN of) (NP (DT the) (NN language)))))))))))))))))) (, ,)) (VP (MD might) (VP (VB seem) (ADJP (JJ trivial) (PP (TO to) (NP (DT the) (JJ casual) (NN observer)))))))) (, ,) (NP (PRP it)) (VP (VBZ is) (NP (NP (DT a) (JJ valid) (NN matter)) (PP (IN for) (NP (NP (NN concern)) (PP (IN in) (NP (NP (DT the) (NN realm)) (PP (IN of) (NP (NN lexicology)))))))))) (. 
.))) (ROOT (S (NP (NNS Dictionaries)) (VP (VBP are) (ADVP (RB indeed)) (NP (NP (NNS exercises)) (PP (IN in) (NP (JJ bi-unique) (NN substitutability))))) (. .))) The answer has nothing to do with their cause, however, but with the simple fact that dictionaries are not exercises in bi-unique substitutability; in other words, if one of the senses of run is `operate' (as in She runs an engine factory ), that does not make it valid to assume that one can substitute operate for run in We run in the marathon every year . Although recognizing this as a shortcoming of dictionaries and assigning it arbitrarily to what, for lack of a better term, we might call the genius of the language, might seem trivial to the casual observer, it is a valid matter for concern in the realm of lexicology. Dictionaries are indeed exercises in bi-unique substitutability. contradiction contradiction contradiction contradiction contradiction contradiction +2 3628 3628c verbatim ( We ( ( serve ( ( a ( classic ( Tuscan meal ) ) ) ( that ( includes ( ( a ( Florentine terrine ) ) ( made ( with ( dick ( and ( chicken livers ) ) ) ) ) ) ) ) ) ) . ) ) ( We ( ( serve ( ( a meal ) ( of ( Florentine terrine ) ) ) ) . ) ) (ROOT (S (NP (PRP We)) (VP (VBP serve) (NP (NP (DT a) (JJ classic) (NNP Tuscan) (NN meal)) (SBAR (WHNP (WDT that)) (S (VP (VBZ includes) (NP (NP (DT a) (JJ Florentine) (NN terrine)) (VP (VBN made) (PP (IN with) (NP (NN dick) (CC and) (NN chicken) (NNS livers)))))))))) (. .))) (ROOT (S (NP (PRP We)) (VP (VBP serve) (NP (NP (DT a) (NN meal)) (PP (IN of) (NP (NNP Florentine) (NN terrine))))) (. .))) We serve a classic Tuscan meal that includes a Florentine terrine made with dick and chicken livers. We serve a meal of Florentine terrine. 
contradiction neutral entailment entailment entailment entailment +3 89411 89411c letters ( ( ( A ( few months ) ) ago ) ( , ( ( ( ( Carl Newton ) and ) I ) ( ( ( wrote ( a letter ) ) ( asking ( you ( to ( ( consider ( a ( financial contribution ) ) ) ( to ( ( graduate Endodontics ) ( at ( Indiana University ) ) ) ) ) ) ) ) ) . ) ) ) ) ( ( ( ( Carl Newton ) and ) I ) ( ( ( have never ) ( ( had ( any ( other ( previous contact ) ) ) ) ( with you ) ) ) . ) ) (ROOT (S (ADVP (NP (DT A) (JJ few) (NNS months)) (RB ago)) (, ,) (NP (NP (NNP Carl) (NNP Newton)) (CC and) (NP (PRP I))) (VP (VBD wrote) (NP (DT a) (NN letter)) (S (VP (VBG asking) (S (NP (PRP you)) (VP (TO to) (VP (VB consider) (NP (DT a) (JJ financial) (NN contribution)) (PP (TO to) (NP (NP (JJ graduate) (NNS Endodontics)) (PP (IN at) (NP (NNP Indiana) (NNP University))))))))))) (. .))) (ROOT (S (NP (NP (NNP Carl) (NNP Newton)) (CC and) (NP (PRP I))) (VP (VBP have) (ADVP (RB never)) (VP (VBN had) (NP (DT any) (JJ other) (JJ previous) (NN contact)) (PP (IN with) (NP (PRP you))))) (. .))) A few months ago, Carl Newton and I wrote a letter asking you to consider a financial contribution to graduate Endodontics at Indiana University. Carl Newton and I have never had any other previous contact with you. contradiction contradiction contradiction contradiction contradiction contradiction +4 136158 136158e facetoface ( I ( ( was ( on ( ( this earth ) ( you ( know ( ( , ( ( I ( 've ( lived ( on ( ( this earth ) ( for ( some reason ) ) ) ) ) ) ) , ) ) ( I ( just ( ( do n't ) ( know ( what ( it ( is yet ) ) ) ) ) ) ) ) ) ) ) ) ) . ) ) ( I ( ( ( ( do n't ) yet ) ( ( know ( the reason ) ) ( why ( I ( have ( lived ( on earth ) ) ) ) ) ) ) . 
) ) (ROOT (S (NP (PRP I)) (VP (VBD was) (PP (IN on) (NP (NP (DT this) (NN earth)) (SBAR (S (NP (PRP you)) (VP (VBP know) (SBAR (S (PRN (, ,) (S (NP (PRP I)) (VP (VBP 've) (VP (VBN lived) (PP (IN on) (NP (NP (DT this) (NN earth)) (PP (IN for) (NP (DT some) (NN reason)))))))) (, ,)) (NP (PRP I)) (ADVP (RB just)) (VP (VBP do) (RB n't) (VP (VB know) (SBAR (WHNP (WP what)) (S (NP (PRP it)) (VP (VBZ is) (ADVP (RB yet))))))))))))))) (. .))) (ROOT (S (NP (PRP I)) (VP (VBP do) (RB n't) (ADVP (RB yet)) (VP (VB know) (NP (DT the) (NN reason)) (SBAR (WHADVP (WRB why)) (S (NP (PRP I)) (VP (VBP have) (VP (VBN lived) (PP (IN on) (NP (NN earth))))))))) (. .))) I was on this earth you know, I've lived on this earth for some reason, I just don't know what it is yet. I don't yet know the reason why I have lived on earth. entailment entailment entailment entailment entailment entailment diff --git a/tests/data_for_tests/io/MNLI/test_matched.tsv b/tests/data_for_tests/io/MNLI/test_matched.tsv new file mode 100755 index 00000000..b90c2d2a --- /dev/null +++ b/tests/data_for_tests/io/MNLI/test_matched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 +0 31493 31493 travel ( ( ( ( ( ( ( ( Hierbas , ) ( ans seco ) ) , ) ( ans dulce ) ) , ) and ) frigola ) ( ( ( are just ) ( ( a ( few names ) ) ( worth ( ( keeping ( a look-out ) ) for ) ) ) ) . ) ) ( Hierbas ( ( is ( ( a name ) ( worth ( ( looking out ) for ) ) ) ) . ) ) (ROOT (S (NP (NP (NNS Hierbas)) (, ,) (NP (NN ans) (NN seco)) (, ,) (NP (NN ans) (NN dulce)) (, ,) (CC and) (NP (NN frigola))) (VP (VBP are) (ADVP (RB just)) (NP (NP (DT a) (JJ few) (NNS names)) (PP (JJ worth) (S (VP (VBG keeping) (NP (DT a) (NN look-out)) (PP (IN for))))))) (. .))) (ROOT (S (NP (NNS Hierbas)) (VP (VBZ is) (NP (NP (DT a) (NN name)) (PP (JJ worth) (S (VP (VBG looking) (PRT (RP out)) (PP (IN for))))))) (. 
.))) Hierbas, ans seco, ans dulce, and frigola are just a few names worth keeping a look-out for. Hierbas is a name worth looking out for. +1 92164 92164 government ( ( ( The extent ) ( of ( the ( behavioral effects ) ) ) ) ( ( would ( ( depend ( in ( part ( on ( ( the structure ) ( of ( ( ( the ( individual ( account program ) ) ) and ) ( any limits ) ) ) ) ) ) ) ) ( on ( accessing ( the funds ) ) ) ) ) . ) ) ( ( Many people ) ( ( would ( be ( very ( unhappy ( to ( ( loose control ) ( over ( their ( own money ) ) ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (NP (DT The) (NN extent)) (PP (IN of) (NP (DT the) (JJ behavioral) (NNS effects)))) (VP (MD would) (VP (VB depend) (PP (IN in) (NP (NP (NN part)) (PP (IN on) (NP (NP (DT the) (NN structure)) (PP (IN of) (NP (NP (DT the) (JJ individual) (NN account) (NN program)) (CC and) (NP (DT any) (NNS limits)))))))) (PP (IN on) (S (VP (VBG accessing) (NP (DT the) (NNS funds))))))) (. .))) (ROOT (S (NP (JJ Many) (NNS people)) (VP (MD would) (VP (VB be) (ADJP (RB very) (JJ unhappy) (PP (TO to) (NP (NP (JJ loose) (NN control)) (PP (IN over) (NP (PRP$ their) (JJ own) (NN money)))))))) (. .))) The extent of the behavioral effects would depend in part on the structure of the individual account program and any limits on accessing the funds. Many people would be very unhappy to loose control over their own money. +2 9662 9662 government ( ( ( Timely access ) ( to information ) ) ( ( is ( in ( ( the ( best interests ) ) ( of ( ( ( both GAO ) and ) ( the agencies ) ) ) ) ) ) . ) ) ( It ( ( ( is ( in ( ( everyone 's ) ( best interest ) ) ) ) ( to ( ( have access ) ( to ( information ( in ( a ( timely manner ) ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (NP (JJ Timely) (NN access)) (PP (TO to) (NP (NN information)))) (VP (VBZ is) (PP (IN in) (NP (NP (DT the) (JJS best) (NNS interests)) (PP (IN of) (NP (NP (DT both) (NNP GAO)) (CC and) (NP (DT the) (NNS agencies))))))) (. 
.))) (ROOT (S (NP (PRP It)) (VP (VBZ is) (PP (IN in) (NP (NP (NN everyone) (POS 's)) (JJS best) (NN interest))) (S (VP (TO to) (VP (VB have) (NP (NN access)) (PP (TO to) (NP (NP (NN information)) (PP (IN in) (NP (DT a) (JJ timely) (NN manner))))))))) (. .))) Timely access to information is in the best interests of both GAO and the agencies. It is in everyone's best interest to have access to information in a timely manner. +3 5991 5991 travel ( ( Based ( in ( ( the ( Auvergnat ( spa town ) ) ) ( of Vichy ) ) ) ) ( , ( ( the ( French government ) ) ( often ( ( ( ( proved ( more zealous ) ) ( than ( its masters ) ) ) ( in ( ( ( suppressing ( civil liberties ) ) and ) ( ( drawing up ) ( anti-Jewish legislation ) ) ) ) ) . ) ) ) ) ) ( ( The ( French government ) ) ( ( passed ( ( anti-Jewish laws ) ( aimed ( at ( helping ( the Nazi ) ) ) ) ) ) . ) ) (ROOT (S (PP (VBN Based) (PP (IN in) (NP (NP (DT the) (NNP Auvergnat) (NN spa) (NN town)) (PP (IN of) (NP (NNP Vichy)))))) (, ,) (NP (DT the) (JJ French) (NN government)) (ADVP (RB often)) (VP (VBD proved) (NP (JJR more) (NNS zealous)) (PP (IN than) (NP (PRP$ its) (NNS masters))) (PP (IN in) (S (VP (VP (VBG suppressing) (NP (JJ civil) (NNS liberties))) (CC and) (VP (VBG drawing) (PRT (RP up)) (NP (JJ anti-Jewish) (NN legislation))))))) (. .))) (ROOT (S (NP (DT The) (JJ French) (NN government)) (VP (VBD passed) (NP (NP (JJ anti-Jewish) (NNS laws)) (VP (VBN aimed) (PP (IN at) (S (VP (VBG helping) (NP (DT the) (JJ Nazi)))))))) (. .))) Based in the Auvergnat spa town of Vichy, the French government often proved more zealous than its masters in suppressing civil liberties and drawing up anti-Jewish legislation. The French government passed anti-Jewish laws aimed at helping the Nazi. 
+4 50156 50156 travel ( ( ( ( ( Built ( in 1870 ) ) ( , ( ( ( its canopy ) ( of ( stained ( glass ( and ( cast iron ) ) ) ) ) ) ( is ( ( the oldest ) ( in Dublin ) ) ) ) ) ) ; ) ( ( its ( enthusiastic ( interior decoration ) ) ) ( ( is also ) ( typical ( of ( the era ) ) ) ) ) ) . ) ( It ( ( ( ( was ( constructed ( in 1870 ) ) ) and ) ( has ( ( the ( oldest canopy ) ) ( in Dublin ) ) ) ) . ) ) (ROOT (S (S (S (VP (VBN Built) (PP (IN in) (NP (CD 1870))))) (, ,) (NP (NP (PRP$ its) (NN canopy)) (PP (IN of) (NP (JJ stained) (NN glass) (CC and) (NN cast) (NN iron)))) (VP (VBZ is) (NP (NP (DT the) (JJS oldest)) (PP (IN in) (NP (NNP Dublin)))))) (: ;) (S (NP (PRP$ its) (JJ enthusiastic) (JJ interior) (NN decoration)) (VP (VBZ is) (ADVP (RB also)) (ADJP (JJ typical) (PP (IN of) (NP (DT the) (NN era)))))) (. .))) (ROOT (S (NP (PRP It)) (VP (VP (VBD was) (VP (VBN constructed) (PP (IN in) (NP (CD 1870))))) (CC and) (VP (VBZ has) (NP (NP (DT the) (JJS oldest) (NN canopy)) (PP (IN in) (NP (NNP Dublin)))))) (. .))) Built in 1870, its canopy of stained glass and cast iron is the oldest in Dublin; its enthusiastic interior decoration is also typical of the era. It was constructed in 1870 and has the oldest canopy in Dublin. diff --git a/tests/data_for_tests/io/MNLI/test_mismatched.tsv b/tests/data_for_tests/io/MNLI/test_mismatched.tsv new file mode 100755 index 00000000..798cd395 --- /dev/null +++ b/tests/data_for_tests/io/MNLI/test_mismatched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 +0 16130 16130 facetoface ( ( What ( have ( you decided ) ) ) ( , ( what ( ( ( are you ) ( going ( to do ) ) ) ? ) ) ) ) ( So ( what ( ( 's ( your decision ) ) ? ) ) ) (ROOT (SBARQ (SBAR (WHNP (WP What)) (S (VP (VBP have) (S (NP (PRP you)) (VP (VBD decided)))))) (, ,) (WHNP (WP what)) (SQ (VBP are) (NP (PRP you)) (VP (VBG going) (S (VP (TO to) (VP (VB do)))))) (. 
?))) (ROOT (SBARQ (RB So) (WHNP (WP what)) (SQ (VBZ 's) (NP (PRP$ your) (NN decision))) (. ?))) What have you decided, what are you going to do? So what's your decision? +1 128269 128269 oup ( ( ( Women 's ) clothing ) ( ( is ( characterized ( by ( ( great diversity ) ( in ( ( styles and ) ( short ( production runs ) ) ) ) ) ) ) ) . ) ) ( ( ( Men 's ) clothing ) ( typically ( ( ( has ( the ( ( most stylistic ) diversity ) ) ) ( unlike ( ( the blandness ) ( of ( ( women 's ) fashion ) ) ) ) ) . ) ) ) (ROOT (S (NP (NP (NNP Women) (POS 's)) (NN clothing)) (VP (VBZ is) (VP (VBN characterized) (PP (IN by) (NP (NP (JJ great) (NN diversity)) (PP (IN in) (NP (NP (NNS styles)) (CC and) (NP (JJ short) (NN production) (NNS runs)))))))) (. .))) (ROOT (S (NP (NP (NNP Men) (POS 's)) (NN clothing)) (ADVP (RB typically)) (VP (VBZ has) (NP (DT the) (ADJP (RBS most) (JJ stylistic)) (NN diversity)) (PP (IN unlike) (NP (NP (DT the) (NN blandness)) (PP (IN of) (NP (NP (NNS women) (POS 's)) (NN fashion)))))) (. .))) Women's clothing is characterized by great diversity in styles and short production runs. Men's clothing typically has the most stylistic diversity unlike the blandness of women's fashion. +2 130938 130938 nineeleven ( ( ( ( ( Reports ( from ( ( two ( flight attendants ) ) ( in ( the ( coach cabin ) ) ) ) ) ) , ) ( ( ( Betty Ong ) and ) ( Madeline ( Amy Sweeney ) ) ) ) , ) ( ( ( tell us ) ( ( most ( of what ) ) ( we ( know ( about ( how ( ( the hijacking ) happened ) ) ) ) ) ) ) . ) ) ( ( ( The report ) ( on ( the hijacking ) ) ) ( ( ( was ( ( over ( five hundred ) ) pages ) ) long ) . 
) ) (ROOT (S (NP (NP (NP (NNS Reports)) (PP (IN from) (NP (NP (CD two) (NN flight) (NNS attendants)) (PP (IN in) (NP (DT the) (NN coach) (NN cabin)))))) (, ,) (NP (NP (NNP Betty) (NNP Ong)) (CC and) (NP (NNP Madeline) (NNP Amy) (NNP Sweeney))) (, ,)) (VP (VBP tell) (NP (PRP us)) (SBAR (WHNP (JJS most) (WHPP (IN of) (WHNP (WP what)))) (S (NP (PRP we)) (VP (VBP know) (PP (IN about) (SBAR (WHADVP (WRB how)) (S (NP (DT the) (NN hijacking)) (VP (VBD happened))))))))) (. .))) (ROOT (S (NP (NP (DT The) (NN report)) (PP (IN on) (NP (DT the) (NN hijacking)))) (VP (VBD was) (NP (QP (RB over) (CD five) (CD hundred)) (NNS pages)) (ADVP (RB long))) (. .))) Reports from two flight attendants in the coach cabin, Betty Ong and Madeline Amy Sweeney, tell us most of what we know about how the hijacking happened. The report on the hijacking was over five hundred pages long. +3 40009 40009 nineeleven ( ( At ( about 9:20 ) ) ( , ( ( ( security personnel ) ( at ( FAA headquarters ) ) ) ( ( ( ( set up ) ( a ( hijacking teleconference ) ) ) ( with ( ( ( several agencies ) , ) ( including ( the ( Defense Department ) ) ) ) ) ) . ) ) ) ) ( ( The teleconference ) ( ( lasted ( for ( 13 ( straight hours ) ) ) ) . ) ) (ROOT (S (PP (IN At) (NP (QP (RB about) (CD 9:20)))) (, ,) (NP (NP (NN security) (NNS personnel)) (PP (IN at) (NP (NNP FAA) (NNS headquarters)))) (VP (VBD set) (PRT (RP up)) (NP (DT a) (VBG hijacking) (NN teleconference)) (PP (IN with) (NP (NP (JJ several) (NNS agencies)) (, ,) (PP (VBG including) (NP (DT the) (NNP Defense) (NNP Department)))))) (. .))) (ROOT (S (NP (DT The) (NN teleconference)) (VP (VBD lasted) (PP (IN for) (NP (CD 13) (JJ straight) (NNS hours)))) (. .))) At about 9:20, security personnel at FAA headquarters set up a hijacking teleconference with several agencies, including the Defense Department. The teleconference lasted for 13 straight hours. 
+4 105266 105266 nineeleven ( So ( we ( ( 've ( ( got ( ( a couple ) ( of aircraft ) ) ) ( ( up there ) ( that ( ( have ( those instructions ) ) ( at ( this ( present time ) ) ) ) ) ) ) ) ? ) ) ) ( ( At ( the ( present time ) ) ) ( , ( there ( ( ( ( ( were n't ) ( ( any aircraft ) ( in ( the air ) ) ) ) , ) right ) ? ) ) ) ) (ROOT (S (IN So) (NP (PRP we)) (VP (VBP 've) (VP (VBD got) (NP (NP (DT a) (NN couple)) (PP (IN of) (NP (NN aircraft)))) (ADVP (ADVP (RB up) (RB there)) (SBAR (WHNP (WDT that)) (S (VP (VBP have) (NP (DT those) (NNS instructions)) (PP (IN at) (NP (DT this) (JJ present) (NN time))))))))) (. ?))) (ROOT (S (PP (IN At) (NP (DT the) (JJ present) (NN time))) (, ,) (NP (EX there)) (VP (VBD were) (RB n't) (NP (NP (DT any) (NN aircraft)) (PP (IN in) (NP (DT the) (NN air)))) (, ,) (ADJP (JJ right))) (. ?))) So we've got a couple of aircraft up there that have those instructions at this present time? At the present time, there weren't any aircraft in the air, right? diff --git a/tests/data_for_tests/io/MNLI/train.tsv b/tests/data_for_tests/io/MNLI/train.tsv new file mode 100755 index 00000000..4ceebefd --- /dev/null +++ b/tests/data_for_tests/io/MNLI/train.tsv @@ -0,0 +1,7 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 gold_label +0 31193 31193n government ( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) ) ( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) ) (ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .))) (ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. 
.))) Conceptually cream skimming has two basic dimensions - product and geography. Product and geography are what make cream skimming work. neutral neutral +1 101457 101457e telephone ( you ( ( know ( during ( ( ( the season ) and ) ( i guess ) ) ) ) ( at ( at ( ( your level ) ( uh ( you ( ( ( lose them ) ( to ( the ( next level ) ) ) ) ( if ( ( if ( they ( decide ( to ( recall ( the ( the ( parent team ) ) ) ) ) ) ) ) ( ( the Braves ) ( decide ( to ( call ( to ( ( recall ( a guy ) ) ( from ( ( triple A ) ( ( ( then ( ( a ( double ( A guy ) ) ) ( ( goes up ) ( to ( replace him ) ) ) ) ) and ) ( ( a ( single ( A guy ) ) ) ( ( goes up ) ( to ( replace him ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( You ( ( ( ( lose ( the things ) ) ( to ( the ( following level ) ) ) ) ( if ( ( the people ) recall ) ) ) . ) ) (ROOT (S (NP (PRP you)) (VP (VBP know) (PP (IN during) (NP (NP (DT the) (NN season)) (CC and) (NP (FW i) (FW guess)))) (PP (IN at) (IN at) (NP (NP (PRP$ your) (NN level)) (SBAR (S (INTJ (UH uh)) (NP (PRP you)) (VP (VBP lose) (NP (PRP them)) (PP (TO to) (NP (DT the) (JJ next) (NN level))) (SBAR (IN if) (S (SBAR (IN if) (S (NP (PRP they)) (VP (VBP decide) (S (VP (TO to) (VP (VB recall) (NP (DT the) (DT the) (NN parent) (NN team)))))))) (NP (DT the) (NNPS Braves)) (VP (VBP decide) (S (VP (TO to) (VP (VB call) (S (VP (TO to) (VP (VB recall) (NP (DT a) (NN guy)) (PP (IN from) (NP (NP (RB triple) (DT A)) (SBAR (S (S (ADVP (RB then)) (NP (DT a) (JJ double) (NNP A) (NN guy)) (VP (VBZ goes) (PRT (RP up)) (S (VP (TO to) (VP (VB replace) (NP (PRP him))))))) (CC and) (S (NP (DT a) (JJ single) (NNP A) (NN guy)) (VP (VBZ goes) (PRT (RP up)) (S (VP (TO to) (VP (VB replace) (NP (PRP him)))))))))))))))))))))))))))) (ROOT (S (NP (PRP You)) (VP (VBP lose) (NP (DT the) (NNS things)) (PP (TO to) (NP (DT the) (JJ following) (NN level))) (SBAR (IN if) (S (NP (DT the) (NNS people)) (VP (VBP recall))))) (. 
.))) you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him You lose the things to the following level if the people recall. entailment entailment +2 134793 134793e fiction ( ( One ( of ( our number ) ) ) ( ( will ( ( ( carry out ) ( your instructions ) ) minutely ) ) . ) ) ( ( ( A member ) ( of ( my team ) ) ) ( ( will ( ( execute ( your orders ) ) ( with ( immense precision ) ) ) ) . ) ) (ROOT (S (NP (NP (CD One)) (PP (IN of) (NP (PRP$ our) (NN number)))) (VP (MD will) (VP (VB carry) (PRT (RP out)) (NP (PRP$ your) (NNS instructions)) (ADVP (RB minutely)))) (. .))) (ROOT (S (NP (NP (DT A) (NN member)) (PP (IN of) (NP (PRP$ my) (NN team)))) (VP (MD will) (VP (VB execute) (NP (PRP$ your) (NNS orders)) (PP (IN with) (NP (JJ immense) (NN precision))))) (. .))) One of our number will carry out your instructions minutely. A member of my team will execute your orders with immense precision. entailment entailment +3 37397 37397e fiction ( ( How ( ( ( do you ) know ) ? ) ) ( ( All this ) ( ( ( is ( their information ) ) again ) . ) ) ) ( ( This information ) ( ( belongs ( to them ) ) . ) ) (ROOT (S (SBARQ (WHADVP (WRB How)) (SQ (VBP do) (NP (PRP you)) (VP (VB know))) (. ?)) (NP (PDT All) (DT this)) (VP (VBZ is) (NP (PRP$ their) (NN information)) (ADVP (RB again))) (. .))) (ROOT (S (NP (DT This) (NN information)) (VP (VBZ belongs) (PP (TO to) (NP (PRP them)))) (. .))) How do you know? All this is their information again. This information belongs to them. 
entailment entailment +4 50563 50563n telephone ( yeah ( i ( ( tell you ) ( what ( ( though ( if ( you ( go ( price ( some ( of ( those ( tennis shoes ) ) ) ) ) ) ) ) ) ( i ( can ( see ( why ( now ( you ( know ( they ( 're ( ( getting up ) ( in ( the ( hundred ( dollar range ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( The ( tennis shoes ) ) ( ( have ( ( a range ) ( of prices ) ) ) . ) ) (ROOT (S (VP (VB yeah) (S (NP (FW i)) (VP (VB tell) (NP (PRP you)) (SBAR (WHNP (WP what)) (S (SBAR (RB though) (IN if) (S (NP (PRP you)) (VP (VBP go) (VP (VB price) (NP (NP (DT some)) (PP (IN of) (NP (DT those) (NN tennis) (NNS shoes)))))))) (NP (FW i)) (VP (MD can) (VP (VB see) (SBAR (WHADVP (WRB why)) (S (ADVP (RB now)) (NP (PRP you)) (VP (VBP know) (SBAR (S (NP (PRP they)) (VP (VBP 're) (VP (VBG getting) (PRT (RP up)) (PP (IN in) (NP (DT the) (CD hundred) (NN dollar) (NN range))))))))))))))))))) (ROOT (S (NP (DT The) (NN tennis) (NNS shoes)) (VP (VBP have) (NP (NP (DT a) (NN range)) (PP (IN of) (NP (NNS prices))))) (. .))) yeah i tell you what though if you go price some of those tennis shoes i can see why now you know they're getting up in the hundred dollar range The tennis shoes have a range of prices. neutral neutral +11 11877 11877c travel ( ( Fun ( for ( ( adults and ) children ) ) ) . ) ( ( Fun ( for ( only children ) ) ) . ) (ROOT (S (VP (VB Fun) (PP (IN for) (NP (NNS adults) (CC and) (NNS children)))) (. .))) (ROOT (S (VP (VB Fun) (PP (IN for) (NP (JJ only) (NNS children)))) (. .))) Fun for adults and children. Fun for only children. 
contradiction contradiction diff --git a/tests/data_for_tests/io/MSRA_NER/dev.conll b/tests/data_for_tests/io/MSRA_NER/dev.conll new file mode 100755 index 00000000..792efce8 --- /dev/null +++ b/tests/data_for_tests/io/MSRA_NER/dev.conll @@ -0,0 +1,38 @@ +把 O +欧 B-LOC + +美 B-LOC +、 O + +港 B-LOC +台 B-LOC + +流 O +行 O + +的 O +食 O + +品 O +类 O + +图 O +谱 O + +马 B-PER +列 B-PER + +主 O +义 O + +在 O +中 B-LOC + +国 I-LOC +传 O + +播 O +的 O + +历 O +史 O \ No newline at end of file diff --git a/tests/data_for_tests/io/MSRA_NER/test.conll b/tests/data_for_tests/io/MSRA_NER/test.conll new file mode 100755 index 00000000..d611fcdd --- /dev/null +++ b/tests/data_for_tests/io/MSRA_NER/test.conll @@ -0,0 +1,31 @@ +中 B-ORG +共 I-ORG + +中 I-ORG +央 I-ORG + +致 O +中 B-ORG + +国 I-ORG +致 I-ORG + +公 I-ORG +党 I-ORG + +十 I-ORG +一 I-ORG + +大 I-ORG +的 O + +贺 O +词 O + + +各 O + +位 O +代 O + +表 O diff --git a/tests/data_for_tests/io/MSRA_NER/train.conll b/tests/data_for_tests/io/MSRA_NER/train.conll new file mode 100755 index 00000000..9edd3aef --- /dev/null +++ b/tests/data_for_tests/io/MSRA_NER/train.conll @@ -0,0 +1,60 @@ +是 O +我 O + +们 O +收 O + +藏 O +北 B-LOC + +京 I-LOC +史 O + +料 O + +调 O +查 O + +范 O +围 O + +涉 O +及 O + +故 B-LOC +宫 I-LOC + +、 O +历 B-LOC + +博 I-LOC +、 O + +古 B-ORG +研 I-ORG + +所 I-ORG +、 O + +北 B-LOC +大 I-LOC + +清 I-LOC +华 I-LOC + +图 I-LOC +书 I-LOC + +馆 I-LOC +. O + +夏 B-PER +财 I-PER + +兴 I-PER +家 O + +分 O +到 O + +田 O diff --git a/tests/data_for_tests/io/OntoNotes/dev.txt b/tests/data_for_tests/io/OntoNotes/dev.txt new file mode 100755 index 00000000..e99207a1 --- /dev/null +++ b/tests/data_for_tests/io/OntoNotes/dev.txt @@ -0,0 +1,10 @@ + +bc/msnbc/00/msnbc_0000 0 0 Hi UH (TOP(FRAG(INTJ*) - - - Dan_Abrams * - +bc/msnbc/00/msnbc_0000 0 1 everyone NN (NP*) - - - Dan_Abrams * - +bc/msnbc/00/msnbc_0000 0 2 /. . 
*)) - - - Dan_Abrams * - + +bc/msnbc/00/msnbc_0000 0 0 first RB (TOP(S(ADVP* - - - Dan_Abrams * (ARGM-TMP* * * * - +bc/msnbc/00/msnbc_0000 0 1 up RB * - - - Dan_Abrams * * * * * - +bc/msnbc/00/msnbc_0000 0 2 on IN (PP* - - - Dan_Abrams * * * * * - +bc/msnbc/00/msnbc_0000 0 3 the DT (NP* - - - Dan_Abrams * * * * * - +bc/msnbc/00/msnbc_0000 0 4 docket NN *)) docket - - Dan_Abrams * * * * * - diff --git a/tests/data_for_tests/io/OntoNotes/test.txt b/tests/data_for_tests/io/OntoNotes/test.txt new file mode 100755 index 00000000..c94069e0 --- /dev/null +++ b/tests/data_for_tests/io/OntoNotes/test.txt @@ -0,0 +1,10 @@ + +bc/msnbc/00/msnbc_0007 0 0 Dealing VBG (TOP(VP* deal 01 - speaker_1 * (V*) - +bc/msnbc/00/msnbc_0007 0 1 with IN (PP* - - - speaker_1 * (ARG1* - +bc/msnbc/00/msnbc_0007 0 2 serial JJ (NP(NP* - - - speaker_1 * * (156 +bc/msnbc/00/msnbc_0007 0 3 crimes NNS *) crime - 1 speaker_1 * * 156) +bc/msnbc/00/msnbc_0007 0 4 per FW (ADVP* - - - speaker_1 * * - +bc/msnbc/00/msnbc_0007 0 5 se FW *))) - - - speaker_1 * *) - +bc/msnbc/00/msnbc_0007 0 6 /. . 
*)) - - - speaker_1 * * - + +bc/msnbc/00/msnbc_0007 0 0 We PRP (TOP(S(NP*) - - - speaker_1 * (ARG0*) * (90) diff --git a/tests/data_for_tests/io/OntoNotes/train.txt b/tests/data_for_tests/io/OntoNotes/train.txt new file mode 100755 index 00000000..36f14c73 --- /dev/null +++ b/tests/data_for_tests/io/OntoNotes/train.txt @@ -0,0 +1,50 @@ + +bc/msnbc/00/msnbc_0003 0 0 The DT (TOP(S(NP* - - - Chris_Matthews * * (ARG1* * * * * - +bc/msnbc/00/msnbc_0003 0 1 move NN *) move 02 2 Chris_Matthews * (V*) *) * * * * - +bc/msnbc/00/msnbc_0003 0 2 comes VBZ (VP* come 03 2 Chris_Matthews * * (V*) * * * * - +bc/msnbc/00/msnbc_0003 0 3 a DT (SBAR(NP* - - - Chris_Matthews (DATE* * (ARGM-TMP* * * * * - +bc/msnbc/00/msnbc_0003 0 4 month NN *) month - 2 Chris_Matthews *) * * * * * * - +bc/msnbc/00/msnbc_0003 0 5 before IN * - - - Chris_Matthews * * * * * * * - +bc/msnbc/00/msnbc_0003 0 6 the DT (S(NP* - - - Chris_Matthews * * * * (ARG1* (ARG0* * - +bc/msnbc/00/msnbc_0003 0 7 Senate NNP *) - - - Chris_Matthews (ORG) * * * *) *) * - +bc/msnbc/00/msnbc_0003 0 8 is VBZ (VP* be 03 - Chris_Matthews * * * (V*) * * * - +bc/msnbc/00/msnbc_0003 0 9 scheduled VBN (VP* schedule 01 - Chris_Matthews * * * * (V*) * * - +bc/msnbc/00/msnbc_0003 0 10 to TO (S(VP* - - - Chris_Matthews * * * * (ARG2* * * - +bc/msnbc/00/msnbc_0003 0 11 hold VB (VP* hold 04 8 Chris_Matthews * * * * * (V*) * - +bc/msnbc/00/msnbc_0003 0 12 confirmation NN (NP(NP* - - - Chris_Matthews * * * * * (ARG1* (ARG2*) - +bc/msnbc/00/msnbc_0003 0 13 hearings NNS *) hearing 01 1 Chris_Matthews * * * * * * (V*) - +bc/msnbc/00/msnbc_0003 0 14 on IN (PP* - - - Chris_Matthews * * * * * * (ARG1* - +bc/msnbc/00/msnbc_0003 0 15 President NNP (NP(NP(NP* - - - Chris_Matthews * * * * * * * (194 +bc/msnbc/00/msnbc_0003 0 16 Bush NNP * - - - Chris_Matthews (PERSON) * * * * * * - +bc/msnbc/00/msnbc_0003 0 17 's POS *) - - - Chris_Matthews * * * * * * * 194) +bc/msnbc/00/msnbc_0003 0 18 Supreme NNP (NML* - - - Chris_Matthews (ORG* * * * * * * - 
+bc/msnbc/00/msnbc_0003 0 19 Court NNP *) - - - Chris_Matthews *) * * * * * * - +bc/msnbc/00/msnbc_0003 0 20 nominee NN *) - - - Chris_Matthews * * * * * * * - +bc/msnbc/00/msnbc_0003 0 21 John NNP (NP* - - - Chris_Matthews (PERSON* * * * * * * - +bc/msnbc/00/msnbc_0003 0 22 Roberts NNP *)))))))))))) - - - Chris_Matthews *) * *) * *) *) *) - +bc/msnbc/00/msnbc_0003 0 23 /. . *)) - - - Chris_Matthews * * * * * * * - + +bc/msnbc/00/msnbc_0003 0 0 Senator NNP (TOP(S(NP(NP* - - - Chris_Matthews * (ARG1* * * (162 +bc/msnbc/00/msnbc_0003 0 1 Chris NNP * - - - Chris_Matthews (PERSON* * * * - +bc/msnbc/00/msnbc_0003 0 2 Dodd NNP *) - - - Chris_Matthews *) * * * - +bc/msnbc/00/msnbc_0003 0 3 of IN (PP* - - - Chris_Matthews * * * * - +bc/msnbc/00/msnbc_0003 0 4 Connecticut NNP (NP*))) - - - Chris_Matthews (GPE) *) * * 162) +bc/msnbc/00/msnbc_0003 0 5 was VBD (VP* be 01 1 Chris_Matthews * (V*) * * - +bc/msnbc/00/msnbc_0003 0 6 among IN (PP* - - - Chris_Matthews * (ARG2* * * - +bc/msnbc/00/msnbc_0003 0 7 those DT (NP(NP* - - - Chris_Matthews * * (ARG0* * - +bc/msnbc/00/msnbc_0003 0 8 Democrats NNPS *) - - - Chris_Matthews (NORP) * *) * - +bc/msnbc/00/msnbc_0003 0 9 who WP (SBAR(WHNP*) - - - Chris_Matthews * * (R-ARG0*) * - +bc/msnbc/00/msnbc_0003 0 10 spoke VBD (S(VP* speak 03 5 Chris_Matthews * * (V*) * - +bc/msnbc/00/msnbc_0003 0 11 out RP (PRT*) - - - Chris_Matthews * * * * - +bc/msnbc/00/msnbc_0003 0 12 against IN (PP* - - - Chris_Matthews * * (ARG1* * - +bc/msnbc/00/msnbc_0003 0 13 Bolton NNP (NP(NP* - - - Chris_Matthews (PERSON) * * (ARG1* (31|(130 +bc/msnbc/00/msnbc_0003 0 14 's POS *) - - - Chris_Matthews * * * *) 31) +bc/msnbc/00/msnbc_0003 0 15 appointment NN *)) appointment 01 1 Chris_Matthews * * *) (V*) 130) +bc/msnbc/00/msnbc_0003 0 16 today NN (NP*))))))) today - 2 Chris_Matthews (DATE) *) (ARGM-TMP*) * (121) +bc/msnbc/00/msnbc_0003 0 17 /. . 
*)) - - - Chris_Matthews * * * * - + +bc/msnbc/00/msnbc_0003 0 0 I PRP (TOP(S(NP*) - - - Christopher_Dodd * * (ARG0*) * (162) +bc/msnbc/00/msnbc_0003 0 1 just RB (ADVP*) - - - Christopher_Dodd * * (ARGM-ADV*) * - +bc/msnbc/00/msnbc_0003 0 2 do VBP (VP* do 01 - Christopher_Dodd * (V*) * * - +bc/msnbc/00/msnbc_0003 0 3 n't RB * - - - Christopher_Dodd * * (ARGM-NEG*) * - +bc/msnbc/00/msnbc_0003 0 4 think VB (VP* think 01 1 Christopher_Dodd * * (V*) * - diff --git a/tests/data_for_tests/io/QNLI/dev.tsv b/tests/data_for_tests/io/QNLI/dev.tsv new file mode 100755 index 00000000..ac4ecabe --- /dev/null +++ b/tests/data_for_tests/io/QNLI/dev.tsv @@ -0,0 +1,6 @@ +index question sentence label +0 What came into force after the new constitution was herald? As of that day, the new constitution heralding the Second Republic came into force. entailment +1 What is the first major city in the stream of the Rhine? The most important tributaries in this area are the Ill below of Strasbourg, the Neckar in Mannheim and the Main across from Mainz. not_entailment +2 What is the minimum required if you want to teach in Canada? In most provinces a second Bachelor's Degree such as a Bachelor of Education is required to become a qualified teacher. not_entailment +3 How was Temüjin kept imprisoned by the Tayichi'ud? The Tayichi'ud enslaved Temüjin (reportedly with a cangue, a sort of portable stocks), but with the help of a sympathetic guard, the father of Chilaun (who later became a general of Genghis Khan), he was able to escape from the ger (yurt) in the middle of the night by hiding in a river crevice.[citation needed] entailment +4 What did Herr Gott, dich loben wir become known as ? He paraphrased the Te Deum as "Herr Gott, dich loben wir" with a simplified form of the melody. 
not_entailment diff --git a/tests/data_for_tests/io/QNLI/test.tsv b/tests/data_for_tests/io/QNLI/test.tsv new file mode 100755 index 00000000..55bfbeaa --- /dev/null +++ b/tests/data_for_tests/io/QNLI/test.tsv @@ -0,0 +1,6 @@ +index question sentence +0 What organization is devoted to Jihad against Israel? For some decades prior to the First Palestine Intifada in 1987, the Muslim Brotherhood in Palestine took a "quiescent" stance towards Israel, focusing on preaching, education and social services, and benefiting from Israel's "indulgence" to build up a network of mosques and charitable organizations. +1 In what century was the Yarrow-Schlick-Tweedy balancing system used? In the late 19th century, the Yarrow-Schlick-Tweedy balancing 'system' was used on some marine triple expansion engines. +2 The largest brand of what store in the UK is located in Kingston Park? Close to Newcastle, the largest indoor shopping centre in Europe, the MetroCentre, is located in Gateshead. +3 What does the IPCC rely on for research? In principle, this means that any significant new evidence or events that change our understanding of climate science between this deadline and publication of an IPCC report cannot be included. +4 What is the principle about relating spin and space variables? Thus in the case of two fermions there is a strictly negative correlation between spatial and spin variables, whereas for two bosons (e.g. quanta of electromagnetic waves, photons) the correlation is strictly positive. diff --git a/tests/data_for_tests/io/QNLI/train.tsv b/tests/data_for_tests/io/QNLI/train.tsv new file mode 100755 index 00000000..fc0b966e --- /dev/null +++ b/tests/data_for_tests/io/QNLI/train.tsv @@ -0,0 +1,6 @@ +index question sentence label +0 When did the third Digimon series begin? 
Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese. not_entailment +1 Which missile batteries often have individual launchers several kilometres from one another? When MANPADS is operated by specialists, batteries may have several dozen teams deploying separately in small sections; self-propelled air defence guns may deploy in pairs. not_entailment +2 What two things does Popper argue Tarski's theory involves in an evaluation of truth? He bases this interpretation on the fact that examples such as the one described above refer to two things: assertions and the facts to which they refer. entailment +3 What is the name of the village 9 miles north of Calafat where the Ottoman forces attacked the Russians? On 31 December 1853, the Ottoman forces at Calafat moved against the Russian force at Chetatea or Cetate, a small village nine miles north of Calafat, and engaged them on 6 January 1854. entailment +4 What famous palace is located in London? London contains four World Heritage Sites: the Tower of London; Kew Gardens; the site comprising the Palace of Westminster, Westminster Abbey, and St Margaret's Church; and the historic settlement of Greenwich (in which the Royal Observatory, Greenwich marks the Prime Meridian, 0° longitude, and GMT). not_entailment diff --git a/tests/data_for_tests/io/Quora/dev.tsv b/tests/data_for_tests/io/Quora/dev.tsv new file mode 100755 index 00000000..8182f190 --- /dev/null +++ b/tests/data_for_tests/io/Quora/dev.tsv @@ -0,0 +1,2 @@ +1 How do I get funding for my web based startup idea ? How do I get seed funding pre product ? 327970 +0 Is honey a viable alternative to sugar for diabetics ? How would you compare the United States ' euthanasia laws to Denmark ? 
90348 diff --git a/tests/data_for_tests/io/Quora/test.tsv b/tests/data_for_tests/io/Quora/test.tsv new file mode 100755 index 00000000..9582aa14 --- /dev/null +++ b/tests/data_for_tests/io/Quora/test.tsv @@ -0,0 +1,2 @@ +1 What should I do to avoid sleeping in class ? How do I not sleep in a boring class ? 50018 +0 Do women support each other more than men do ? Do women need more compliments than men ? 126924 diff --git a/tests/data_for_tests/io/Quora/train.tsv b/tests/data_for_tests/io/Quora/train.tsv new file mode 100755 index 00000000..e82940c9 --- /dev/null +++ b/tests/data_for_tests/io/Quora/train.tsv @@ -0,0 +1,2 @@ +1 What is your review of Hidden Figures -LRB- 2016 movie -RRB- ? What are your impressions of Hidden Figures -LRB- 2017 movie -RRB- ? 11877 +0 Currently , all Supreme Court Justices come from very elite law schools , is it similar for the best lawyers in private practice ? What 's your type of jungle -LRB- concrete or nature -RRB- and why ? 221489 diff --git a/tests/data_for_tests/io/R52/dev.csv b/tests/data_for_tests/io/R52/dev.csv new file mode 100755 index 00000000..37eab6ad --- /dev/null +++ b/tests/data_for_tests/io/R52/dev.csv @@ -0,0 +1,6 @@ +trade,canadians urge exemption u trade bill group canadian lawmakers ontario today asked u counterparts exempt canada mandatory trade retaliation provisions major trade bill considered u congress meeting northeast midwest coalition organization u legislators david cooke chairman ontario parliament select committee economic affairs said exemption would help trade relations trade legislation considered full house late april would require president reagan retaliate foreign unfair trade practices unless trade actions would harm u economy currently reagan reject trade sanctions grounds cooke member liberal party told u congressmen understand trade bill think concerns parts world would suggest best concerns canada consider country bill added canada united states largest trading partner two way trade billion 
dlrs according coalition u ran billion dlr deficit manufactured goods year compared billion dlr surplus services trade reuter +earn,american corp nd qtr feb shr profit one cts vs loss three cts net profit vs loss revs mln vs mln six months shr profit six cts vs loss six cts net profit mln vs loss mln revs mln vs mln note six months includes gain four cts change accounting principle reuter +earn,meyers co increases dividend qtly div eight cts vs seven cts prior payable may record april reuter +earn,meyers co year feb shr dlrs vs dlrs net mln dlrs vs mln revs mln vs mln note results reflect year month period company changed fiscal year end february march reuter +earn,kelly oil gas partners year dec shr cts vs cts net mln vs mln revs mln vs mln reuter +money-fx,japan seeks strengthen paris currency accord japan seek strengthen paris accord currency stability meeting group seven leading industrial nations tomorrow japanese officials said however officials japanese finance minister kiichi miyazawa asked identified would provide details wanted accord signed six leading industrial democracies february strengthened currency target zones reference ranges discussed g meeting scheduled tomorrow japanese officials said meeting held conjunction week international monetary fund world bank sessions currency pact need changing language used paris accord officials said miyazawa met u treasury secretary james baker early afternoon discussed dollar yen exchange rates officials said declined disclosed details discussion japanese officials also declined detail miyazawa baker discussed subject greater joint intervention currency markets stabilize dollar independent american intervention officials said money market action stabilize dollar benefit japan suffering sharp appreciation currency also benefit united states well u japan take steps boost domestic demand reduce trade surplus japan explain economic measures g officials said however miyazawa failed outline size japanese economic 
package meeting baker today japanese budget authorized parliament despite new fiscal year started april one officials said japan ruling liberal democratic party revealed economic package today calling billion yen additional spending reuter diff --git a/tests/data_for_tests/io/R52/test.csv b/tests/data_for_tests/io/R52/test.csv new file mode 100755 index 00000000..99497e79 --- /dev/null +++ b/tests/data_for_tests/io/R52/test.csv @@ -0,0 +1,6 @@ +pet-chem,italy eni invest venezuelan projects italy state owned ente nazionale idrocarburi eni invest mln dlrs two joint ventures coal petroleos de venezuela eni president franco said speaking news conference said two projects eventually bring mln dlrs annually foreign exchange venezuela help diversify country export base joint ventures principal instrument allowing resources industrialized countries developing world lead future growth said eni subsidiary join petrochemical subsidiary pdvsa building mln dlr plant produce gasoline additive used increase octane levels mt per year plant jose eastern venezuela fed butane produced pdvsa eastern complex eni owns pct joint venture company super c pct remaining three pct sold private investors production set begin third quarter officials said plant one saudi arabia another eni subsidiary agip sign letter intent caracas tomorrow enter partnership pdvsa mine coal deposits western state said feasibility studies still done project definitive accord slated august added agip atlantic richfield coal arco subsidiary formed consortium pct project whose total cost estimated mln dlrs company said agip invest pct mln dlrs project said reuter +earn,republicbank rpt brazil loans republicbank corp said placed mln dlrs intermediate term loans brazil non accrual basis march said reclassification reduce first quarter earnings mln dlrs taxes mln dlrs taxes brazil change position moratorium interest payments republicbank also said net income first quarter expected mln dlrs cts share fully diluted basis 
year ago first quarter company earned mln dlrs cts share company also said first quarter results expected include provision loan losses mln dlrs mln dlrs net loan charge offs mln dlrs said provision increase loan losses mln dlrs pct loans republicbank total assets billion dlrs announced december agreement interfirst corp form first republicbank corp merger approved regulatory agencies stockholders would create th largest bank holding company united states reuter +acq,amoskeag bank seek rehearing amoskeag bank shares inc portsmouth savings bank said file rehearing new hampshire supreme court march ruling state regulatory approval amoskeag acquisition portsmouth decision believe go well beyond affiliation amoskeag portsmouth savings bank said amoskeag chairman william transaction opposed group portsmouth investors wanted bank remain independent according press reports reuter +strategic-metal,doe recommends special unit uranium energy secretary john herrington told congress federally chartered corporation would best way manage operate government uranium program said letter congressmen unless program run energy department improved sales worth five billion dlrs could lost program annual commercial sales one billion dlrs holds pct free world market services department official said world market uranium power utilities increasingly competitive private entity could better tap administration plan spin department uranium operation line effort reduce federal government role areas feels private enterprise could efficient reuter +earn,declares stock dividend financial corp said declared stock dividend one class share two class shares held payable may shareholders record april reuter +acq,allegheny ag shareholders file suit allegheny international inc agreed merge jointly formed first boston inc affiliate deal worth mn dlrs said shareholders preferred stock filed class action complaint company complaint alleges among things company board agreed pay first boston illegal seven mln 
dlr fee received higher offer company prior buyout suit fee allegheny ability attract offers take actions would benefit holders preferred stock complaint also alleges federal securities laws violations breach fiduciary duty suit requests injunction proceeding pending offer made sunter acquisition acquire allegheny sunter acquisition corp sunter holdings corp formed first boston allegheny allegheny said sunter concerns intend vigorously defend complaint charges complaints filed robert parties believed shares allegheny preferred stock reuter diff --git a/tests/data_for_tests/io/R52/train.csv b/tests/data_for_tests/io/R52/train.csv new file mode 100755 index 00000000..34af13dc --- /dev/null +++ b/tests/data_for_tests/io/R52/train.csv @@ -0,0 +1,6 @@ +earn,convertible securities sets dividend convertible securities fund inc said board declared initial quarterly dividend three cents per share payable april shareholders record april said anticipates paying regular quarterly dividend company made initial public stock offering march five reuter +jobs,n z unemployment rate pct december quarter new zealand unemployment rate pct workforce quarter ended december unchanged revised pct preliminary pct previous quarter slightly pct year earlier quarter statistics department said department citing household labour force survey said statement number unemployed october december september quarter year earlier reuter +rubber,japan rubber stocks fall march japan rubber stocks fell tonnes march february march japan rubber trade association said stocks tonnes february year earlier comparisons march feb march crude rubber synthetic latex reuter +money-fx,south korean fixed month high bank korea said fixed dollar highest level since february set yesterday risen pct dollar far year rising pct reuter +copper,nippon mining lowers copper price nippon mining co ltd said lowered selling price electrolytic copper yen per tonne effective immediately reuter +ship,australian unions launch new south 
wales strikes australian trade unions said launched week long strikes industrial action new south wales nsw protest new laws would reduce injury compensation payments union sources said talks state government broke last night two sides scheduled meet later today attempt find compromise rail freight shipping cargo movements country state first affected union officials said almost every business sector hit unless quick settlement state government recently introduced new workers compensation act would cut cash benefits injured workers third act awaiting parliamentary ratification nsw state premier said workers compensation risen recent years proposed cuts would save hundreds mlns dollars year union officials said industrial action could spread states federal government also plans make sharp cuts workers compensation reuter diff --git a/tests/data_for_tests/io/R8/dev.csv b/tests/data_for_tests/io/R8/dev.csv new file mode 100755 index 00000000..b7271c38 --- /dev/null +++ b/tests/data_for_tests/io/R8/dev.csv @@ -0,0 +1,6 @@ +acq,amoskeag bank seek amoskeag bank shares inc portsmouth savings bank said file new hampshire supreme court march ruling state regulatory approval amoskeag acquisition portsmouth decision believe go well beyond affiliation amoskeag portsmouth savings bank said amoskeag chairman william transaction opposed group portsmouth investors wanted bank remain independent according press reports reuter +earn,declares stock dividend financial corp said declared stock dividend one class share two class shares held payable may shareholders record april reuter +acq,allegheny ag shareholders file suit allegheny international inc agreed merge jointly formed first boston inc affiliate deal worth mn dlrs said shareholders preferred stock filed class action complaint company complaint alleges among things company board agreed pay first boston illegal seven mln dlr fee received higher offer company prior buyout suit fee allegheny ability attract offers take actions 
would benefit holders preferred stock complaint also alleges federal securities laws violations fiduciary duty suit requests injunction proceeding pending offer made sunter acquisition acquire allegheny sunter acquisition corp sunter holdings corp formed first boston allegheny allegheny said sunter concerns intend vigorously defend complaint charges complaints filed robert parties believed shares allegheny preferred stock reuter +trade,canadians urge exemption u trade bill group canadian lawmakers ontario today asked u exempt canada mandatory trade retaliation provisions major trade bill considered u congress meeting northeast midwest coalition organization u legislators david cooke chairman ontario parliament select committee economic affairs said exemption would help trade relations trade legislation considered full house late april would require president reagan retaliate foreign unfair trade practices unless trade actions would harm u economy currently reagan reject trade sanctions grounds cooke member liberal party told u congressmen understand trade bill think concerns parts world would suggest best concerns canada consider country bill added canada united states largest trading partner two way trade billion dlrs according coalition u ran billion dlr deficit manufactured goods year compared billion dlr surplus services trade reuter +earn,american corp nd qtr feb shr profit one cts vs loss three cts net profit vs loss revs mln vs mln six months shr profit six cts vs loss six cts net profit mln vs loss mln revs mln vs mln note six months includes gain four cts change accounting principle reuter +earn,meyers co increases dividend qtly div eight cts vs seven cts prior payable may record april reuter diff --git a/tests/data_for_tests/io/R8/test.csv b/tests/data_for_tests/io/R8/test.csv new file mode 100755 index 00000000..13225334 --- /dev/null +++ b/tests/data_for_tests/io/R8/test.csv @@ -0,0 +1,6 @@ +earn,technology inc nd qtr march shr profit eight cts vs loss 
dlrs net profit vs loss revs mln vs avg shrs vs six mths shr loss nine cts vs loss dlrs net loss vs loss revs mln vs mln avg shrs vs reuter +earn,nacco industries report nd qtr gain nacco industries inc said report gain second quarter mln dlrs dlrs share sale stock subsidiary nacco said north american coal corp unit received notice consolidation coal co unit du pont co dd exercise option buy stock mining co subsidiary north american coal stock north american coal receive mln dlrs mln paid closing april rest company said addition pay dividend north american coal mln dlrs retained earnings closing funds previously used finance mining operations consolidation coal got option group utilities received option nacco nacco reported earnings mln dlrs dlrs share last year second quarter generated mln dlrs net income equal cts share nacco total earnings dlrs share produced mln short tons mln tons produced north american coal nacco said reuter +earn,buffton post investigation charge buffton corp said conduct investigation plant designated site result charge six cts per share second quarter year ago second quarter buffton reported net income cts share dlrs sales mln dlrs study completed nine months determine action may required inc plant former owner split cost buffton said share cost dlrs reuter +acq,american dynamics sell pct stake american dynamics corp meridian reserve inc said signed definitive agreement meridian buy mln shares pct american dynamics common stock terms agreement santa calif based meridian said pay based american dynamics one mln dlrs cash notes five years shares common stock meridian said option issue additional shares common next two years payment certain notes meridian oil gas company whose operations primarily oklahoma said acquisition increase consolidated assets mln dlrs committed gas reserves mln dlrs discounted present value american dynamics engaged gas gathering transmission liquids also oklahoma companies said five plants miles transmission lines 
five oklahoma counties reuter +money-fx,ussr exchange rates soviet state bank effective april roubles per hundred unless stated u stg unch fin unch yen aus aus dlr unch pak unch ind unch unch one unch unch +earn,republicbank rpt brazil loans republicbank corp said placed mln dlrs intermediate term loans brazil non accrual basis march said reclassification reduce first quarter earnings mln dlrs taxes mln dlrs taxes brazil change position moratorium interest payments republicbank also said net income first quarter expected mln dlrs cts share fully diluted basis year ago first quarter company earned mln dlrs cts share company also said first quarter results expected include provision loan losses mln dlrs mln dlrs net loan charge offs mln dlrs said provision increase loan losses mln dlrs pct loans republicbank total assets billion dlrs announced december agreement interfirst corp form first republicbank corp merger approved regulatory agencies stockholders would create th largest bank holding company united states reuter diff --git a/tests/data_for_tests/io/R8/train.csv b/tests/data_for_tests/io/R8/train.csv new file mode 100755 index 00000000..77897bb9 --- /dev/null +++ b/tests/data_for_tests/io/R8/train.csv @@ -0,0 +1,6 @@ +earn,meyers co year feb shr dlrs vs dlrs net mln dlrs vs mln revs mln vs mln note results reflect year month period company changed fiscal year end february march reuter +earn,kelly oil gas partners year dec shr cts vs cts net mln vs mln revs mln vs mln reuter +money-fx,japan seeks strengthen paris currency accord japan seek strengthen paris accord currency stability meeting group seven leading industrial nations tomorrow japanese officials said however officials japanese finance minister kiichi miyazawa asked identified would provide details wanted accord signed six leading industrial democracies february strengthened currency target zones reference ranges discussed g meeting scheduled tomorrow japanese officials said meeting held conjunction 
week international monetary fund world bank sessions currency pact need changing language used paris accord officials said miyazawa met u treasury secretary james baker early afternoon discussed dollar yen exchange rates officials said declined disclosed details discussion japanese officials also declined detail miyazawa baker discussed subject greater joint intervention currency markets stabilize dollar independent american intervention officials said money market action stabilize dollar benefit japan suffering sharp appreciation currency also benefit united states well u japan take steps boost domestic demand reduce trade surplus japan explain economic measures g officials said however miyazawa failed outline size japanese economic package meeting baker today japanese budget authorized parliament despite new fiscal year started april one officials said japan ruling liberal democratic party revealed economic package today calling billion yen additional spending reuter +earn,convertible securities sets dividend convertible securities fund inc said board declared initial quarterly dividend three cents per share payable april shareholders record april said anticipates paying regular quarterly dividend company made initial public stock offering march five reuter +money-fx,south korean fixed month high bank korea said fixed dollar highest level since february set yesterday risen pct dollar far year rising pct reuter +ship,australian unions launch new south wales strikes australian trade unions said launched week long strikes industrial action new south wales nsw protest new laws would reduce injury compensation payments union sources said talks state government broke last night two sides scheduled meet later today attempt find compromise rail freight shipping cargo movements country state first affected union officials said almost every business sector hit unless quick settlement state government recently introduced new workers compensation act would cut cash benefits 
injured workers third act awaiting parliamentary nsw state premier said workers compensation risen recent years proposed cuts would save hundreds dollars year union officials said industrial action could spread states federal government also plans make sharp cuts workers compensation reuter diff --git a/tests/data_for_tests/io/RTE/dev.tsv b/tests/data_for_tests/io/RTE/dev.tsv new file mode 100755 index 00000000..f8f72536 --- /dev/null +++ b/tests/data_for_tests/io/RTE/dev.tsv @@ -0,0 +1,6 @@ +index sentence1 sentence2 label +0 Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Christopher Reeve had an accident. not_entailment +1 Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations. Bacteria is winning the war against antibiotics. entailment +2 Cairo is now home to some 15 million people - a burgeoning population that produces approximately 10,000 tonnes of rubbish per day, putting an enormous strain on public services. In the past 10 years, the government has tried hard to encourage private investment in the refuse sector, but some estimate 4,000 tonnes of waste is left behind every day, festering in the heat as it waits for someone to clear it up. It is often the people in the poorest neighbourhoods that are worst affected. But in some areas they are fighting back. In Shubra, one of the northern districts of the city, the residents have taken to the streets armed with dustpans and brushes to clean up public areas which have been used as public dumps. 15 million tonnes of rubbish are produced daily in Cairo. not_entailment +3 The Amish community in Pennsylvania, which numbers about 55,000, lives an agrarian lifestyle, shunning technological advances like electricity and automobiles. 
And many say their insular lifestyle gives them a sense that they are protected from the violence of American society. But as residents gathered near the school, some wearing traditional garb and arriving in horse-drawn buggies, they said that sense of safety had been shattered. "If someone snaps and wants to do something stupid, there's no distance that's going to stop them," said Jake King, 56, an Amish lantern maker who knew several families whose children had been shot. Pennsylvania has the biggest Amish community in the U.S. not_entailment +4 Security forces were on high alert after an election campaign in which more than 1,000 people, including seven election candidates, have been killed. Security forces were on high alert after a campaign marred by violence. entailment diff --git a/tests/data_for_tests/io/RTE/test.tsv b/tests/data_for_tests/io/RTE/test.tsv new file mode 100755 index 00000000..e52dfac4 --- /dev/null +++ b/tests/data_for_tests/io/RTE/test.tsv @@ -0,0 +1,6 @@ +index sentence1 sentence2 +0 Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case. Shukla is related to Mangla. +1 Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia. Authorities in Brazil hold 200 people as hostage. +2 A mercenary group faithful to the warmongering policy of former Somozist colonel Enrique Bermudez attacked an IFA truck belonging to the interior ministry at 0900 on 26 March in El Jicote, wounded and killed an interior ministry worker and wounded five others. An interior ministry worker was killed by a mercenary group. +3 The British ambassador to Egypt, Derek Plumbly, told Reuters on Monday that authorities had compiled the list of 10 based on lists from tour companies and from families whose relatives have not been in contact since the bombings. Derek Plumbly resides in Egypt. 
+4 Tibone estimated diamond production at four mines operated by Debswana -- Botswana's 50-50 joint venture with De Beers -- could reach 33 million carats this year. Botswana is a business partner of De Beers. diff --git a/tests/data_for_tests/io/RTE/train.tsv b/tests/data_for_tests/io/RTE/train.tsv new file mode 100755 index 00000000..70e5414f --- /dev/null +++ b/tests/data_for_tests/io/RTE/train.tsv @@ -0,0 +1,6 @@ +index sentence1 sentence2 label +0 No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq. not_entailment +1 A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI. Pope Benedict XVI is the new leader of the Roman Catholic Church. entailment +2 Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients. Herceptin can be used to treat breast cancer. entailment +3 Judie Vivian, chief executive at ProMedica, a medical service company that helps sustain the 2-year-old Vietnam Heart Institute in Ho Chi Minh City (formerly Saigon), said that so far about 1,500 children have received treatment. The previous name of Ho Chi Minh City was Saigon. entailment +4 A man is due in court later charged with the murder 26 years ago of a teenager whose case was the first to be featured on BBC One's Crimewatch. Colette Aram, 16, was walking to her boyfriend's house in Keyworth, Nottinghamshire, on 30 October 1983 when she disappeared. Her body was later found in a field close to her home. Paul Stewart Hutchinson, 50, has been charged with murder and is due before Nottingham magistrates later. Paul Stewart Hutchinson is accused of having stabbed a girl. 
not_entailment diff --git a/tests/data_for_tests/io/SNLI/snli_1.0_dev.jsonl b/tests/data_for_tests/io/SNLI/snli_1.0_dev.jsonl new file mode 100755 index 00000000..2d091c73 --- /dev/null +++ b/tests/data_for_tests/io/SNLI/snli_1.0_dev.jsonl @@ -0,0 +1,5 @@ +{"annotator_labels": ["neutral", "entailment", "neutral", "neutral", "neutral"], "captionID": "4705552913.jpg#2", "gold_label": "neutral", "pairID": "4705552913.jpg#2r1n", "sentence1": "Two women are embracing while holding to go packages.", "sentence1_binary_parse": "( ( Two women ) ( ( are ( embracing ( while ( holding ( to ( go packages ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. .)))", "sentence2": "The sisters are hugging goodbye while holding to go packages after just eating lunch.", "sentence2_binary_parse": "( ( The sisters ) ( ( are ( ( hugging goodbye ) ( while ( holding ( to ( ( go packages ) ( after ( just ( eating lunch ) ) ) ) ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP are) (VP (VBG hugging) (NP (UH goodbye)) (PP (IN while) (S (VP (VBG holding) (S (VP (TO to) (VP (VB go) (NP (NNS packages)) (PP (IN after) (S (ADVP (RB just)) (VP (VBG eating) (NP (NN lunch))))))))))))) (. .)))"} +{"annotator_labels": ["entailment", "entailment", "entailment", "entailment", "entailment"], "captionID": "4705552913.jpg#2", "gold_label": "entailment", "pairID": "4705552913.jpg#2r1e", "sentence1": "Two women are embracing while holding to go packages.", "sentence1_binary_parse": "( ( Two women ) ( ( are ( embracing ( while ( holding ( to ( go packages ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. 
.)))", "sentence2": "Two woman are holding packages.", "sentence2_binary_parse": "( ( Two woman ) ( ( are ( holding packages ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are) (VP (VBG holding) (NP (NNS packages)))) (. .)))"} +{"annotator_labels": ["contradiction", "contradiction", "contradiction", "contradiction", "contradiction"], "captionID": "4705552913.jpg#2", "gold_label": "contradiction", "pairID": "4705552913.jpg#2r1c", "sentence1": "Two women are embracing while holding to go packages.", "sentence1_binary_parse": "( ( Two women ) ( ( are ( embracing ( while ( holding ( to ( go packages ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. .)))", "sentence2": "The men are fighting outside a deli.", "sentence2_binary_parse": "( ( The men ) ( ( are ( fighting ( outside ( a deli ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NNS men)) (VP (VBP are) (VP (VBG fighting) (PP (IN outside) (NP (DT a) (NNS deli))))) (. .)))"} +{"annotator_labels": ["entailment", "entailment", "entailment", "entailment", "entailment"], "captionID": "2407214681.jpg#0", "gold_label": "entailment", "pairID": "2407214681.jpg#0r1e", "sentence1": "Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.", "sentence1_binary_parse": "( ( ( Two ( young children ) ) ( in ( ( ( ( ( blue jerseys ) , ) ( one ( with ( the ( number 9 ) ) ) ) ) and ) ( one ( with ( the ( number 2 ) ) ) ) ) ) ) ( ( are ( ( ( standing ( on ( ( wooden steps ) ( in ( a bathroom ) ) ) ) ) and ) ( ( washing ( their hands ) ) ( in ( a sink ) ) ) ) ) . 
) )", "sentence1_parse": "(ROOT (S (NP (NP (CD Two) (JJ young) (NNS children)) (PP (IN in) (NP (NP (JJ blue) (NNS jerseys)) (, ,) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 9)))) (CC and) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 2))))))) (VP (VBP are) (VP (VP (VBG standing) (PP (IN on) (NP (NP (JJ wooden) (NNS steps)) (PP (IN in) (NP (DT a) (NN bathroom)))))) (CC and) (VP (VBG washing) (NP (PRP$ their) (NNS hands)) (PP (IN in) (NP (DT a) (NN sink)))))) (. .)))", "sentence2": "Two kids in numbered jerseys wash their hands.", "sentence2_binary_parse": "( ( ( Two kids ) ( in ( numbered jerseys ) ) ) ( ( wash ( their hands ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN in) (NP (JJ numbered) (NNS jerseys)))) (VP (VBP wash) (NP (PRP$ their) (NNS hands))) (. .)))"} +{"annotator_labels": ["neutral", "neutral", "neutral", "entailment", "entailment"], "captionID": "2407214681.jpg#0", "gold_label": "neutral", "pairID": "2407214681.jpg#0r1n", "sentence1": "Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.", "sentence1_binary_parse": "( ( ( Two ( young children ) ) ( in ( ( ( ( ( blue jerseys ) , ) ( one ( with ( the ( number 9 ) ) ) ) ) and ) ( one ( with ( the ( number 2 ) ) ) ) ) ) ) ( ( are ( ( ( standing ( on ( ( wooden steps ) ( in ( a bathroom ) ) ) ) ) and ) ( ( washing ( their hands ) ) ( in ( a sink ) ) ) ) ) . 
) )", "sentence1_parse": "(ROOT (S (NP (NP (CD Two) (JJ young) (NNS children)) (PP (IN in) (NP (NP (JJ blue) (NNS jerseys)) (, ,) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 9)))) (CC and) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 2))))))) (VP (VBP are) (VP (VP (VBG standing) (PP (IN on) (NP (NP (JJ wooden) (NNS steps)) (PP (IN in) (NP (DT a) (NN bathroom)))))) (CC and) (VP (VBG washing) (NP (PRP$ their) (NNS hands)) (PP (IN in) (NP (DT a) (NN sink)))))) (. .)))", "sentence2": "Two kids at a ballgame wash their hands.", "sentence2_binary_parse": "( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( wash ( their hands ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN at) (NP (DT a) (NN ballgame)))) (VP (VBP wash) (NP (PRP$ their) (NNS hands))) (. .)))"} diff --git a/tests/data_for_tests/io/SNLI/snli_1.0_test.jsonl b/tests/data_for_tests/io/SNLI/snli_1.0_test.jsonl new file mode 100755 index 00000000..49d40720 --- /dev/null +++ b/tests/data_for_tests/io/SNLI/snli_1.0_test.jsonl @@ -0,0 +1,5 @@ +{"annotator_labels": ["neutral", "contradiction", "contradiction", "neutral", "neutral"], "captionID": "2677109430.jpg#1", "gold_label": "neutral", "pairID": "2677109430.jpg#1r1n", "sentence1": "This church choir sings to the masses as they sing joyous songs from the book at a church.", "sentence1_binary_parse": "( ( This ( church choir ) ) ( ( ( sings ( to ( the masses ) ) ) ( as ( they ( ( sing ( joyous songs ) ) ( from ( ( the book ) ( at ( a church ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT This) (NN church) (NN choir)) (VP (VBZ sings) (PP (TO to) (NP (DT the) (NNS masses))) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP sing) (NP (JJ joyous) (NNS songs)) (PP (IN from) (NP (NP (DT the) (NN book)) (PP (IN at) (NP (DT a) (NN church))))))))) (. .)))", "sentence2": "The church has cracks in the ceiling.", "sentence2_binary_parse": "( ( The church ) ( ( has ( cracks ( in ( the ceiling ) ) ) ) . 
) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN church)) (VP (VBZ has) (NP (NP (NNS cracks)) (PP (IN in) (NP (DT the) (NN ceiling))))) (. .)))"} +{"annotator_labels": ["entailment", "entailment", "entailment", "neutral", "entailment"], "captionID": "2677109430.jpg#1", "gold_label": "entailment", "pairID": "2677109430.jpg#1r1e", "sentence1": "This church choir sings to the masses as they sing joyous songs from the book at a church.", "sentence1_binary_parse": "( ( This ( church choir ) ) ( ( ( sings ( to ( the masses ) ) ) ( as ( they ( ( sing ( joyous songs ) ) ( from ( ( the book ) ( at ( a church ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT This) (NN church) (NN choir)) (VP (VBZ sings) (PP (TO to) (NP (DT the) (NNS masses))) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP sing) (NP (JJ joyous) (NNS songs)) (PP (IN from) (NP (NP (DT the) (NN book)) (PP (IN at) (NP (DT a) (NN church))))))))) (. .)))", "sentence2": "The church is filled with song.", "sentence2_binary_parse": "( ( The church ) ( ( is ( filled ( with song ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN church)) (VP (VBZ is) (VP (VBN filled) (PP (IN with) (NP (NN song))))) (. .)))"} +{"annotator_labels": ["contradiction", "contradiction", "contradiction", "contradiction", "contradiction"], "captionID": "2677109430.jpg#1", "gold_label": "contradiction", "pairID": "2677109430.jpg#1r1c", "sentence1": "This church choir sings to the masses as they sing joyous songs from the book at a church.", "sentence1_binary_parse": "( ( This ( church choir ) ) ( ( ( sings ( to ( the masses ) ) ) ( as ( they ( ( sing ( joyous songs ) ) ( from ( ( the book ) ( at ( a church ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT This) (NN church) (NN choir)) (VP (VBZ sings) (PP (TO to) (NP (DT the) (NNS masses))) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP sing) (NP (JJ joyous) (NNS songs)) (PP (IN from) (NP (NP (DT the) (NN book)) (PP (IN at) (NP (DT a) (NN church))))))))) (. 
.)))", "sentence2": "A choir singing at a baseball game.", "sentence2_binary_parse": "( ( ( A choir ) ( singing ( at ( a ( baseball game ) ) ) ) ) . )", "sentence2_parse": "(ROOT (NP (NP (DT A) (NN choir)) (VP (VBG singing) (PP (IN at) (NP (DT a) (NN baseball) (NN game)))) (. .)))"} +{"annotator_labels": ["neutral", "neutral", "neutral", "neutral", "neutral"], "captionID": "6160193920.jpg#4", "gold_label": "neutral", "pairID": "6160193920.jpg#4r1n", "sentence1": "A woman with a green headscarf, blue shirt and a very big grin.", "sentence1_binary_parse": "( ( ( A woman ) ( with ( ( ( ( ( a ( green headscarf ) ) , ) ( blue shirt ) ) and ) ( a ( ( very big ) grin ) ) ) ) ) . )", "sentence1_parse": "(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with) (NP (NP (DT a) (JJ green) (NN headscarf)) (, ,) (NP (JJ blue) (NN shirt)) (CC and) (NP (DT a) (ADJP (RB very) (JJ big)) (NN grin)))) (. .)))", "sentence2": "The woman is young.", "sentence2_binary_parse": "( ( The woman ) ( ( is young ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is) (ADJP (JJ young))) (. .)))"} +{"annotator_labels": ["entailment", "entailment", "contradiction", "entailment", "neutral"], "captionID": "6160193920.jpg#4", "gold_label": "entailment", "pairID": "6160193920.jpg#4r1e", "sentence1": "A woman with a green headscarf, blue shirt and a very big grin.", "sentence1_binary_parse": "( ( ( A woman ) ( with ( ( ( ( ( a ( green headscarf ) ) , ) ( blue shirt ) ) and ) ( a ( ( very big ) grin ) ) ) ) ) . )", "sentence1_parse": "(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with) (NP (NP (DT a) (JJ green) (NN headscarf)) (, ,) (NP (JJ blue) (NN shirt)) (CC and) (NP (DT a) (ADJP (RB very) (JJ big)) (NN grin)))) (. .)))", "sentence2": "The woman is very happy.", "sentence2_binary_parse": "( ( The woman ) ( ( is ( very happy ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is) (ADJP (RB very) (JJ happy))) (. 
.)))"} diff --git a/tests/data_for_tests/io/SNLI/snli_1.0_train.jsonl b/tests/data_for_tests/io/SNLI/snli_1.0_train.jsonl new file mode 100755 index 00000000..8be03c11 --- /dev/null +++ b/tests/data_for_tests/io/SNLI/snli_1.0_train.jsonl @@ -0,0 +1,5 @@ +{"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"} +{"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. 
.)))"} +{"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"} +{"annotator_labels": ["neutral"], "captionID": "2267923837.jpg#2", "gold_label": "neutral", "pairID": "2267923837.jpg#2r1n", "sentence1": "Children smiling and waving at camera", "sentence1_binary_parse": "( Children ( ( ( smiling and ) waving ) ( at camera ) ) )", "sentence1_parse": "(ROOT (NP (S (NP (NNP Children)) (VP (VBG smiling) (CC and) (VBG waving) (PP (IN at) (NP (NN camera)))))))", "sentence2": "They are smiling at their parents", "sentence2_binary_parse": "( They ( are ( smiling ( at ( their parents ) ) ) ) )", "sentence2_parse": "(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VBG smiling) (PP (IN at) (NP (PRP$ their) (NNS parents)))))))"} +{"annotator_labels": ["entailment"], "captionID": "2267923837.jpg#2", "gold_label": "entailment", "pairID": "2267923837.jpg#2r1e", "sentence1": "Children smiling and waving at camera", "sentence1_binary_parse": "( Children ( ( ( smiling and ) waving ) ( at camera ) ) )", "sentence1_parse": "(ROOT (NP (S (NP (NNP Children)) (VP (VBG smiling) (CC and) (VBG waving) (PP (IN at) (NP (NN camera)))))))", "sentence2": "There are children present", "sentence2_binary_parse": "( There ( ( are children ) present ) )", "sentence2_parse": 
"(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NNS children)) (ADVP (RB present)))))"} diff --git a/tests/data_for_tests/io/SST-2/dev.tsv b/tests/data_for_tests/io/SST-2/dev.tsv new file mode 100755 index 00000000..3fec0fa6 --- /dev/null +++ b/tests/data_for_tests/io/SST-2/dev.tsv @@ -0,0 +1,6 @@ +sentence label +it 's a charming and often affecting journey . 1 +unflinchingly bleak and desperate 0 +allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 1 +the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 1 +it 's slow -- very , very slow . 0 diff --git a/tests/data_for_tests/io/SST-2/test.tsv b/tests/data_for_tests/io/SST-2/test.tsv new file mode 100755 index 00000000..6ad46368 --- /dev/null +++ b/tests/data_for_tests/io/SST-2/test.tsv @@ -0,0 +1,6 @@ +index sentence +0 uneasy mishmash of styles and genres . +1 this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation . +2 by the end of no such thing the audience , like beatrice , has a watchful affection for the monster . +3 director rob marshall went out gunning to make a great one . +4 lathan and diggs have considerable personal charm , and their screen rapport makes the old story seem new . 
diff --git a/tests/data_for_tests/io/SST-2/train.tsv b/tests/data_for_tests/io/SST-2/train.tsv new file mode 100755 index 00000000..4d7ea56c --- /dev/null +++ b/tests/data_for_tests/io/SST-2/train.tsv @@ -0,0 +1,6 @@ +sentence label +hide new secretions from the parental units 0 +contains no wit , only labored gags 0 +that loves its characters and communicates something rather beautiful about human nature 1 +remains utterly satisfied to remain the same throughout 0 +on the worst revenge-of-the-nerds clichés the filmmakers could dredge up 0 diff --git a/tests/data_for_tests/io/SST/dev.txt b/tests/data_for_tests/io/SST/dev.txt new file mode 100755 index 00000000..46fca6bf --- /dev/null +++ b/tests/data_for_tests/io/SST/dev.txt @@ -0,0 +1,6 @@ +(3 (2 It) (4 (4 (2 's) (4 (3 (2 a) (4 (3 lovely) (2 film))) (3 (2 with) (4 (3 (3 lovely) (2 performances)) (2 (2 by) (2 (2 (2 Buy) (2 and)) (2 Accorsi))))))) (2 .))) +(2 (2 (1 No) (2 one)) (1 (1 (2 goes) (2 (1 (2 (2 unindicted) (2 here)) (2 ,)) (2 (2 which) (3 (2 (2 is) (2 probably)) (3 (2 for) (4 (2 the) (4 best))))))) (2 .))) +(3 (2 And) (4 (3 (2 if) (1 (2 you) (1 (2 (2 (2 're) (1 not)) (2 nearly)) (4 (3 (3 moved) (2 (2 to) (1 tears))) (2 (2 by) (2 (2 (2 a) (2 couple)) (2 (2 of) (2 scenes)))))))) (2 (2 ,) (2 (2 you) (2 (2 (2 've) (1 (2 got) (2 (3 (2 ice) (2 water)) (2 (2 in) (2 (2 your) (2 veins)))))) (2 .)))))) +(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .)))) +(4 (3 (2 Uses) (3 (3 (4 (3 sharp) (4 (3 (4 humor) (2 and)) (2 insight))) (2 (2 into) (3 (2 human) (2 nature)))) (2 (2 to) (2 (2 examine) (2 (2 class) (1 conflict)))))) (2 (2 ,) (2 (2 (2 adolescent) (2 (2 (2 yearning) (2 ,)) (3 (2 (2 the) (2 roots)) (3 (2 of) (2 (2 friendship) (2 (2 and) (2 (2 sexual) (2 identity)))))))) (2 .)))) +(2 (2 (2 Half) (1 (2 (2 (2 (2 (2 Submarine) (2 flick)) (2 ,)) (2 (2 Half) (2 (2 Ghost) (2 Story)))) (2 ,)) (2 (2 All) (2 (2 in) (2 (2 one) (2 criminally)))))) (1 (1 neglected) (2 film))) diff --git 
a/tests/data_for_tests/io/SST/test.txt b/tests/data_for_tests/io/SST/test.txt new file mode 100755 index 00000000..ebf325d8 --- /dev/null +++ b/tests/data_for_tests/io/SST/test.txt @@ -0,0 +1,6 @@ +(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid) (2 biopic))) +(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) (2 (2 like) (3 (2 to) (3 (3 (2 go) (2 (2 to) (2 (2 the) (2 movies)))) (3 (2 to) (3 (2 have) (4 fun))))))))) (2 (2 ,) (2 (2 Wasabi) (3 (3 (2 is) (2 (2 a) (2 (3 good) (2 (2 place) (2 (2 to) (2 start)))))) (2 .))))) +(4 (4 (4 (3 (2 Emerges) (3 (2 as) (3 (2 something) (3 rare)))) (2 ,)) (4 (2 (2 an) (2 (2 issue) (2 movie))) (3 (2 that) (3 (3 (2 's) (4 (3 (3 (2 so) (4 honest)) (2 and)) (3 (2 keenly) (2 observed)))) (2 (2 that) (2 (2 it) (2 (1 (2 does) (2 n't)) (2 (2 feel) (2 (2 like) (2 one)))))))))) (2 .)) +(2 (2 (2 The) (2 film)) (3 (3 (3 (3 provides) (2 (2 some) (3 (4 great) (2 insight)))) (3 (2 into) (3 (2 (2 the) (2 (2 neurotic) (2 mindset))) (3 (2 of) (2 (2 (2 (2 (2 all) (2 comics)) (2 --)) (2 even)) (3 (2 those) (4 (2 who) (4 (2 have) (4 (2 reached) (4 (4 (2 the) (3 (2 absolute) (2 top))) (2 (2 of) (2 (2 the) (2 game))))))))))))) (2 .))) +(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare) (2 combination))) (2 (2 of) (3 (3 (3 entertainment) (2 and)) (2 education))))) (2 .)) +(3 (2 Perhaps) (4 (2 (1 (1 no) (2 picture)) (2 (2 ever) (2 made))) (3 (3 (2 (2 has) (2 (2 more) (3 literally))) (3 (2 showed) (2 (2 that) (2 (1 (2 (2 the) (1 road)) (1 (2 to) (0 hell))) (3 (2 is) (3 (2 paved) (3 (2 with) (3 (3 good) (2 intentions))))))))) (2 .)))) diff --git a/tests/data_for_tests/io/SST/train.txt b/tests/data_for_tests/io/SST/train.txt new file mode 100755 index 00000000..d5296ab0 --- /dev/null +++ b/tests/data_for_tests/io/SST/train.txt @@ -0,0 +1,6 @@ +(3 (2 (2 The) (2 Rock)) (4 (3 (2 is) (4 (2 destined) (2 (2 (2 (2 (2 to) (2 (2 be) (2 (2 the) (2 (2 21st) (2 (2 (2 Century) (2 's)) (2 (3 new) (2 (2 ``) (2 Conan)))))))) (2 '')) (2 and)) (3 (2 that) (3 (2 he) (3 (2 's) (3 (2 going) (3 (2 
to) (4 (3 (2 make) (3 (3 (2 a) (3 splash)) (2 (2 even) (3 greater)))) (2 (2 than) (2 (2 (2 (2 (1 (2 Arnold) (2 Schwarzenegger)) (2 ,)) (2 (2 Jean-Claud) (2 (2 Van) (2 Damme)))) (2 or)) (2 (2 Steven) (2 Segal))))))))))))) (2 .))) +(4 (4 (4 (2 The) (4 (3 gorgeously) (3 (2 elaborate) (2 continuation)))) (2 (2 (2 of) (2 ``)) (2 (2 The) (2 (2 (2 Lord) (2 (2 of) (2 (2 the) (2 Rings)))) (2 (2 '') (2 trilogy)))))) (2 (3 (2 (2 is) (2 (2 so) (2 huge))) (2 (2 that) (3 (2 (2 (2 a) (2 column)) (2 (2 of) (2 words))) (2 (2 (2 (2 can) (1 not)) (3 adequately)) (2 (2 describe) (2 (3 (2 (2 co-writer\/director) (2 (2 Peter) (3 (2 Jackson) (2 's)))) (3 (2 expanded) (2 vision))) (2 (2 of) (2 (2 (2 J.R.R.) (2 (2 Tolkien) (2 's))) (2 Middle-earth))))))))) (2 .))) +(3 (3 (2 (2 (2 (2 (2 Singer\/composer) (2 (2 Bryan) (2 Adams))) (2 (2 contributes) (2 (2 (2 a) (2 slew)) (2 (2 of) (2 songs))))) (2 (2 --) (2 (2 (2 (2 a) (2 (2 few) (3 potential))) (2 (2 (2 hits) (2 ,)) (2 (2 (2 a) (2 few)) (1 (1 (2 more) (1 (2 simply) (2 intrusive))) (2 (2 to) (2 (2 the) (2 story))))))) (2 --)))) (2 but)) (3 (4 (2 the) (3 (2 whole) (2 package))) (2 (3 certainly) (3 (2 captures) (2 (1 (2 the) (2 (2 (2 intended) (2 (2 ,) (2 (2 er) (2 ,)))) (3 spirit))) (2 (2 of) (2 (2 the) (2 piece)))))))) (2 .)) +(2 (2 (2 You) (2 (2 'd) (2 (2 think) (2 (2 by) (2 now))))) (2 (2 America) (2 (2 (2 would) (1 (2 have) (2 (2 (2 had) (1 (2 enough) (2 (2 of) (2 (2 plucky) (2 (2 British) (1 eccentrics)))))) (4 (2 with) (4 (3 hearts) (3 (2 of) (3 gold))))))) (2 .)))) +(3 (2 ``) (3 (2 Frailty) (4 (2 '') (3 (4 (3 (2 has) (3 (2 been) (3 (4 (3 (3 written) (3 (2 so) (3 well))) (2 ,)) (2 (2 (2 that) (2 even)) (1 (2 (2 a) (2 simple)) (1 (2 ``) (0 Goddammit))))))) (2 !)) (2 ''))))) +(4 (2 (2 Whether) (2 (2 (2 (2 or) (1 not)) (3 (2 you) (2 (2 're) (3 (3 enlightened) (2 (2 by) (2 (2 any) (2 (2 of) (2 (2 Derrida) (2 's))))))))) (2 (2 lectures) (2 (2 on) (2 (2 ``) (2 (2 (2 (2 (2 (2 the) (2 other)) (2 '')) (2 and)) (2 ``)) (2 (2 the) (2 self)))))))) 
(3 (2 ,) (3 (2 '') (3 (2 Derrida) (3 (3 (2 is) (4 (2 an) (4 (4 (2 undeniably) (3 (4 (3 fascinating) (2 and)) (4 playful))) (2 fellow)))) (2 .)))))) diff --git a/tests/data_for_tests/io/THUCNews/dev.txt b/tests/data_for_tests/io/THUCNews/dev.txt new file mode 100755 index 00000000..e40ee4a0 --- /dev/null +++ b/tests/data_for_tests/io/THUCNews/dev.txt @@ -0,0 +1,9 @@ +体育 调查-您如何评价热火客场胜绿军总分3-1夺赛点?新浪体育讯四年了,终于赢球了,热火在凯尔特人的主场经过加时98-90艰难战胜对手,总比分3-1领先,詹姆斯拿下35分14个篮板,韦德28分9篮板,波什20分12个篮板。您如何评价这场比赛? +娱乐 盘点好莱坞明星新年目标 布兰妮迪亚兹在列(图)新年伊始,又是制定新一年目标的时候了。大到关注环保、寻找真爱,小到改掉坏毛病、改变生活习惯,这些都是美国演艺明星在2009年中的目标。●告别烟圈好莱坞女星卡梅隆·迪亚兹计划在新的一年戒烟,和她目标相同者还有《实习医生格蕾》中的凯瑟琳·海格尔及《飞跃贝弗利》中的布莱恩·奥斯汀·格林。格林说:“每年我似乎都说要戒烟,看看今年行不行吧。”●不咬指甲女歌手布兰妮( 听歌)希望自己“改掉咬手指甲的毛病”。此外,她还表示:“我希望自己不再焦虑,以前的我无时无刻不在焦虑中,我要学会让自己幸福。”●寻觅真爱凭借《灵魂歌王》一片夺得2005年奥斯卡()奖的杰米·福克斯希望自己能在2009年找到真爱。●回归平静去年刚刚与男友分手的影星安妮·海瑟薇则希望过上平静的生活。●享受滑雪因出演《灵异第六感》而一举成名的影星黑利·乔尔·奥斯门特的最大愿望就是重拾自己滑雪的爱好,并从美国犹他州的某座高山上直冲而下。●致力环保曾主演《异形》和《冰风暴》等片的女演员西戈尼·威弗表示要为环保事业贡献力量。她说:“我不再使用塑料袋,手头现有的这些我也要循环使用。”●亲近素食《绝望主妇》中的伊娃·朗格利亚的目标是努力尝试吃素。●活络筋骨热门电视剧《汉娜·蒙塔娜》的主角麦莉·赛勒斯关心的问题则是“多做运动”。●回馈世界要说计划最为抽象的当数帕丽斯·希尔顿,她说:“我已经长大了,成熟了,我要怀着一颗感恩的心,开始回馈世界。”●计划“计划”1983年出演《战争游戏》的马修·布罗德里克的新年计划最别具一格,他的计划就是在2009年“拟订计划”。○据新华社 +家居 蓝景丽家尹勃乐居思路清晰 创新开拓(图)     新浪家居谢娟讯  10月16日,易居中国与新浪合资公司中国房产信息集团(简称CRIC)在美国纳斯达克成功上市。此消息一出,家居业界大腕在分享喜悦的同时,纷纷来电来函,向中国房产信息集团成功登陆纳斯达克表示祝贺,同时对CRIC在未来发展提出了中肯的建议和期待。新浪家居电话连线业内数位大腕,倾听他们对此事的看法,以及对中国房产信息集团上市寄语。【CRIC(中国房产信息集团)纳斯达克挂牌上市】       采访嘉宾:蓝景丽家总经理 尹勃         新浪家居:您好,尹总,我是新浪乐居家居频道编辑谢娟,感谢您接受本次访谈。   尹勃:您好。       新浪家居:北京时间2009年10月16日,易居中国与新浪合资公司中国房产信息集团在美国纳斯达克成功上市融资2亿美元。您是否知道此事?您对此有怎样的看法?       尹勃:刚刚知道!对家居很好的促进作用,希望能够加大北京市场支持力度,给予北京市场更高的重视。   新浪家居:感谢您的肯定。同时也希望您能给予建设性的意见。       尹勃:在罗总的带领下做的比较有声势,目前的思路更清晰。希望乐居做到较其他媒体更有高度,活动更有所创新。   新浪家居:您有怎样的祝语?             尹勃:祝新浪乐居越办越好,带动北京家居市场更上一层楼!      
【嘉宾简介】       尹勃:(蓝景丽家总经理 北京市建筑装饰协会家装委员会副会长 北京市场协会家居分会副会长 北京家具协会常务理事 中国建材市场协会理事会副理事长)家居流通卖场一路走来,从昔日倒爷式的地摊、棚户到今天品牌型的综合、主题式购物广场,经历了多少时代的洗礼。尹勃作为这个行业中翘楚企业的负责人,见证了整个家具行业的变迁。名字后面这一连串的职务介绍足以说明他在整个行业中举足轻重的影响力,也更加肯定了他对“蓝景丽家”这个行业航母的巨大贡献。      【推荐阅读】        蓝景丽家十一精彩促销撼京城       百城万店无假货蓝景丽家启动       乐居装修日首战告捷 蓝景丽家销售额逆势暴涨       【媒体声音】      中国证券报:新浪易居合资公司CRIC登陆纳市       上证报:新浪易居合资公司CRIC逆市登陆纳市       第一财经日报:CRIC上市首日市值20亿美元       新华网:新浪与易居合资公司CRIC登陆纳斯达克       专访丁祖昱:CRIC在做前人没有做过的事情       专访罗军:CRIC具有巨大的商业潜力       专访曹国伟:在某些垂直领域会做更多尝试 【更多】     上市背景资料:      美国东部时间10月16日(北京时间10月16日)消息,易居中国与新浪合资公司中国房产信息集团(以下简称CRIC)在美国纳斯达克挂牌上市,首日开盘价12.28美元,超出发行价0.28美元。CRIC为易居中国与新浪的合资公司,股票代码为CRIC,发行价12美元,共发行美国存托股票(ADS)1800万股,同时承销商有权在未来30天内,行使总额达到270万股的超额配售权,此次IPO共计募集资金约2.16亿美元。作为中国在美国的地产科技第一股,CRIC是中国最大的专业房地产信息服务公司,并且拥有同时覆盖线上线下的房地产综合信息和服务平台。CRIC的成功上市,也创造了两家在美国上市的中国公司,分拆各自极具成长力的业务后进行合并,并进行二次上市的先河。CRIC联席董事长、CEO周忻表示;“我们很高兴看到CRIC成功上市,此次IPO将确立CRIC作为中国房地产信息服务第一品牌的地位,并有利于CRIC继续推进国内最大和最先进的房地产信息系统建设,使CRIC成为同时覆盖线上和线下的强大中国房地产网络信息服务平台,为房地产开发商、供应商、专业机构以及个人用户提供多元化房地产信息服务。CRIC联席董事长、新浪CEO曹国伟表示:“CRIC的成功上市,是易居中国和新浪合作的重要一步,也是我们在垂直领域商业模式探索的有益尝试,我们很高兴有机会发挥双方的协同效应。而进一步拓展和深化互联网垂直领域的商机,建立公司在细分市场的核心竞争力并做大做强,这也是新浪未来长远战略的重要组成部分。     +房产 弘阳大厦骏馆开盘 首日热销1亿昨天,位于南京大桥北路69号的红太阳销售中心人头攒动,当天开盘的弘阳大厦·骏馆取得了开门红,由于产品品质高端、户型精致总价又低,吸引了一拨又一拨看房者,当天销售额突破了一个亿。弘阳大厦·骏馆位于南京市浦口区大桥北路西侧,紧邻已建成的旭日华庭金棕榈园区旁,用地总面积6万多平米,包括一个包含酒店公寓、商业及办公的综合楼,一个酒店式公寓以及8万平方米的居住建筑和15000平方米的商业。弘阳大厦作为这块地块中的综合楼,主楼高99.65米,共28层,是集办公、商业、餐饮、公寓为一体的泛配套复合多功能商住楼。此次推出的弘阳大厦·骏馆,是弘阳大厦其中5-22层的酒店式公寓,主力户型为41-75平米商住先锋小户型。由于项目地处桥北新城的核心位置,离市区仅一桥之隔,规划中的地铁与过江隧道近在咫尺,兼具成熟配套资源优势。公共交通也非常方便,131、132、鼓珍、鼓扬、汉江、中六、汉六等多条公交线路可以直达该项目。除了地处桥北核心地段,具备传统的生活多方面配套以外,弘阳大厦·骏馆还拥有同属弘阳集团旗下的华东MALL完美商业配套。 我要评论 +教育 名师解析标准读音在四级考试中的重要性对于中国学生而言,都知道口语和听力很重要,但就是怎么也不好过关,究其原因就是他们英语发音不标准。一、口语。一口标准而流利的口语可以立即提升你的形象,给人以很好的第一印象。举例1:汤姆汉克斯主演的电影《幸福终点站》中有一个情节,大家应该很熟悉:他将a man of mystery“一个神秘的人”读成了a man of misery“一个痛苦的人”,意思相差了十万八千里,自然造成理解障碍。举例2:中文中v和w没有任何区别,说“我wo”的时候,如果上齿咬着下唇的话,也无所谓,因为不会产生任何歧义。但是英文中不一样,这两个音区别很大。vine表示“葡萄藤”;而wine则表示“葡萄酒”。green 
wine表示“新酒”;而green vine则表示“绿色的葡萄藤”。读错了音意思差别可就大了去了。举例3:一位外国人在中国马路上迷了路,见到一位姑娘,立即冲上前去,说道:“我想吻(问)你...”吓得姑娘连忙跑掉,就是因为读音的问题,外国人在中国也会遭遇理解障碍。二、听力。听力在四级考试中占35%的份额,如果听力不如意的话,考试想要及格真的是很难。听力过程中学生可能会有以下几种体会:1. 根本听不清楚读音——因为不熟悉英文的读音规则;2. 听清了读音,但对应不出是哪个单词——词汇量不够,没有好好记单词;3. 听清了读音,也知道是哪个单词,但忘了啥意思了——还是词汇量不够,对于单词不熟悉;4. 对于spot dictation题型而言,听清了,知道是哪个单词,但就是—写就出现拼写错误——还是词汇没记好。第一,注意单词的读音,英式的和美式的。如:It's very hot today. 中hot美语中几乎就读成了hut这个词的读音了。第二,句子一连读、失去爆破等,连单词的影子都找不到了。如:This-is-an ol(d) pi(c)ture-of-a bi(g) car。横线表示连读,连读起来都不知道到底是一个词还是几个词了,括号里是不发音的,所以这个句子一旦读出来就完全走了样了。但听力中这种现象确是很常见的。要想练习好听力,首先要练习好英文的读音,包括词和句的读音规则。尤其对于外地孩子来说,就更重要了。如湖南的孩子说“我来自湖南”,由于方言影响就成了“我来自弗兰”。而这些人都不认为自己的读音是错误的,所以他听别人这样说的时候也认为是正确的。总之,如果我们平时的读音是错误的话,当听到正确读音时反而会不知道是哪个词,所以要想加强听力,首先要加强自己的读音。(党敏) +时尚 组图:10款艳丽泳装熟女穿出少女情怀导语:时下的泳装注重层次和线条感的悠闲设计,流露出自然的气质。 简洁的色彩搭配,甜美感觉凸显少女情怀,抽象概念化的异域花卉,颜色和谐、明快,印花纱裙,感觉轻盈,细致有女人味。 +时政 台“中选会”称12月5日选举时程不变新华网消息 据台联合晚报报道,台“中选会”上午如期召开幕僚选务会议,仍按原定12月5日举办“三合一”选举时程进行相关作业规划。“中选会”将在9月4日发布选举公告。基于考量莫拉克风灾灾后重建,以及H1N1疫情发烧,有部分蓝绿政治人物倡议延后年底“三合一”选举。据了解,到目前为止,年底“三合一”选举的相关选务作业仍如期进行。“中选会”表示,“中选会”是选务机关,是否延选,仍须由政策决定,在政策未改变前,“中选会”将依既定时程,规划年底“三合一”选举的相关选务作业。 +游戏 《天问》国家系统神秘美丽女儿国初探传说在遥远的西域,有一个神秘美丽的国家,上至国王,下至百姓,全国居民都是美丽温婉的女性。唐僧四师徒一路西行,就是来到了这个风光如画的女性之国。粉色帷幔随风飘扬,阳光照耀着的粉色砖墙闪闪发亮;清澈的泉水边,风情万种的女子们悠闲地编制精美的地毯,蝴蝶在花香中起舞……西梁女国就是一位端坐西域的温柔而美丽的少女,带着神秘的微笑注视来来往往的游客。解阳山是全新的练级场景, 山上微风吹拂,仙鹤悠闲地梳理着翎羽,处处透露平和安逸的气氛。但是山顶一座简陋的道观,竟藏着不少金银财宝?西梁女国百姓最珍视的一口泉水,也隐藏在道观山之上,这里到底隐藏着什么秘密?在解阳山上有一个神秘的副本波月洞,里面溶岩密布,石柱高耸,组成了各种美妙的景观。然而,波月洞盘踞着以毒蝎精领导的一群女妖,这帮妖精已与女儿国争战多年。当群侠得知毒蝎精近来甚至企图绑架女儿国太子,以要挟国王就范时,不论是出于怜香惜玉,还是英雄救美,一场的激烈的战争终将不可避免的开始了…… +科技 五彩时尚MP3 三星U5仅售299元 三星YP-U5(2GB)共有蓝、粉、白、红、黑五种时尚漂亮颜色可供选择。色彩感很浓烈。三星YP-U5(2GB)的背面还提供了一个背夹,再加上五颜六色的款式,使它看上去很像一个美发卡。机身很小巧,三围尺寸只有25×88×11.8mm,重量也只有23g,完全可以随身携带。在机身正面可以看到一个OLED冷光屏,显示的字体比较清晰。三星YP-U5(2GB)可以支持mp3、wma、ogg、Flac音频格式文件播放,此外,它支持三星最新的DNSe代3代音效,5种音效,提供自动、正常、工作室、摇滚、节奏及布鲁斯、舞厅、音乐厅7种选择,也可以进行自定义,对EQ和3D进行调节,效果非常好。除了出色的音乐播放功能以外,三星YP-U5(2GB)还支持FM收音机、歌词显示、MIC录音等功能。编辑点评:U系列是三星主打平价市场的产品,主要针对学生、办公室一族。相信这款音质出众、色彩绚丽的时尚MP3,也将为学生和年轻白领一族的个性生活增添亮丽色彩。    三星YP-U5(2GB) 
     [参考价格] 299元    [联系方式] 13434155009      diff --git a/tests/data_for_tests/io/THUCNews/test.txt b/tests/data_for_tests/io/THUCNews/test.txt new file mode 100755 index 00000000..81d00e65 --- /dev/null +++ b/tests/data_for_tests/io/THUCNews/test.txt @@ -0,0 +1,9 @@ +体育 凯尔特人vs尼克斯前瞻III纽约背水战 甜瓜必杀令新浪体育讯北京时间4月23日上午7点,凯尔特人将迎移师纽约城,挑战尼克斯,这是两队首轮的第三次交锋。前两场比赛中,小斯和安东尼轮番打出现象级的表现,可惜都无法为尼克斯带来一场胜利。目前凯尔特人总比分2-0领先,对尼克斯而言,他们没有退路。“第三场在主场击败,这是一场必胜的战争,我们根本输不起,这是本赛季为止将要面临的最艰难的一场比赛。”安东尼说。然而运气却不在纽约这边,他们接连以小分差输掉两场,与此同时,比卢普斯和小斯又接连出现伤病,第三场比赛两人的状态仍旧未知,小斯缺席了球队的训练,他在第二场下半场因为背部痉挛休战,但小斯仍希望能够在第三场出战,比卢普斯则有膝伤在身,能否复出还要等赛前决定。第二场比赛中,比卢普斯休战,小斯下半场未打,比尔-沃克全场11投0中,但是尼克斯凭借安东尼的42分17个篮板6次助攻,顽强的将比赛拖到最后一秒,直到最后时刻杰弗里斯的传球被KG抢断,才遗憾落败。德安东尼说:“很遗憾他们两不能上场,但从积极方面看,下半场球队打出的顽强表现,让我们信心满满。”小斯在第一场拿到28分11个篮板,但是安东尼在那场饱受犯规困扰,18投5中只拿到15分,下半场11投1中,尼克斯最终85-87落败,纽约人相信,如果安东尼和小斯同时发挥,他们有很大机会扳倒绿巨人。“我想这是一种精神折磨,你知道自己打得有多努力,有多棒,但两次我们都距离胜利差之毫厘。”安东尼说。第三战将是尼克斯自从2004年4月25日以来,首次在麦迪逊广场花园首次举办季后赛,这座举世闻名的篮球麦加殿堂已有七年未曾染指季后赛。对凯尔特人而言,他们的进攻出现了不少问题,季后赛前两场分别是靠雷-阿伦和凯文-加内特的关键球才勉强击败对手。里弗斯表示,球队表现需要提高,奥尼尔第三场能否出战还是谜,雷-阿伦连续两场打出不俗表现,隆多则在第二场砍下30分7次助攻,他们将尼克斯的命中率限制到35.6%,但与此同时,他们也丢失了大量的防守篮板,上场比赛尼克斯抢下了20个进攻篮板,而凯尔特人只有9个。小斯曾在这轮系列赛中和格伦-戴维斯大打口水仗,此战重回纽约,尼克斯急需他的发挥,接下来就看小斯带伤出战,能为尼克斯提供多少支援了。两队预计首发:凯尔特人:隆多、阿伦、皮尔斯、加内特、小奥尼尔尼克斯:道格拉斯、菲尔德斯、图里亚夫、安东尼、小斯(木瓜丁) +娱乐 
独家探班李康生蔡明亮短片《自转》(组图)新浪娱乐讯蔡明亮(阿亮)导演、李康生(小康)演出的银幕组合让两人在国际影坛挣出一席地位,如今两人“角色互换”!李康生执导台湾公视《台北异想》影片中的短片──《自转》,请出已20年没站在镜头前的蔡明亮当演员,阿亮为了爱徒再次“下海”演戏,没想到自称对演员施以爱的教育的小康,拍第一场戏就让阿亮吃了18次NG,现场更放催泪音乐,让感情丰富的阿亮流下真情的眼泪。台湾公视的《台北异想》影片,概念将一天从清晨六点起分为八个时段,邀来李康生、郑芬芬、钮承泽、林靖杰等八位导演,拍摄八部十分钟短片,接力诠释24小时的台北故事。小康选了凌晨四时至六时的时段发挥,他说:“2006年,舞蹈家伍国柱、罗曼菲相继过世让我感触很深,蔡明亮拍摄电影《洞》时,罗曼菲担任舞蹈编排,她直率、认真的性格留给大家很深的印象。因此特别选择她凌晨四点多辞世的时段,拍摄《自转》,也希望将这部短片献给她。”蔡明亮自从20年前曾在电视单元剧中饰演乐团主唱后,即不再以演员身分现身萤光幕前,为了挺爱徒再站镜头前,阿亮坦言,剧中虽只需扮演自己,但被拍仍令他紧张,要不是近几年常受访,被媒体训练出减少对镜头的恐惧,不然他不会让自己名列演员名单中。被阿亮指导演戏惯了的小康,如何回过头来对恩师教戏?他虽说:“我让演员自由发挥,采取『爱的教育』!”但光是陆奕静炒咖啡豆,阿亮静坐咖啡厅一隅,这全剧第一个镜头就磨了十八次,现场播放雷光夏广播录音和林怀民舞作《挽歌》音乐,更催出阿亮的男儿泪,阿亮说:“我就是想到了罗曼菲,更感受到美好的事物都会消失,真想再看一次罗曼菲跳舞。”《自转》的最后一场戏,陆奕静衬着音乐转圈跳舞,阿亮也即兴起舞,但连两天熬夜赶戏体力透支,加上不停转圈,她拍到呕吐、阿亮则晕眩不止,小康却满意称赞:“这两人跳得不错嘛!”小康当导演,从第一场戏折腾演员到末场戏,堪称“有始有终”,蔡明亮笑说:“未来我还是选择继续当导演吧。”台湾特派记者郑伟柏/台北报导 声明:新浪网独家稿件,转载请注明出处。 +家居 打好算盘最省钱瓷砖选购法面对导购小姐的微笑更是心中打鼓:人家说的好像挺有道理,但会觉得说得越好,会不会上当啊,是不是有猫腻呢?本文从建筑卫生陶瓷角度来分析,其它建材选购原理也与之相差无几。瓷砖的选购很讲究,要知道瓷砖这玩意儿一旦铺上了要是再发现有问题,后果是很严重的!下面列出的几点问题是在装修前一定要想清楚的,这些问题往往决定了以后选择瓷砖的种类、规格、价位甚至家居的整体风格。1、到底铺什么?这个问题好像问得很白痴,但这却是最基本的,首先你得充分了解哪些空间适合用哪些瓷砖啊!其实这个问题的关键不是用什么铺地,而是各种材料该怎么搭配。比如:有些业主希望在客厅铺瓷砖,同时在卧室选择木地板,这样问题就产生了:如果客厅铺普通玻化砖,卧室铺强化复合地板,那么卧室与客厅就会存在3cm左右的高度差,这主要是由于强化地板下没有打龙骨造成的。那么是不是在卧室选择实木地板就行了呢?当然不是。通常实木地板由厂家安装都会使用3×2cm的龙骨,如果为了和客厅的瓷砖找平最好使用5×4cm规格的龙骨,但是各个地板厂商对于更换龙骨的服务条款可是不同的。所以要充分与业主沟通,毕竟我们的目的是要让业主满意,了解业主的最基本的要求,然后根据业主的原始思路,找出最合适的方案。如果业主希望选择地板与地砖混铺的方式,就一定要规划好,避免不必要的麻烦。下面介绍两种基本搭配方式:瓷砖+强化地板=铺地板的房间用水泥灰浆垫高3cm,瓷砖+实木地板=地板下采用5×4cm规格的龙骨。2、选择什么规格的地砖?是铺600的?800的?还是1000的或是其它规格的?这是一个问题!现在的地砖,尤其是客厅使用的地砖主要是500mm、600mm、 
800mm和1000mm(即1米)等规格,其中使用最多的是600mm和800mm两种。那么该如何选择呢?建议根据铺贴的面积及家具的摆放进行选择。由于单位面积中600mm的砖比800mm的砖铺贴数量要多,所以视觉上能产生空间的扩张感,同时在铺贴边角时的废料率要低于800mm的砖,而空间大时铺800甚至1米规格的砖就显得大气。因此建议小于40平米的空间选择600mm规格的地砖;而大于40平米的空间则可以选择800mm或一米的地砖。值得注意的是,如果在房间中家具过多(如卧室),盖住大块地面时,最好也采用600mm的地砖。3、该铺怎样的砖?到底是选择铺怎样的砖呢?是仿古砖还是抛光砖?仿古砖自然、柔务,在复古风格、尤其是拼花上有着玻化砖无法比拟的优势。同时,由于表面釉层的保护,对于茶水、墨水甚至热烟头的抗污能力也优于玻化砖。但是玻化砖也并非一无是处。随着技术的发展,现在玻化砖表面玻化层的密实度、光洁度已经相当的高,不仅能够使居室显得更加亮堂,还决不会像釉面砖由于外力碰撞、摩擦产生釉面破损的现象。所以选择什么样的砖要根据你要体现的风格,要明亮、大气就选抛光砖,要自然、温馨就选仿古砖。建议居室空间、客厅如果采光相对有限选择玻化砖,而光线充足的客厅和和需防滑的厨房和卫生间地面,及阳台等可选择仿古砖或其它釉面砖。4、“微晶玉”、“微晶石”、“微晶钻”都是什么意思?很多人逛建材城最头疼的恐怕就是记录瓷砖的名字了。什么“微晶玉”、“微晶石”、“微晶钻”、“超炫石”、“聚晶玉”等等。其实大家根本没必要记住这些拗口的名字,它们描述的都是同一种东西——玻化砖,这些名字只是厂商为了区分产品的档次,进一步细化市场而使用的代号罢了。在选择时大家只要坚持自己的预算,尽量选择适合自己的产品就行了。微晶石表面很炫,但其硬度只有莫氏五度左右,不耐磨,不适于用在地面,比较适于用在外墙干挂。 +房产 迪拜危机启示录:空中楼阁迟早要倒塌美国拉斯维加斯,又一家奢侈至极的酒店在这个“罪恶之城”绽放。但此次,相较酒店豪华的各种天价服务和开幕典礼上的好莱坞群星璀璨外,似乎其幕后的主人更吸引人们的眼球--迪拜世界。仅仅一周前,迪拜世界这个名词牵动了世界每个角落的神经。11月25日,迪拜主权财富基金迪拜世界宣布,暂缓偿还债务。根据评级机构穆迪的估算,迪拜的债务预计接近1000亿美元。巨大的数额勾起了人们对去年雷曼兄弟倒闭以来那波汹涌澎湃的国际金融危机的回忆。汇丰、渣打、巴克莱、苏格兰皇家银行等在内的多家银行涉及在内。人们开始担心,我国是否也会因此受到波及。庆幸的是,国内几大商业银行随即申明表示,没有涉及迪拜世界、迪拜政府和其他相关迪拜主权基金及机构发行的债权。有所涉及的,比例也相当的小。记者致电多家研究所银行业分析师,均表示认为此事对国内银行业影响不大,目前没有特别关注。因此,公众的目光从银行投向了导致其债务根源的房地产业。迪拜世界的房产项目,现在已经成为了全世界最大的烂尾楼。而就在这债务问题凸显的时刻,其旗下的“重型”项目却闪亮登场。“城市中心”酒店的开幕,似乎使得地产行业最尴尬的一面展现在了公众眼中。反观我国的地产行业,近期拍卖地王频现,房屋交易价格再次飙升,种种迹象也让人们对其产生了许多担忧。有专家对记者表示,在高速成长时期,楼价和地价互相推动的背后,是资金的不断流入。在那些光鲜的大楼后被后默默支撑的是债券、贷款等各种负债工具。一个原本是沙漠中人口只有十几万的小城,在几乎没有任何实业的基础上,居然吸引了世界上各方的资金,建成了一个人口上百万的豪华都市。房地产市场的巨大利益诱惑在其中占据了重大的因素。不断高涨的楼市,加上免税的便利,使得国际游资疯狂涌入。在聚集了巨大资金后,其所投资的项目遍布世界,美国这次的拉斯维加斯“城市中心”项目,迪拜世界就砸了近50亿美元。这种推动与反推动作用,给予了人们一个璀璨的迪拜,但当问题暴露,留下的却是满目疮痍。“迪拜危机对我们而言更多的是警示作用。”中国社科院金融研究所中国经济评价中心主任刘煜辉在接受《证券日报》记者采访时如此表示。他认为,目前为止迪拜危机对我国银行业的影响不多,但由于有过全球金融危机的影响,心理上的波动是会有的。此外,刘煜辉还告诉记者,任何以过度负债支撑起来的价格上涨或资产泡沫都是需要高度警惕。因为一旦泡沫破裂,就会带来破坏性较强的连锁反应。相信通过这次迪拜危机的警示,国内更多的行业会关注本行业内的负债和泡沫,对于投机性行为和高风险项目将会更加冷静。我要评论 +教育 知名美国私立寄宿中学来华招生行程序号 学校 时间 地点 学校情况 1、北野山中学Northfield Mount Hermon School10月26日 星期三PM1:00 美丽园龙都美国教育部认可的示范型学校2、Cranbrook school10月27日 
星期四AM8:40-10:20美丽园龙都每年本校学生的AP考试成绩都位列于全国成绩最好的学校之中3、The Storm King School10月29日 星期六PM4:30上海南京西路1515号嘉里中心1809室纽约州一所私立男女混合精英寄宿中学4、Villanova Preparatory School10月30日 星期日PM1:00-4:00虹桥万豪酒店美国唯一一所的男女混合寄宿制天主教教会学校5、Wyoming Seminary Upper School11月1日 星期二AM10:00香格里拉美国著名的百年贵族名校,也是美国东北部最古老的中学及大学预科学校6、胡桃山音乐学校Walnut Hill School11月2日 星期三PM1:00浦东香格里拉美国最古老的艺术高中7、弗莱堡学校Fryeburg Academy11月3日 星期四PM2:00-6:00上海南京西路1515号嘉里中心1809室一所独特的提供寄宿和走读学习的学校8、St.Johnsbury Academy11月8日 星期二AM9:00-12:00上海南京西路1515号嘉里中心1809室美国中学中拥有最棒校园的男女合校寄宿学校9、波特茅斯教会学校Portsmouth Abbey School11月8日 星期二PM1:00-3:00北京朝阳区建外SOHO,A座9层全国首屈一指的天主教混合住宿学校10、波特茅斯教会学校Portsmouth Abbey School11月15日 星期三PM1:00-4:00上海南京西路1515号嘉里中心1809室全国首屈一指的天主教混合住宿学校11、库欣高中Cushing Academy11月第三周待定美国最悠久男女合校寄宿中学之一12、West NottinghamAcademy11月19日 星期六PM2:00上海南京西路1515号嘉里中心1809室美国最早的学校,245年历史13、格瑞尔女子中学The Grier School11月26日 星期六PM9:45明天广场万豪历史悠久的著名女子寄宿学校14、萨菲尔德学院Suffield Academy11月30日 星期三 待定有170多年历史,是一所男女同校的私立中学15、威利斯顿 • 诺塞普顿中学The Williston Northampton School12月1日 星期四PM2:00-4:00上海南京西路1515号嘉里中心1809室学校以其优质的教学质量而闻名16、菲利普斯埃克塞特Philips Exeter Academy12月2日星期五PM6:30-8:30北京建国饭店牡丹厅(北京建国门外大街5号)“美国高中的哈佛” 、全美国最好的私立寄宿制高中17、菲利普斯埃克塞特Philips Exeter Academy12月3日星期六PM2:30-4:30上海浦东香格里拉浦江楼2层青岛厅“美国高中的哈佛” 、全美国最好的私立寄宿制高中18、菲利普斯埃克塞特Philips Exeter Academy12月5日星期一PM6:30-8:30浙江图书馆1楼文澜厅(杭州西湖区曙光路73号)“美国高中的哈佛” 、全美国最好的私立寄宿制高中19、坎特伯雷中学Canterbury School12月5日  星期一AM9:00-12:00 待定走读与寄宿都有的男女合校20、西城中学/威斯顿中学Westtown School12月5日 星期一AM9:00待定一所拥有205年悠远传统的中学21菲利普斯埃克塞特Philips Exeter Academy12月6日 星期二PM6:30-8:30广州天河区林和中路6号海肮威斯汀酒店5楼蓝厅“美国高中的哈佛” 、全美国最好的私立寄宿制高中22菲利普斯埃克塞特Philips Exeter Academy12月7日 星期三PM6:30-8:30深圳格兰云天酒店26楼云河厅(福田区深南中路3024号)“美国高中的哈佛” 、全美国最好的私立寄宿制高中23Cheshire Academy12月18日 星期日待定美国最早的传统寄宿中学24The Governor’s Academy待定待定美国最古老的寄宿高中之一25Peddie School待定待定著名的具有悠久历史的男女混合寄宿学校26Westover School待定待定美国著名的大学预备女子私立寄宿中学27Rabun Gap-Nacoochee School待定待定一所6-12年级的大学预备住宿走读中学28Ben Lippen School待定待定一所为学生提供大学准备课程的教会学院29George Stevens Academy待定待定一所拥有200多年历史的学校 +时尚 组图:纽约2011时装周 
博主编辑街拍自成风景导语:纽约2011春夏时装秀正在如火如荼地进行着,打开任何时尚网站,你都可以看到这RUNWAY秀的图片,所以我不想在这里赘述了,反而我觉得秀场外这些赶赴现场的模特们和时尚博主以及时尚编辑的街拍更有意思。 +时政 台当局开放大陆银联卡在台刷卡消费中国台湾网7月16日消息 据台湾《联合报》报道,台当局“金管会”昨天发布修正“两岸金融业务往来许可办法”,开放大陆银联卡在台刷卡消费。最快9月初大陆民众就可以持银联卡在台刷卡消费,将可提高大陆游客赴台观光、消费意愿,并为台湾每年新增1000亿元(新台币,下同)刷卡商机。岛内银行也将可办理相关收单业务,对收单银行的手续费年收益至少可多出20亿元的贡献。报道称,台当局“金管会银行局副局长”萧长瑞表示,办法发布生效后,“金管会”就可开始受理岛内收单银行、联合信用卡中心等申请,台湾的联合信用卡中心也要跟大陆银联公司签约,估计最快9月初银联卡就可进入台湾。大陆银联卡赴台使用研议多时,消算等技术层面问题一直待克服,昨天“金管会”正式发布相关规定开放银联卡赴台,也代表技术面问题都已解决。根据“金管会”昨天发布的两岸金融业务往来许可办法第二条及第七条之一修正案,明定岛内信用卡业务机构经主管机关许可者,可以与银联公司从事信用卡或转帐卡的业务往来。主要包括银联卡在岛内刷卡消费的收单业务,以及交易授权与清算业务等两项。至于岛内银行发行银联卡的发卡业务则未开放。(高大林) +游戏 腾讯手游在线 《幻想西游》勇创新高根据腾讯QQ游戏中心2009年11月26日显示的在线数据,由腾讯和广州银汉联合运营的《幻想西游》再创新高,同时在线达到54336!54336同时在线一举打破之前的在线记录,创造手机游戏在线新高,这是《幻想西游》的光荣,也是手机游戏界的光荣!罗马不是一天建成的,《幻想西游》运营三年以前,开发组一直注重提升游戏品质和馈玩家,做属于玩家自己的游戏。这次创造在线人数新高,就是对开发组最高的褒奖。11月期间,《幻想西游》举行了“美在西游”系列活动吸引了数千美女玩家报名,6万多玩家参与了本次活动,掀起了11月的活动高潮。11月25日感恩节,开发组成员更是身怀感恩之心,化身GM来到游戏中倾听玩家的心声,并且心甘情愿地被玩家击败后奉上了感恩节礼物。12月将进入“美在西游”决赛阶段,广州银汉笑迎八方客,热情地邀请来自全国各地的美女玩家和跨服帮战优秀代表共聚羊城,共叙三年幻想情,畅谈西游未来路。《幻想西游》是根据名著《西游记》改编的手机网络游戏,具有操作简洁,界面美观,互动性好,娱乐性强的特点,营造出一个充满梦幻的西游世界。进入游戏:手机访问 http://3g.qq.com,选择游戏-网游-幻想手机官网 http://wap.01234.com.cn,选择快速进入 +科技 配18-135mm镜头 佳能7D国庆带票促销中(中关村在线数码影像行情报道)佳能EOS-7D是一款拥有1800万像素成像能力,每秒钟8张连怕性能,并具备高清摄像功能的单反相机。这款单反相机于上周登陆中关村市场,是目前APS-C规格单反中的旗舰机型。今天笔者在市场上了解到,配备有18-135mm防抖镜头的7D套机,价格为13800元带发票。EOS 7D实现了在约1800万有效像素的高画质下,高达约8张/秒的连拍速度。并搭载了高速智能的自动对焦系统等众多新功能。EOS 7D不仅达到了约1800万的有效像素,还实现了低噪点的精细图像表现。其搭载的CMOS图像感应器是佳能自行研发生产的产品。在提高像素感光度的同时,对像素内的晶体管进行了改良实现了更高的S/N(信噪)比。7D的常用ISO感光度为100-6400,扩展ISO感光度最高为12800。图像信号传输是在将单通道序列读取高速化的同时,采用8通道进行高速读取。与EOS 50D相比要快约1.3倍,实现了约8张/秒的高速连拍。另外,对更换镜头时以及反光镜、快门等动作时产生的感应器灰尘也采用了相应的综合除尘措施;同时还搭载了可从相机硬件和附带软件两方面进行除尘的“EOS综合除尘系统”,在除尘功能上考虑得十分周到。快门单元和机身盖采用了不易产生碎屑的特殊材料;即便是不小心进入了灰尘,也可以通过超声波使图像感应器最前面的低通滤镜产生振动将灰尘抖落。低通滤镜表面进行了氟涂层处理,不论是对难以脱落的具有较高粘度的灰尘还是潮湿的灰尘都有着很好的除尘效果。双DIGIC 4数字影像处理器实现了对通过8个通道从图像感应器中高速读取出的,具有约1800万像素的庞大数据的迅速且高精度处理。搭载了2个高性能数字影像处理器DIGIC 4,能够对各种数据进行并行处理,即使是约1800万有效像素也可以实现最高约8张/秒连拍的高速图像处理。EOS 
7D搭载了多达19个的自动对焦点,并且提高了每个对焦点的对焦精度。19个对焦点全部采用对应F5.6光束的十字型自动对焦感应器。将用于检测纵向线条的横向线型自动对焦感应器与用于检测横向线条的纵向线型自动对焦感应器呈十字型排列,从而实现了很高的被摄体捕捉能力。中央对焦点在相对于F5.6光束十字型自动对焦感应器的斜方向上配置了对应F2.8光束精度更高的十字型自动对焦感应器。通过中央八向双十字自动对焦感应器的协同工作,实现了高速且高精度的合焦。追踪被摄体的人工智能伺服自动对焦功能也在EOS 7D上得到了大幅的进化。EOS 7D的光学取景器具有约100%的视野率和约1倍(100%)的放大倍率,同时具有29.4°的视角和22毫米的眼点,其光学性能在历代EOS单反相机中也名列前茅。通过视野率约100%的光学取景器观察到的范围与实际拍摄的范围基本一致,因此能够得到非常精确的构图。此外,EOS 7D还在光学取景器内搭载了具有背透型液晶面板的“智能信息显示光学取景器”,它能够在对焦屏上显示网格线和三维电子水准仪等内容。EOS 7D的机身外壳采用了重量轻,刚性高且具有电磁屏蔽效果的镁合金材料。表面涂层采用了与EOS数码单反相机中顶级的EOS-1D系列相同的涂层材料及工艺。此外,EOS 7D还具有防水滴防尘构造,镁合金的外部部件变为高精度接缝构造,电池仓、存储卡插槽盖以及各操作按钮周围等都采用了密封部件,来保护相机的内部。EOS 7D背面的液晶监视器采用了具有160°的广视角(上下左右方向)及高清晰的92万点新型液晶监视器——“3.0"清晰显示液晶监视器II型”,其内部构造也经过重新研发,采用了新技术。7D机身上分别设置了专用的“实时显示/短片拍摄开关 ”和相应的“开始/停止按钮 ”,并且短片拍摄时能够在手动模式下对曝光进行控制。此外,可实现每秒30/25/24帧,分辨率1920×1080像素的全高清短片拍摄,在使用高清画质(分辨率1280×720像素)及标清画质(分辨率640×480像素)时,能够以每秒60/50帧进行拍摄。编辑观点:佳能7D的出现,再一次丰富了E0S产品系列中APS-C规格单反的阵营。佳能也终于有了可以和尼康D300级别单反正面对抗的产品。而出色的性能表现,不论是摄影爱好者还是专业人士,都会对其青睐有加。而上市价格也比较合理,只是希望7D不要重蹈5D II缺货涨价的覆辙。 diff --git a/tests/data_for_tests/io/THUCNews/train.txt b/tests/data_for_tests/io/THUCNews/train.txt new file mode 100755 index 00000000..65ca8a36 --- /dev/null +++ b/tests/data_for_tests/io/THUCNews/train.txt @@ -0,0 +1,9 @@ +体育 火箭这一胜有更多意义 这是联盟最差击败联盟王者根据ESPN记者亨利-艾伯特的报道,对于一支NBA球队来说,在比赛最后24秒落后一两分或者和对方打成平局,这时候得分能力的高下就将决定最后的胜负。根据近五年来的统计,在这样的关键时刻下,联盟里最擅长得分的球队是黄蜂队,而最不擅长得分的球队则是火箭队。今天这两支球队狭路相逢,最后的24秒正是这样的情形。如果根据近5年火箭和黄蜂的表现来开,那火箭输定了。可是,奇迹出现了,火箭在距离比赛还有22秒的时候以88-87领先对手1分,但是他们并未停下得分的脚步,通过马丁和科特尼-李的三次罚球,他们最终让联盟最会把握最后时刻的王者球队黄蜂最终只是在临近终场的时候由大卫-韦斯特投进了无关紧要的一球,而以2分的优势胜出。一向不善于打关键球的火箭队今天却在最后时刻顶住了压力,力挽狂澜,这相当于火箭用自己最差的技能战胜了全联盟此项技能最强的球队。这和我们以往印象中的火箭截然不同。以往火箭总是在最后时刻无人挺身而出。然而马丁的出色发挥保证了火箭在最后时刻对对手篮筐的冲击力,他不断地抢断、造对手犯规,让黄蜂无法跟上火箭的得分脚步。在今天的比赛中,我们没有看到那支曾经缩手缩脚的球队,也许交易截止日期过了之后,所有的球员终于能安心稳定下来打球了吧。所以一度拥有巨大领先优势、穿着庆祝节日盛装队服的黄蜂最后俨然不敢接受这样的现实,我们至少从保罗的眼神中读出了这失望。所以,这场比赛的胜利对于火箭来说有着更深一层的意义。不论火箭是否已经达到脱胎换骨的境界,至少全明星后的四连胜对火箭冲击季后赛这个短期目标来说,是个极好的兆头。(大猩猩) +娱乐 《山楂树》电影比原著还干净 
删减情节曝光(图)《山楂树之恋》小说有20万字,要将原著的全部内容压缩到一部110分钟的电影里,实属不易。事实上,电影里删掉了小说原著中的几场吻戏和激情戏的大部分内容,比小说原著还“干净”。张艺谋自己在说到改编的时候也表示,“其实原作中很多情节我都拍了,但是实在是太长了,我希望能将更多的笔墨放在老三和静秋身上,又能让故事平静地娓娓道来,所以剪掉了大半,后来还做了一些字幕将一些年代关系简化掉。 ”删除部分——长林喜欢静秋小说:静秋刚到生产队长家时,队长老婆希望把她说给自己的二儿子长林,而憨厚的长林也确实喜欢静秋。于是他偷偷地以自己的方式表达着他的爱,然而当他知道老三喜欢静秋时,也觉得自己配不上静秋,默默地就收回了自己的这份感情。影片:影片中这个分支被彻底删掉了,长林到静秋家送过一次核桃和冰糖,但都是老三让他去的。不过静秋在队长家吃饭时,队长一一介绍大哥二哥三哥的时候,长林突然间站起来的反常表现,还是可以看出他面对静秋时候的紧张。很显然,张艺谋其实拍了长林这段,但后来剪掉了。大量枝杈人物小说:为了让故事更丰满,小说中有很多配角在不同的阶段出现。例如,为了表现静秋被欺负,安排了王长生、万驼子这样的反面角色,也安排了成医生一家的出场,静秋对于白血病的一些知识都是从成医生那儿得来的。书中的静秋有个哥哥,为了能让哥哥顺利娶媳妇,这一家人也是做了不少牺牲和努力。影片:这些人物不复存在,张艺谋明确表示,为了有充分空间描述静秋和老三的爱情,不得不舍弃。老三的告别信小说:静秋无意中得知老三得了白血病。两人在医院度过了难忘的一夜,静秋向老三表示:“如果你死了,我也去死。 ”因此,老三选择了离开,并留下一封告别信,表示自己根本没得白血病,只是感冒了,而他不打算要静秋了。影片:老三早早就就澄清自己只是感冒,而之后又不告而别,令静秋既迷惑又伤心,那封告别信并没有出现。更多亲密片段小说:虽然号称“史上最干净的爱情”,小说中也有老三亲吻静秋的描写,包括二人在医院度过难忘一夜中“床戏”的描写。影片:张艺谋拍得比作者写得更干净,能算得上亲密的只有老三用军大衣拥静秋入怀,在医院难忘一夜里,老三和静秋手握着手和衣而眠。对此,张艺谋的解释是,对于影片来说,小说中某些场面还是较为“露骨”,毕竟要考虑到国内电影的审查制度,而且两张清纯的面庞经不起附加太多的“性”。作者有话——改编忠实度把握不好而小说《山楂树之恋》的作者艾米,在接受专访时曾表示,电影删掉的原著中的几场吻戏,没什么道理。《山楂树之恋》的主线就是静秋由惧怕“失足”到主动要求“失足”的转变过程,每场吻戏都是这个过程不可或缺的部分。如果去掉,就等于去掉了故事的主线,静秋后来的要求“失足”就会显得突兀。艾米同时指出:“我以为,这两位导演改编的忠实度把握得不好。仅从现在已经透露出的信息来看,就做了几个很没水平的改编。 ”记者 王琳娜 陈妍妮 +家居 物业交地产公司 以月租10万英镑放盘一年(图)   丹尼尔明年9月担纲演百老汇剧《恋马狂》时,正好方便落脚,但他似乎并非如此打算,因为他已把物业交地产公司,以月租10万英镑(150万人民币)放盘一年。租客将可享用会所设施,包括泳池和蒸气浴室,以及酒店公寓服务。 +房产 开发商频频拿地 市场复苏谨防再起炒作风10日,经过50次举牌,广州市城市建设有限公司以总价34500万元夺得广州天河区珠江新城一地块,折合楼面地价15324元/平方米,而此前珠江新城最高楼面地价为11912元/平方米。 今年2月份以来,随着楼市“小阳春”的到来,沉寂了多个月的土地交易市场再起波澜,开发商们在土地收储上的集体爆发引人关注。再露繁荣景象的土地市场反映出房地产企业充足的资本和对后市的信心,同时,随之高涨的地价、房价也让人们担心,新一轮炒地提价的闸门是否已经悄然打开。 信心加资本撬动土地市场全面复苏 从绿地集团(企业专区,旗下楼盘)分别以9.57亿元和12亿元的价格接连拿下上海松江区辰花路15号B地块和徐汇区斜土街道107街坊,创今年上海土地出让价格的新高,到富力地产(企业专区,旗下楼盘)10.22亿元拿下北京广渠门外10号地,再到中洲宝城26.1亿元拿下深圳3宗捆绑商住地块,雅戈尔10.28亿元拿下宁波“地王”。一个多月的时间内,国内“地王”频现。 中国指数研究院最新的统计数据显示,6月1日至7日,全国20个重点城市共推出土地124宗,环比增加25%,推出土地面积608万平方米,环比增加25%,成交土地面积173万平方米,环比增加14%。 “优质地块一直是开发商们收储的对象,只不过去年楼市的低迷抑制了开发商的热情。”易居中国房地产研究院综合部部长杨红旭在接受采访时指出,目前的情况表明冷落已久的土地市场开始复苏,地产商对后市的预期正在转好,信心正在增强。 
国内地产巨头万科近日发布的公告显示,在过去的一个多月中,公司已斥资23亿元多处拿地。这与其两个月前对于国内楼市“尚需进一步观察”的谨慎表态形成了鲜明的对比。 万科能在短时间内连连出手,表明公司“不差钱”。上述公告显示,5月份万科实现销售面积69.7万平方米,销售金额64.1亿元,同比分别增长19.3%和19.7%。这一销售额已经接近2007年高峰时期的单月最高纪录。而今年1至5月,万科的销售总额已达238.9亿元,较2008年同期大涨20.9%。 嘉华(中国)投资有限公司总经理助理谷文胜表示,近期国内楼市十分活跃,开发商在短时间内回笼了大量资金,而开发项目资本金比例也降低了15个百分点,这都使开发商的财务状况大大改善,现金流增加,出于持续发展的需要,买地是很自然的。 地价楼价再入上升通道引发担忧 然而伴随着土地市场的不断回暖,房地产市场成交价格的不断冲高也越来越成为人们关心的问题。 根据国家发展改革委、国家统计局调查显示,5月份,全国70个大中城市房屋销售价格同比下降0.6%,降幅比上月缩小0.5个百分点;环比上涨0.6%,涨幅比上月扩大0.2个百分点。 北京、上海、深圳等地不断传出各类楼市涨价新闻,其中北京朝阳区一处楼盘一个月内每平方米房价上涨5000元的消息更是加重了购房者对后市的担忧。就在富力集团高价拿下广渠门外10号地之后,周边的二手房价格就开始跟风上涨,虽然尚无准确的统计数据,但据业内人士透露,部分业主跟风涨价的行为已经在京城房地产市场上营造出了浓浓的涨价氛围。 “现在开发商又在大量买地,土地市场和楼市会不会再像2007年一样被炒出一波高涨的行情?”正准备买房的丁先生向记者表达了自己的担忧。 丁先生的担忧不无道理,一边是高调拿地,一边是悄悄涨价。虽然综合全国土地收储和开发的情况看,开发商先前收储的土地并没有完全消化,市场供求关系也没有发生根本性的变化。但主要开发商在土地市场上的频频出手,还是很容易让人联想起2007年地价、房价交替上涨的火暴局面。 市场复苏谨防再起炒作之风 “目前的土地市场仍处于恢复性增长阶段,尚未到达繁荣期。”面对地产商纷纷布局土地市场的现状,杨红旭表示,现在还处于宏观经济的低谷期,很多开发商仍旧不敢对后市过于乐观。开发商们在土地市场上频频出手、高价成交,虽然客观上会使楼市预期升温。但土地市场的回暖和楼市的回暖毕竟还是两回事。在宏观经济形势没有发生根本性变化之前,盲目看高后市的地产商有可能碰壁。 北京我爱我家市场研究部高级研究员秦瑞表示,开发商高价拿地之后,地块周边二手房的业主常常会盲目跟风追涨,但从目前的市场环境来看,较高的房价只可能吓退对价格特别敏感的刚性需求,进而导致成交量的萎缩,加重市场的观望情绪。 对于一季度的楼市暖春,再次走上炒地涨价之路,无论是对开发商还是中小业主都不一定是件好事。机构分析人士认为,造成目前房价普涨、开发商收地加快的原因,一方面是市场回暖,另一方面是开发商的去库存已接近尾声,开发商注意力将转向购地、新开工面积和涨价上。 不过“去年以来的经验让购房者变聪明了”,秦瑞告诉记者,如果现在开发商或是中小业主盲目利用市场回暖的时机涨价,那么购房者很可能会再次持币观望,交易量的回落不可避免,房价的持续上涨也不会有市场的依托。 把握推地节奏警惕泡沫出现 谷文胜表示,企业决定买地与否的主要根据是对宏观经济形势的判断和对未来的预期,但“也可能是在全球性通胀预期的驱动下进行资产保值的一种选择,毕竟,持有土地的风险要小于持有现金的风险”。 尽管对购买土地是否真能规避通胀风险存有不同意见。但业内人士还是普遍认为,当土地交易市场成为投资市场,泡沫就随时可能浮现。在全球经济尚未好转、国内信贷相对宽松的背景下,如果将土地进行资本化杠杆运作,频频制造高价抢地的现象,泡沫便会被迅速吹大。 目前看来,地方政府较好地掌握了推地节奏,企业也还比较理性,没有盲目抢地的现象。不少房地产企业判断,“只要政府调控得当,今年应该不会出现像2007年那么多的‘地王’”。 长期调研楼市的上海市政协人资环建委员会专职副主任孙钟炬认为,要让房地产业回归理性、减少泡沫,就需要降低房产成本,而地价成本是房价成本的一个重要组成部分。 “拿地还是要谨慎,现在把地价抬得过高,未来可能心生悔意,就如2007年很多高价拿地企业一样。”杨红旭说。(记者 罗宇凡 叶锋) 我要评论 +教育 澳驻华使馆:政府公布多项国际教育新规澳大利亚驻华使领馆教育处17日通报称,澳大利亚移民与公民事务部长克里斯·鲍恩(Chris Bowen)议员及教育、技能、工作和劳资关系部长克里斯·埃文斯(Chris 
Evans)参议员今日宣布将对学生签证项目进行复审以及为国际教育行业制订的多项具体措施。埃文斯表示,澳币升值,全球金融危机在一些国家的持续影响,以及逐步加剧的来自美国、新西兰和加拿大等国为吸引国际学生而形成的竞争,给澳大利亚国际教育行业带来的压力在不断增加。他说,国际教育行业的规模和性质在过去十年中也发生了剧大的变化,因此我们采取政府各部门间通力合作的方式来应对这些变化是至关重要的。复审担负着提高国际教育行业的持续竞争力和加强优化学生签证项目两项任务,将为教育机构和各利益相关方提供机会,阐述他们对国际教育行业未来的远见卓识。据介绍,吉拉德政府已任命了澳大利亚勋章获得者迈克尔(Michael Knight)负责复审工作,并于2011年中旬向鲍恩和埃文斯提交复审报告。鲍恩指出,复审工作将考察主要利益相关方与学生签证申请要求之间所建立起来的合作伙伴框架,并将就如何建立一个更加有效的合作伙伴框架提出建议。同时还将审视各种更好的针对学生签证案例中移民风险的管理方法,遏制违规及滥用学生签证项目的行为,并考虑各类学生签证对不同教育类别的适宜性。他介绍说,政府还将采取多项措施,在继续坚持优化学生签证项目的同时,精简低风险人群的签证申请审理程序。这些措施有力支撑了政府近期为优化学生签证项目而采取的改革措施,并再次强调技术移民项目应为澳大利亚中长期经济发展提供所需的高端技能。这些措施包括:——按照近期澳大利亚移民与公民事务部进行的评估等级复审的建议,从2011年4月起,降低一些学生签证评估等级。作为这项决策的一部分,来自中国和印度的高等教育类别的学生签证申请评估等级将会被降低;——调整规定使预付的寄宿学校住宿费可以从签证申请所要求的生活费中扣除;——促进政府和国际教育行业间的信息交流,这包括即将在移民部网站上公布学生签证季度统计数据,以便院校跟踪了解学生签证新趋势;——使职业教育与培训(VET)学生签证评估等级(AL)4的签证申请人能够就读证书级别的配套课程,并能满足获得学生签证的要求。使馆介绍说,今天的这项宣布是对最近澳大利亚政府为加强国际教育行业而实施的多项措施的补充。这些措施包括:针对《2000年海外学生教育服务(ESOS)法案》的贝尔德复审(BairdReview),要求所有提供国际教育的院校于2010年底前重新注册的《海外学生教育服务(ESOS)法案》修正案,以及发布由澳大利亚政府理事会(Councilof Australian Government)制订的《澳大利亚国际学生战略》。埃文斯说:“保持澳大利亚教育继续被高度公认为能够为赴澳留学的国际学生提供高质量课程是十分重要的。”即将于明年成立的国家职业教育与培训规范局(National VET Regulator)和高等教育质量和标准署(Tertiary Education Quality Standards Agency)将保障职业教育与培训和高等教育行业继续保持高质量。 +时尚 组图:香肩美锁骨 性感不张扬女人哪个部位最美最性感?不是红唇,不是翘臀,更不是波胸,而是肩膀。锁骨,是你身着斜肩上装引来同性羡慕的地方,是被抹胸曳地长礼服衬托得最耀眼的地方,它的美充满灵性,让女人立刻有了一种轻盈的气质。它堪称女人身上一道最美的风景线。今夏,单肩装将低调并一枝独秀地流行着,一抹香肩半边锁骨的靓丽,同时造就了几个层次的美感,不对称、错落感、优雅、性感……一切都在那微微倾斜的一道色彩。单肩休闲衫 搭配牛仔最IN如果你认为,单肩风潮仅仅适用于相对正式的礼服或小洋装,那你就大错特错了,一款棉质的普通T恤,只需在剪裁上作一些调整,同时将领口开大,就能轻松呈现出当季最In的单肩感觉,在斜肩处露出细细的肩带,搭配牛仔裤就很好看。时尚女王凯特-摩丝永远懂得美的定义,就连最普通的T恤,一样可以穿出最Fashion的感觉。单肩小洋装 呈现多样风格短款单肩连衣裙根据面料、剪裁的不同,往往可以展现出多样、多变的风格。礼服型的单肩连衣裙充满野性;而缎面、丝绸材质的连衣裙则散发着迷人的青春气息。“绯闻女孩”布莱克-莱弗利一袭玫红色缎面单肩小洋装,玲珑曲线凸显无遗。 +时政 
全国95%以上地市建立特邀监察员制度新华网北京3月13日电(记者李亚杰)记者日前从监察部获悉,自1989年以来,监察部已聘请了四批特邀监察员,共计130人次。目前,全国31个省、自治区、直辖市,95%以上的地(市)、65%以上的县和中央国家机关的十多个部委,建立了特邀监察员制度。特邀监察员制度是中国共产党领导的多党合作和政治协商制度的重要组成部分,也是民主监督、参政议政在反腐败领域的成功实践。监察部有关负责人表示,自1989年建立特邀监察员制度以来,监察部一直高度重视,把这项工作作为监察机关的一项重要工作来抓,明确把专门监督与群众监督相结合的制度坚持得如何、特邀监察员工作是加强了还是削弱了,作为衡量和判断在纪检监察机关合署办公体制下行政监察职能是否得到加强的六条标准之一。特邀监察员工作开展近20年来,特邀监察员制度在实践中进一步得到完善和发展,特邀监察员队伍不断壮大,工作领域逐步拓宽,在党风廉政建设和反腐败工作中的作用也越来越明显。1989年5月,经过充分酝酿并经中央同意,监察部作出建立特邀监察员制度的决定。同年12月,监察部从民革、民盟、民建、民进、农工党、致公党、九三学社、台盟8个民主党派和全国工商联聘请了21位专家、学者为监察部首批特邀监察员。之后,特邀监察员工作在全国各级纪检监察机关逐步推开。1996年11月,监察部召开了全国纪检监察机关特邀监察员工作座谈会,这是特邀监察员制度建立以来召开的第一次全国性会议,总结交流了全国纪检监察机关开展特邀监察员工作的经验和做法,有力地推动了特邀监察员工作的深入开展。2004年10月颁布实施的《中华人民共和国行政监察法实施条例》进一步明确:监察机关根据工作需要,可以在国家行政机关、企业、事业单位、社会团体中聘请特邀监察员。聘请特邀监察员的具体办法由国务院监察机关规定。之后,监察部先后制定颁布了《监察部关于聘请特邀监察员的几点意见》、《关于改进特邀监察员工作的几点意见》、《中央纪委监察部关于加强和改进行政监察工作的意见》等一系列法规、文件和规定,明确了特邀监察员工作的总体要求和主要内容。即将颁布施行的《中国人民共和国行政监察法》,将进一步明确特邀监察员选聘程序、职责权限等,为特邀监察员全面履行职责提供法律依据。各地也结合工作实际,纷纷制定颁布了切实可行的工作制度。北京、上海、河南、广东、广西、山东、福建、四川、深圳等地还根据实践发展不断修订、完善特邀监察员工作办法等制度规定,特邀监察员工作的规范化、制度化水平不断提高。 +游戏 经典无法复制!《神鬼寓言3》PC版评析《神鬼寓言3》在一个异彩纷呈的虚拟世界,人类在电脑治下民主共存 -- 再没有什么比这更能激发想象的火花了。我的一个小巧玲珑的世界,我可以予取予求。力量感在我周身涌起,因为这结果完全由我来主宰。若是不想眼看着那帮狼人们凌虐镇子,我或者施法送出火球,或者挥舞宝剑,怎样都能拯救世界。我也可以将镇子寻求保护的一丝光芒熄灭干净,看着怪物们把尖叫的无辜百姓给撕成碎片。这些方面,《神鬼寓言3》做得可圈可点,但是 -- 太罕见了。在阿尔比昂大陆最新的故事里,纵然Lionhead工作室用令人荡气回肠的道德抉择设置了无数奇思妙想和激动时刻,它们却被深埋在了一堆毫不丰满的人物形象、冗长的故事和狗血情节里。如果你从来没玩儿过《神鬼寓言》,Xbox-360独占的《神鬼寓言2》也错过了 -- 没关系的,别担心为了了解《神鬼寓言3》而做好功课的事儿。所有需要你知道的,开篇全交代了:国王是个恶棍,需要被干掉。并不是遵循着最初的故事,总之我 -- 就是主角,从城堡里跑了,混迹市井之中,在阿尔比昂这个奇妙的大陆中徘徊,以期攒足人气资本,把国王搞下来,我自己坐这把交椅。《神鬼寓言3》所耍的手段在于,并不是我戴上王冠就终章了。那些我帮过的人,我还得给出承诺来;一旦取得王位,我得决定是旧账一律不认,还是一律兑现。这事儿让我真的很不舒服。我费大力气拯救出的那些人,敢情谁都不是跑龙套的,都等着最后来向我讨债,都等着我登基之后捎只胳膊带把手儿去拉他们一把。而且大多数的这种事儿都跟王国的安全这种更高层次的要求是冲突的。我不得不在践行诺言与保证阿尔比昂的安全之间竭力求取平衡,小心翼翼如履薄冰。这种构思其实挺不错,但是本来挺好的一件事儿,感觉怎么就这么恶心呢。首先这些人物就有问题。绝大多数的这些角色都同样地逡巡。相比行动来说,还是口音和衣着能有些区分。置他们的吁求不顾而去推广童工或者把妓院夷为平地,我这是多么撕心裂肺的抉择啊!除了我的导师与伙伴沃特,以及暴君洛根之外,剩下的角色全都一个心眼儿,根本就不比普通的三维物件强到哪里去。作为国王而背弃承诺之时,我真是毫无任何感觉,仅仅按下键盘命令,让他们滚,如是而已。穿插在《神鬼寓言3》的主线故事之中,有很多招募的任务 -- 
几乎就没有哪个有意思。也有分支任务,可大部分都是教科书一般的护送或者刺杀任务。我可以购置实业,但是只有最基本的项目可供自定义。一个饶有趣味的帝国管理游戏就这样被剥夺了,成了一个单调、乏味的流程,仅仅在金钱进入游戏里钱包的那轻轻一声响更是放大了这一点。我可以杀死或者审判阿尔比昂的百姓,但是与此一道的各种冷笑话和莫名其妙的大打出手,完全把这种感受给毁了。哪怕是黎民们当面儿大喊大叫说我是“刽子手”,我也照旧可以傻乎乎地跳舞、做支线任务、去约会,搞不好就结婚了,还拖家带口的。游戏中的形成、发展和关系的维系,全因为这个设定被束缚住了。就算是《神鬼寓言3》在某些方面引入了阴谋和神秘的元素,例如我被丢到一个黑暗荒芜的洞穴之后,我不得不面对各种恐惧,这使得我无法探索每一个角落。恐惧在这个大陆上是最强大的邪恶,而且大约会在游戏进程的三分之二处出现,而且仅仅会遭遇几次而已。游戏给人的感觉就是完成度不高,而且赶工迹象明显。寻找游戏中的收集元素、参与小鸡快跑比赛、镇压地精等等事情都让人很难一直保持兴趣。而当我最终坐上王座之后,《神鬼寓言3》所能提供的选择少得可怜。还好《神鬼寓言3》有一些时尚和幽默。有些台词写得还是非常有意思的。虽然这样的台词对塑造人物没有任何意义,但是会让你一直一直笑。阿尔比昂仍然是个美丽的地方,而且角色模型、动画和环境光跟随构造除了这个美丽的世界。从墓地的薄雾到荒漠的午后阳光,这样一个充满生机的地方非常令人赞叹。配音做的很专业。任务繁多,讲述了一个宏大的故事,而且还有很多娱乐元素,不过所有这些都相互孤立,让本该成为一款佳作的《神鬼寓言3》就这样沦为了一款毫不出彩的作品。战斗过程令人兴奋,但是缺乏打击感。由于战斗过程的乏味,所以战斗无法使玩家的注意力从游戏剧情和肤浅的人物问题上转移开。格斗武器,枪支和魔法本质上来说都是一样的。基本上都是闪躲和攻击,这样的方法可以用来对付所有遇到的敌人。说实话,这样的战斗系统着实令人失望。武器升级所带来的外观和属性上的改变让我切实感受到了游戏的进程,不过由于战斗系统的失败,这样的设定也让人感到无聊。整体感觉6.5分:漂亮的界面,不过与PC平台毫不相称。杂乱无章的故事与游戏节奏画面表现7.5分:一些很棒的动画和特效,还有多彩和谐的艺术风格声效表现8.0分:令人振奋的音乐,配音表演相当完美上手体验6.0分:有很多可以做的内容,但只有很小部分令人兴奋。单调的战斗,重复的任务,只有很小部分值得情感投入耐玩性5.5分:你或许从合作游戏和大量的收集感到愉悦,但这也无法更改核心游戏体验总评6.0分:还行吧 +科技 摩托罗拉:GPON在FTTH中比EPON更有优势作 
者:鲁义轩2009年,在国内光进铜退的火热趋势下,摩托罗拉携其在国际市场上已经获得丰富运营经验的GPON解决方案,大举进入中国的光通信市场。对于这一个时间点的选择,摩托罗拉宽带及移动网络事业部网络接入解决方案部全球营销与传播总监FloydWagoner的解释是:中国利用GPON推进光线到户的时机正在趋于成熟,而摩托罗拉在国际上的GPON研发和运营经验,可以更好地提升国内运营商推进FTTH的效率。GPON的国际性优势在亚洲地区,推进光线到户的多种技术中,EPON一直是非常强大并且主流的技术。而在亚洲以外的国际很多地区,运营商都开始越来越多地关注GPON,今年GPON预计占到全球光纤到户市场的40%。在FloydWagoner看来,EPON虽然仍然强大,而GPON的实力在显著加强。在带宽方面,GPON比EPON上下行带宽都加强了至少一倍。因为EPON利用率相对于GPON要低一些,在相同的用户部署、相同终端情况下,统计数据表明EPON支持上、下行29Mbit/s的带宽,而GPON可以达到下行79Mbit/s上行37Mbit/s的实际带宽,从根本上提升了对数据业务的支持。在服务的质量保证(QoS)上,目前EPON的业务主要是数据业务,而运营商要推广三网融合等复杂的业务,服务质量保证要求会更高。在这方面,GPON有了更好的机制来保证多业务服务质量的实现。此外,在部署的方便性上,光线路中的光功率意味着传输距离的长短。EPON的速率是24dB,而GPON是28dB,在相同的条件下,GPON的传输距离更远。运营商可以把ONT布置在更远的位置,节省线路的成本,将来可以覆盖更多、更远的终端单元。综合比较,无论在技术方面还是在业务保障方面以及在材料方面,GPON到现在为止所体现的趋势更加地优于EPON。而且GPON的成本价格已经下降很多,得到越来越多的运营商的青睐。目前国内中国电信、中国联通以及中国移动都已经表示过把GPON作为下一步光网络发展的优选。创新性的GPONONT和OLT据FloydWagoner介绍,凭借在全球FTTH领域积累的经验,摩托罗拉开发了创新产品,以满足服务供应商提供更低密度的OLT、满足更高密度的 MDU环境以及具集成功能的室内ONT等方面的需求。创新性的GPONONT和OLT,可以将光纤延伸至服务供应商网络的边缘,从而保证用户在任何地方都能享用端到端的超宽带服务。同时,摩托罗拉的FTTH网元管理系统AXSvision,还能简化网管界面,并帮助运营商加速新型、丰富的个性化娱乐业务推出速度。 diff --git a/tests/data_for_tests/io/WeiboSenti100k/dev.txt b/tests/data_for_tests/io/WeiboSenti100k/dev.txt new file mode 100755 index 00000000..fdca0212 --- /dev/null +++ b/tests/data_for_tests/io/WeiboSenti100k/dev.txt @@ -0,0 +1,7 @@ +label text +1 多谢小莲,好运满满[爱你] +1 能在他乡遇老友真不赖,哈哈,珠儿,我也要用这个拼图软件!BTW,小飞人儿终于要飞回家啦,深圳行,谢谢每位工作人员的照顾![爱你] +0 [衰]补鞋的说鞋子是进口的,质量太好,刀子都切不进去!所以说大家以后别买进口,到时补都没的补![爱你] +0 第五季都没看了[泪]要补起来 +1 美图好诗![鼓掌] //@言家楼:回复@俺叫老鬼:【七律。感时】 叶随风舞身何处, 鸟逆风行觅树梢。 岁月风来无退路, 激流风助有波涛。 寒微风动曾言志, 富贵风骚似不牢。 雪竹风梅诗未尽, 休云风雨剪春刀。//鸢肩格:藏珠“风”。 +0 没敢问,她男朋友在旁边呢。。[泪]//@好饭换坏饭: 你问问她能不能调成静音模式 diff --git a/tests/data_for_tests/io/WeiboSenti100k/test.txt b/tests/data_for_tests/io/WeiboSenti100k/test.txt new file mode 100755 index 00000000..3d071fb2 --- /dev/null +++ b/tests/data_for_tests/io/WeiboSenti100k/test.txt @@ -0,0 +1,8 @@ +label text +1 钟爱大粉的亲们,这一茬我们又种大粉了,座果也不错,能吃上了[嘻嘻] +0 //@北京全攻略: 我擦。。。牛逼~果断收藏[衰] +1 
都有我都有我~~~我的2012注定是美美的精彩的不得了啊~哈哈哈[太开心]//@哆啦胖兔梦: 转发微博。 +1 这周的成果就是这样 刻的好累但是很喜欢[嘻嘻]#我的橡皮章# +1 你把我整?了。[抓狂] //@窦智耀:开 往大稿艺术区店开 带上祝贺的花篮。。。昨夜 杨家火锅 你把我灌醉。。。今夜 我要学会排队等位。再贺开业大吉![鼓掌][鼓掌][鼓掌] +1 [爱你]亲们,我刚刚发表了一篇文章,有图有真相,速来围观![围观]||#蚂蜂窝游记#《新疆,雨中的野核桃沟》,查看更多精彩>>> http://t.cn/zR4BMN3 (分享自 @蚂蜂窝旅游攻略) +0 [泪]//@平安北京: 珍爱生命,小心驾驶,驾车时请勿接打电话! diff --git a/tests/data_for_tests/io/WeiboSenti100k/train.txt b/tests/data_for_tests/io/WeiboSenti100k/train.txt new file mode 100755 index 00000000..4f0adf27 --- /dev/null +++ b/tests/data_for_tests/io/WeiboSenti100k/train.txt @@ -0,0 +1,7 @@ +label text +1 //@实用小百科:这才是吃货本色[哈哈] +0 回复@邋遢大王诗文:好的[ok] //@邋遢大王诗文:回复@静冈叔叔:[ok]木有问题!回来了和我联系 //@静冈叔叔:回复@西瓜叫高荔蜒啊:在富士山静冈机场有很多小丸子的土产啊[嘻嘻] //@西瓜叫高荔蜒啊:祝你一路顺风~ 想要小丸子的お土?~[泪] +1 我花了两年最后被抢的只剩下一枚,情何以堪! //@自由橙的小窝:@程诗然 同学集卡速度最快,我花了两年时间才集全 //@怯弱的狮子Susan: 回复@阮导:@墙墙-墙根俱乐部 看你多抢手!快给我们各发一套吧![嘻嘻] //@阮导:回复@怯弱的狮子Susan:所以。。。。你要给我找一套撒。。哈哈哈哈哈!!! +1 KIMSCLOSET的年会,海鲜自助餐,太丰盛了!大家吃的HIGH,喝的HIGH,聊的HIGH!太开心了![哈哈][爱你] +1 在iPhone的便携鱼眼镜头之下,扣肉蝴蝶饱子显得多诱人呀![围观][馋嘴][嘻嘻] +0 英织,你知道不知道,他是我最最最爱的大叔,你跟他靠这么近,我的心都碎了!!!你说你说你说,你有没有他的签名![泪] diff --git a/tests/data_for_tests/io/XNLI/dev.txt b/tests/data_for_tests/io/XNLI/dev.txt new file mode 100755 index 00000000..eced8fac --- /dev/null +++ b/tests/data_for_tests/io/XNLI/dev.txt @@ -0,0 +1,7 @@ +language gold_label sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 promptID pairID genre label1 label2 label3 label4 label5 sentence1_tokenized sentence2_tokenized match +zh neutral 他说,妈妈,我回来了。 校车把他放下后,他立即给他妈妈打了电话。 1 1 facetoface neutral contradiction neutral neutral neutral 他 说 , 妈妈 , 我 回来 了 。 校车 把 他 放下 后 , 他 立即 给 他 妈妈 打 了 电话 。 True +zh contradiction 他说,妈妈,我回来了。 他没说一句话。 1 2 facetoface contradiction contradiction contradiction contradiction contradiction 他 说 , 妈妈 , 我 回来 了 。 他 没 说 一 句 话 。 True +zh entailment 他说,妈妈,我回来了。 他告诉他的妈妈他已经回到家了。 1 3 facetoface entailment entailment neutral entailment entailment 他 说 , 妈妈 , 我 回来 了 。 他 告诉 他 的 妈妈 他 已经 回到家 了 。 True +zh neutral 
他们停止了跟这家交朋友,因为他们决定了当白人。 种族紧张局势开始时,他们不再探望这家人。 13 39 facetoface neutral entailment entailment entailment entailment 他们 停止 了 跟 这家 交朋友 , 因为 他们 决定 了 当 白人 。 种族 紧张 局势 开始 时 , 他们 不再 探望 这家 人 。 False +zh contradiction 老太太以前常说她姐姐和姐丈是如何决定要搬到奥古斯塔城里去,并且被当做白人看待。 奶奶的妹妹是白人,搬到了德克萨斯州。 17 49 facetoface contradiction contradiction contradiction contradiction neutral 老太太 以前 常 说 她 姐姐 和 姐丈 是 如何 决定 要 搬 到 奥古斯塔 城里 去 , 并且 被 当做 白人 看待 。 奶奶 的 妹妹 是 白人 , 搬 到 了 德克萨斯州 。 True +zh entailment 老太太以前常说她姐姐和姐丈是如何决定要搬到奥古斯塔城里去,并且被当做白人看待。 奶奶的姐姐不是白人。 17 50 facetoface entailment entailment contradiction neutral entailment 老太太 以前 常 说 她 姐姐 和 姐丈 是 如何 决定 要 搬 到 奥古斯塔 城里 去 , 并且 被 当做 白人 看待 。 奶奶 的 姐姐 不 是 白人 。 True diff --git a/tests/data_for_tests/io/XNLI/test.txt b/tests/data_for_tests/io/XNLI/test.txt new file mode 100755 index 00000000..d5ff4c24 --- /dev/null +++ b/tests/data_for_tests/io/XNLI/test.txt @@ -0,0 +1,7 @@ +language gold_label sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 promptID pairID genre label1 label2 label3 label4 label5 sentence1_tokenized sentence2_tokenized match +zh contradiction 嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。 我还没有和他再次谈论。 2 4 facetoface contradiction contradiction contradiction contradiction contradiction 嗯 , 我 根本 没 想 过 , 但是 我 很 沮丧 , 最后 我 又 和 他 说话 了 。 我 还 没有 和 他 再次 谈论 。 True +zh entailment 嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。 我非常沮丧,我刚刚开始跟他说话。 2 5 facetoface entailment entailment entailment entailment entailment 嗯 , 我 根本 没 想 过 , 但是 我 很 沮丧 , 最后 我 又 和 他 说话 了 。 我 非常 沮丧 , 我 刚刚 开始 跟 他 说话 。 True +zh neutral 嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。 我们谈得很好。 2 6 facetoface neutral neutral neutral neutral neutral 嗯 , 我 根本 没 想 过 , 但是 我 很 沮丧 , 最后 我 又 和 他 说话 了 。 我们 谈 得 很 好 。 True +zh neutral 而我当初认为这是一个特权,我现在仍然这样想,我是唯一的922 Ex-O,也是我的AFFC空军职业生涯。 我不知道那天我不是唯一一个在场的人。 3 7 facetoface neutral contradiction contradiction contradiction contradiction 而 我 当初 认为 这 是 一个 特权 , 我 现在 仍然 这样 想 , 我 是 唯一 的 922 Ex-O , 也 是 我 的 AFFC 空军 职业生涯 。 我 不 知道 那天 我 不 是 唯一 一个 在场 的 人 。 False +zh contradiction 
而我当初认为这是一个特权,我现在仍然这样想,我是唯一的922 Ex-O,也是我的AFFC空军职业生涯。 我们都被赋予了相同的确切数字,无论我们被许诺了何种特权,都是谎言。 3 9 facetoface contradiction contradiction entailment contradiction contradiction 而 我 当初 认为 这 是 一个 特权 , 我 现在 仍然 这样 想 , 我 是 唯一 的 922 Ex-O , 也 是 我 的 AFFC 空军 职业生涯 。 我们 都 被 赋予 了 相同 的 确切 数字 , 无论 我们 被 许诺 了 何种 特权 , 都 是 谎言 。 True +zh entailment 这是Fannie Flono,她在佐治亚州奥古斯塔长大,她会讲述她童年时的一些故事。 Fannie Flono就在这里,她将与我们分享她在奥古斯塔成长的童年故事。 12 35 facetoface entailment entailment entailment entailment entailment 这 是 Fannie Flono , 她 在 佐治亚州 奥古斯塔 长大 , 她 会讲 述 她 童年 时 的 一些 故事 。 Fannie Flono 就 在 这里 , 她 将 与 我们 分享 她 在 奥古斯塔 成 长 的 童年 故事 。 True diff --git a/tests/data_for_tests/io/XNLI/train.txt b/tests/data_for_tests/io/XNLI/train.txt new file mode 100755 index 00000000..8a2fd3a3 --- /dev/null +++ b/tests/data_for_tests/io/XNLI/train.txt @@ -0,0 +1,9 @@ +premise hypo label +我们 家里 有 一个 但 我 没 找到 我 可以 用 的 时间 我们 家里 有 一个 但 我 从来 没有 时间 使用 它 . entailment +该镇 仍然 充满 雕塑家 , piazza alberica 是 一个 夏季 雕塑 比赛 的 现场 14 天 来 制作 一个 杰作 . 几乎 所有 的 雕塑家 都 离开 了 piazza alberica 为 其他 城市 . contradictory +土耳其 的 面包车 是 自己 坐 下 来 的 , 但 他们 喜欢 玩和呃 , 他们 喜欢 和 他们 一起 玩 , 他们 把 他们 的 社会 从 它 . neutral +好 吗 ? 我 问 benignantly , 因为 她 犹豫 了 . 我 抓住 她 的 胳膊 和 她 愤怒地 , 问 , 好 吗 ? contradictory +一 段 时间 来 看 , 这 一 运动 似乎 要 取得 成功 , 但 政治 事件 , 加 上 帕内尔 在 一个 令 人 愤慨 的 离婚案 中 被 称为 共同 答辩人 , 导致 许多 人 撤回 他们 的 支持 . 帕内尔 在 一个 令 人 愤慨 的 离婚 问题 上 的 法律 问题 使 这 场 运动 受到 了 影响 . entailment +看 在 这里 , 他 说 我们 不 希望 任何 律师 混在 这 一 点 . 他 说 看看 那 张 纸 neutral +Soderstrom 在 创伤 中心 进行 了 多次 筛选 测试 . 测试 必须 在 创伤 中心 进行 比较 , 否则 就 会 无效 . neutral +嗯 , 这 是 一 种 明显 的 我 的 意思 是 , 他们 甚至 把 它 带 到 现在 呢 , 他们 在 电视 上 做 广告 , 你 知道 如果 你 知道 , 如果 你 知道 这样 做 , 或者 如果 你 需要 这 个呃 , 我们 会 告 你 和 你 你 不用 给 我们 钱 , 但 他们 不 告诉 你 的 是 如果 他们 赢 了 你 给 他们 至少 三分之一 他们 赢 的 东西 , 所以 我 不 知道 它 是呃 , 它 得到 了 现在 做 更 多 的 生意 , 而 不 是呃 实际上 是 在 处理 犯罪 而 不 是 与 呃嗯 他们 的 律师 只 是 为了 钱 , 我 相信 , 我 知道 我 同意 你 , 我 认为 你 是 真实 的 你. 
非常 正确 的 是 , 我 认为 他们 应该 有 同等 数量 的 你 知道 也许 他们 可以 有 几 个 , 但 我 认为 大多数 他们 应该 不 是 律师 在 事实 , 这 是 方式 他们 已经 进入 政治 , 这 是 因为 在 法律 上 , 你 知道 的 循环 和 一切 , 但 我 不 知道 我们 是 在 马里兰州 和呃 , 我们 有 同样 的 东西 人满为患 , 和呃 他们 让 他们 出来 我 的 意思 是 只 是 普通 的 监狱 判决 的 事情 , 他们 让. 他们 是 因为 他们 没有 任何 地方 可以 留住 他们 所以 你 可以 知道呃 , 除非 是 一个 重大 的 罪行 , 但呃 , 即使 是 小小的 东西 , 我 的 意思 是 那些 在 美国 失去 的 人 是 受害者 和 谁 可能 是 抢劫 或 毒品 , 或者 其他 什么 , 他们 是 谁 要 支付 , 他们 是 一个 会 受苦 , 另 一个 你 知道 的 人 , 如果 他们 被 逮捕 , 如果 他们 逮捕 他们嗯 , 然后 呢 , 你 知道 的 时间 法律 接管 了 一 半 时间 呃 他们 要么 让 他们 走 , 或者 他们 下 了 一个 句子 , 因为 他们 有 一个 律师 , 你 知道 的 感觉 他们 是 不 是 所有 在 一起 当 他们 做到 了 .它 我 不 知道 我们 怎么 到 这 一 点 , 虽然 . neutral diff --git a/tests/data_for_tests/io/ag/test.csv b/tests/data_for_tests/io/ag/test.csv new file mode 100755 index 00000000..3a4cc0ae --- /dev/null +++ b/tests/data_for_tests/io/ag/test.csv @@ -0,0 +1,5 @@ +"3","Fears for T N pension after talks","Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul." +"4","The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com)","SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket." +"4","Ky. Company Wins Grant to Study Peptides (AP)","AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins." +"4","Prediction Unit Helps Forecast Wildfires (AP)","AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and flames will roar." +"4","Calif. 
Aims to Limit Farm-Related Smog (AP)","AP - Southern California's smog-fighting agency went after emissions of the bovine variety Friday, adopting the nation's first rules to reduce air pollution from dairy cow manure." diff --git a/tests/data_for_tests/io/ag/train.csv b/tests/data_for_tests/io/ag/train.csv new file mode 100755 index 00000000..e766a481 --- /dev/null +++ b/tests/data_for_tests/io/ag/train.csv @@ -0,0 +1,4 @@ +"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again." +"4","Building Dedicated to Columbia Astronauts (AP)","AP - A former dormitory converted to classrooms at the Pensacola Naval Air Station was dedicated Friday to two Columbia astronauts who were among the seven who died in the shuttle disaster Feb. 1, 2003." +"2","Phelps On Relay Team","Michael Phelps is named to the 4x100-meter freestyle relay team that will compete in Sunday's final, keeping alive his quest for a possible eight Olympic gold medals." +"1","Venezuelans Vote Early in Referendum on Chavez Rule (Reuters)","Reuters - Venezuelans turned out early\and in large numbers on Sunday to vote in a historic referendum\that will either remove left-wing President Hugo Chavez from\office or give him a new mandate to govern for the next two\years." 
diff --git a/tests/data_for_tests/io/cmrc/dev.json b/tests/data_for_tests/io/cmrc/dev.json new file mode 100755 index 00000000..c9069efe --- /dev/null +++ b/tests/data_for_tests/io/cmrc/dev.json @@ -0,0 +1,155 @@ +{ + "version": "v1.0", + "data": [ + { + "paragraphs": [ + { + "id": "DEV_0", + "context": "《战国无双3》()是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴,分别是以武田信玄等人为主的《关东三国志》,织田信长等人为主的《战国三杰》,石田三成等人为主的《关原的年轻武者》,丰富游戏内的剧情。此部份专门介绍角色,欲知武器情报、奥义字或擅长攻击类型等,请至战国无双系列1.由于乡里大辅先生因故去世,不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图(不含村雨城),后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多,部分地图会有兼用的状况,战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主,以下是相关介绍。(注:前方加☆者为猛将传新增关卡及地图。)合并本篇和猛将传的内容,村雨城模式剔除,战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品", + "qas": [ + { + "question": "《战国无双3》是由哪两个公司合作开发的?", + "id": "DEV_0_QUERY_0", + "answers": [ + { + "text": "光荣和ω-force", + "answer_start": 11 + }, + { + "text": "光荣和ω-force", + "answer_start": 11 + }, + { + "text": "光荣和ω-force", + "answer_start": 11 + } + ] + }, + { + "question": "男女主角亦有专属声优这一模式是由谁改编的?", + "id": "DEV_0_QUERY_1", + "answers": [ + { + "text": "村雨城", + "answer_start": 226 + }, + { + "text": "村雨城", + "answer_start": 226 + }, + { + "text": "任天堂游戏谜之村雨城", + "answer_start": 219 + } + ] + }, + { + "question": "战国史模式主打哪两个模式?", + "id": "DEV_0_QUERY_2", + "answers": [ + { + "text": "「战史演武」&「争霸演武」", + "answer_start": 395 + }, + { + "text": "「战史演武」&「争霸演武」", + "answer_start": 395 + }, + { + "text": "「战史演武」&「争霸演武」", + "answer_start": 395 + } + ] + } + ] + } + ], + "id": "DEV_0", + "title": "战国无双3" + }, + { + "paragraphs": [ + { + "id": "DEV_1", + "context": 
"锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:", + "qas": [ + { + "question": "锣鼓经是什么?", + "id": "DEV_1_QUERY_0", + "answers": [ + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + }, + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + }, + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + } + ] + }, + { + "question": "锣鼓经常用的节奏型称为什么?", + "id": "DEV_1_QUERY_1", + "answers": [ + { + "text": "锣鼓点", + "answer_start": 67 + }, + { + "text": "锣鼓点", + "answer_start": 67 + }, + { + "text": "锣鼓点", + "answer_start": 67 + } + ] + }, + { + "question": "锣鼓经运用的程式是什么?", + "id": "DEV_1_QUERY_2", + "answers": [ + { + "text": "依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。", + "answer_start": 167 + }, + { + "text": "依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。", + "answer_start": 167 + }, + { + "text": "依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点", + "answer_start": 167 + } + ] + }, + { + "question": "戏曲锣鼓所运用的敲击乐器主要有什么类型?", + "id": "DEV_1_QUERY_3", + "answers": [ + { + "text": "鼓、锣、钹和板", + "answer_start": 237 + }, + { + "text": "鼓、锣、钹和板", + "answer_start": 237 + }, + { + "text": "鼓、锣、钹和板", + "answer_start": 237 + } + ] + } + ] + } + ], + "id": "DEV_1", + "title": "锣鼓经" + } + ] +} \ No newline at end of file diff --git a/tests/data_for_tests/io/cmrc/train.json b/tests/data_for_tests/io/cmrc/train.json new file mode 100755 index 00000000..823b9c80 --- /dev/null +++ b/tests/data_for_tests/io/cmrc/train.json @@ -0,0 +1,161 @@ +{ + "version": "v1.0", + "data": [ + { + "paragraphs": [ + { + "id": "TRAIN_186", + "context": 
"范廷颂枢机(,),圣名保禄·若瑟(),是越南罗马天主教枢机。1963年被任为主教;1990年被擢升为天主教河内总教区宗座署理;1994年被擢升为总主教,同年年底被擢升为枢机;2009年2月离世。范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生;童年时接受良好教育后,被一位越南神父带到河内继续其学业。范廷颂于1940年在河内大修道院完成神学学业。范廷颂于1949年6月6日在河内的主教座堂晋铎;及后被派到圣女小德兰孤儿院服务。1950年代,范廷颂在河内堂区创建移民接待中心以收容到河内避战的难民。1954年,法越战争结束,越南民主共和国建都河内,当时很多天主教神职人员逃至越南的南方,但范廷颂仍然留在河内。翌年管理圣若望小修院;惟在1960年因捍卫修院的自由、自治及拒绝政府在修院设政治课的要求而被捕。1963年4月5日,教宗任命范廷颂为天主教北宁教区主教,同年8月15日就任;其牧铭为「我信天主的爱」。由于范廷颂被越南政府软禁差不多30年,因此他无法到所属堂区进行牧灵工作而专注研读等工作。范廷颂除了面对战争、贫困、被当局迫害天主教会等问题外,也秘密恢复修院、创建女修会团体等。1990年,教宗若望保禄二世在同年6月18日擢升范廷颂为天主教河内总教区宗座署理以填补该教区总主教的空缺。1994年3月23日,范廷颂被教宗若望保禄二世擢升为天主教河内总教区总主教并兼天主教谅山教区宗座署理;同年11月26日,若望保禄二世擢升范廷颂为枢机。范廷颂在1995年至2001年期间出任天主教越南主教团主席。2003年4月26日,教宗若望保禄二世任命天主教谅山教区兼天主教高平教区吴光杰主教为天主教河内总教区署理主教;及至2005年2月19日,范廷颂因获批辞去总主教职务而荣休;吴光杰同日真除天主教河内总教区总主教职务。范廷颂于2009年2月22日清晨在河内离世,享年89岁;其葬礼于同月26日上午在天主教河内总教区总主教座堂举行。", + "qas": [ + { + "question": "范廷颂是什么时候被任为主教的?", + "id": "TRAIN_186_QUERY_0", + "answers": [ + { + "text": "1963年", + "answer_start": 30 + } + ] + }, + { + "question": "1990年,范廷颂担任什么职务?", + "id": "TRAIN_186_QUERY_1", + "answers": [ + { + "text": "1990年被擢升为天主教河内总教区宗座署理", + "answer_start": 41 + } + ] + }, + { + "question": "范廷颂是于何时何地出生的?", + "id": "TRAIN_186_QUERY_2", + "answers": [ + { + "text": "范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生", + "answer_start": 97 + } + ] + }, + { + "question": "1994年3月,范廷颂担任什么职务?", + "id": "TRAIN_186_QUERY_3", + "answers": [ + { + "text": "1994年3月23日,范廷颂被教宗若望保禄二世擢升为天主教河内总教区总主教并兼天主教谅山教区宗座署理", + "answer_start": 548 + } + ] + }, + { + "question": "范廷颂是何时去世的?", + "id": "TRAIN_186_QUERY_4", + "answers": [ + { + "text": "范廷颂于2009年2月22日清晨在河内离世", + "answer_start": 759 + } + ] + } + ] + } + ], + "id": "TRAIN_186", + "title": "范廷颂" + }, + { + "paragraphs": [ + { + "id": "TRAIN_54", + "context": "安雅·罗素法(,),来自俄罗斯圣彼得堡的模特儿。她是《全美超级模特儿新秀大赛》第十季的亚军。2008年,安雅宣布改回出生时的名字:安雅·罗素法(Anya Rozova),在此之前是使用安雅·冈()。安雅于俄罗斯出生,后来被一个居住在美国夏威夷群岛欧胡岛檀香山的家庭领养。安雅十七岁时曾参与香奈儿、路易·威登及芬迪(Fendi)等品牌的非正式时装秀。2007年,她于瓦伊帕胡高级中学毕业。毕业后,她当了一名售货员。她曾为Russell Tanoue拍摄照片,Russell 
Tanoue称赞她是「有前途的新面孔」。安雅在半准决赛面试时说她对模特儿行业充满热诚,所以参加全美超级模特儿新秀大赛。她于比赛中表现出色,曾五次首名入围,平均入围顺序更拿下历届以来最优异的成绩(2.64),另外胜出三次小挑战,分别获得与评判尼祖·百克拍照、为柠檬味道的七喜拍摄广告的机会及十万美元、和盖马蒂洛(Gai Mattiolo)设计的晚装。在最后两强中,安雅与另一名参赛者惠妮·汤姆森为范思哲走秀,但评判认为她在台上不够惠妮突出,所以选了惠妮当冠军,安雅屈居亚军(但就整体表现来说,部份网友认为安雅才是第十季名副其实的冠军。)安雅在比赛拿五次第一,也胜出多次小挑战。安雅赛后再次与Russell Tanoue合作,为2008年4月30日出版的MidWeek杂志拍摄封面及内页照。其后她参加了V杂志与Supreme模特儿公司合办的模特儿选拔赛2008。她其后更与Elite签约。最近她与香港的模特儿公司 Style International Management 签约,并在香港发展其模特儿事业。她曾在很多香港的时装杂志中任模特儿,《Jet》、《东方日报》、《Elle》等。", + "qas": [ + { + "question": "安雅·罗素法参加了什么比赛获得了亚军?", + "id": "TRAIN_54_QUERY_0", + "answers": [ + { + "text": "《全美超级模特儿新秀大赛》第十季", + "answer_start": 26 + } + ] + }, + { + "question": "Russell Tanoue对安雅·罗素法的评价是什么?", + "id": "TRAIN_54_QUERY_1", + "answers": [ + { + "text": "有前途的新面孔", + "answer_start": 247 + } + ] + }, + { + "question": "安雅·罗素法合作过的香港杂志有哪些?", + "id": "TRAIN_54_QUERY_2", + "answers": [ + { + "text": "《Jet》、《东方日报》、《Elle》等", + "answer_start": 706 + } + ] + }, + { + "question": "毕业后的安雅·罗素法职业是什么?", + "id": "TRAIN_54_QUERY_3", + "answers": [ + { + "text": "售货员", + "answer_start": 202 + } + ] + } + ] + } + ], + "id": "TRAIN_54", + "title": "安雅·罗素法" + }, + { + "paragraphs": [ + { + "id": "TRAIN_756", + "context": "为日本漫画足球小将翼的一个角色,自小父母离异,与父亲一起四处为家,每个地方也是待一会便离开,但他仍然能够保持优秀的学业成绩。在第一次南葛市生活时,与同样就读于南葛小学的大空翼为黄金拍档,曾效力球队包括南葛小学、南葛高中、日本少年队、日本青年军、日本奥运队。效力日本青年军期间,因救同母异父的妹妹导致被车撞至断脚,在决赛周只在决赛的下半场十五分钟开始上场,成为日本队夺得世青冠军的其中一名功臣。基本资料绰号:球场上的艺术家出身地:日本南葛市诞生日:5月5日星座:金牛座球衣号码:11担任位置:中场、攻击中场、右中场擅长脚:右脚所属队伍:盘田山叶故事发展岬太郎在小学期间不断转换学校,在南葛小学就读时在全国大赛中夺得冠军;国中三年随父亲孤单地在法国留学;回国后三年的高中生涯一直输给日本王牌射手日向小次郎率领的东邦学院。在【Golden 23】年代,大空翼、日向小次郎等名将均转战海外,他与松山光、三杉淳组成了「3M」组合(松山光Hikaru Matsuyama、岬太郎Taro Misaki、三杉淳Jyun Misugi)。必杀技1. 回力刀射门2. S. S. S. 射门3. 
双人射门(与大空翼合作)", + "qas": [ + { + "question": "岬太郎在第一次南葛市生活时的搭档是谁?", + "id": "TRAIN_756_QUERY_0", + "answers": [ + { + "text": "大空翼", + "answer_start": 84 + } + ] + }, + { + "question": "日本队夺得世青冠军,岬太郎发挥了什么作用?", + "id": "TRAIN_756_QUERY_1", + "answers": [ + { + "text": "在决赛周只在决赛的下半场十五分钟开始上场,成为日本队夺得世青冠军的其中一名功臣。", + "answer_start": 156 + } + ] + }, + { + "question": "岬太郎与谁一起组成了「3M」组合?", + "id": "TRAIN_756_QUERY_2", + "answers": [ + { + "text": "他与松山光、三杉淳组成了「3M」组合(松山光Hikaru Matsuyama、岬太郎Taro Misaki、三杉淳Jyun Misugi)。", + "answer_start": 391 + } + ] + } + ] + } + ], + "id": "TRAIN_756", + "title": "岬太郎" + } + ] +} \ No newline at end of file diff --git a/tests/data_for_tests/io/cnndm/dev.label.jsonl b/tests/data_for_tests/io/cnndm/dev.label.jsonl new file mode 100755 index 00000000..52a56ab0 --- /dev/null +++ b/tests/data_for_tests/io/cnndm/dev.label.jsonl @@ -0,0 +1,4 @@ +{"label": [1, 19, 25], "text": ["marseille , france -lrb- cnn -rrb- the french prosecutor leading an investigation into the crash of germanwings flight 9525 insisted wednesday that he was not aware of any video footage from on board the plane .", "marseille prosecutor brice robin told cnn that `` so far no videos were used in the crash investigation . ''", "he added , `` a person who has such a video needs to immediately give it to the investigators . 
''", "robin 's comments follow claims by two magazines , german daily bild and french paris match , of a cell phone video showing the harrowing final seconds from on board germanwings flight 9525 as it crashed into the french alps .", "all 150 on board were killed .", "paris match and bild reported that the video was recovered from a phone at the wreckage site .", "the two publications described the supposed video , but did not post it on their websites .", "the publications said that they watched the video , which was found by a source close to the investigation .", "`` one can hear cries of ` my god ' in several languages , '' paris match reported .", "`` metallic banging can also be heard more than three times , perhaps of the pilot trying to open the cockpit door with a heavy object .", "towards the end , after a heavy shake , stronger than the others , the screaming intensifies .", "then nothing . ''", "`` it is a very disturbing scene , '' said julian reichelt , editor-in-chief of bild online .", "an official with france 's accident investigation agency , the bea , said the agency is not aware of any such video .", "lt. col. jean-marc menichini , a french gendarmerie spokesman in charge of communications on rescue efforts around the germanwings crash site , told cnn that the reports were `` completely wrong '' and `` unwarranted . ''", "cell phones have been collected at the site , he said , but that they `` had n't been exploited yet . ''", "menichini said he believed the cell phones would need to be sent to the criminal research institute in rosny sous-bois , near paris , in order to be analyzed by specialized technicians working hand-in-hand with investigators .", "but none of the cell phones found so far have been sent to the institute , menichini said .", "asked whether staff involved in the search could have leaked a memory card to the media , menichini answered with a categorical `` no . 
''", "reichelt told `` erin burnett : outfront '' that he had watched the video and stood by the report , saying bild and paris match are `` very confident '' that the clip is real .", "he noted that investigators only revealed they 'd recovered cell phones from the crash site after bild and paris match published their reports .", "`` that is something we did not know before .", "... overall we can say many things of the investigation were n't revealed by the investigation at the beginning , '' he said .", "what was mental state of germanwings co-pilot ?", "german airline lufthansa confirmed tuesday that co-pilot andreas lubitz had battled depression years before he took the controls of germanwings flight 9525 , which he 's accused of deliberately crashing last week in the french alps .", "lubitz told his lufthansa flight training school in 2009 that he had a `` previous episode of severe depression , '' the airline said tuesday .", "email correspondence between lubitz and the school discovered in an internal investigation , lufthansa said , included medical documents he submitted in connection with resuming his flight training .", "the announcement indicates that lufthansa , the parent company of germanwings , knew of lubitz 's battle with depression , allowed him to continue training and ultimately put him in the cockpit .", "lufthansa , whose ceo carsten spohr previously said lubitz was 100 % fit to fly , described its statement tuesday as a `` swift and seamless clarification '' and said it was sharing the information and documents -- including training and medical records -- with public prosecutors .", "spohr traveled to the crash site wednesday , where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside .", "he saw the crisis center set up in seyne-les-alpes , laid a wreath in the village of le vernet , closer to the crash site , where grieving families have left flowers at a 
simple stone memorial .", "menichini told cnn late tuesday that no visible human remains were left at the site but recovery teams would keep searching .", "french president francois hollande , speaking tuesday , said that it should be possible to identify all the victims using dna analysis by the end of the week , sooner than authorities had previously suggested .", "in the meantime , the recovery of the victims ' personal belongings will start wednesday , menichini said .", "among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board .", "check out the latest from our correspondents .", "the details about lubitz 's correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and lubitz 's possible motive for downing the jet .", "a lufthansa spokesperson told cnn on tuesday that lubitz had a valid medical certificate , had passed all his examinations and `` held all the licenses required . 
''", "earlier , a spokesman for the prosecutor 's office in dusseldorf , christoph kumpa , said medical records reveal lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot 's license .", "kumpa emphasized there 's no evidence suggesting lubitz was suicidal or acting aggressively before the crash .", "investigators are looking into whether lubitz feared his medical condition would cause him to lose his pilot 's license , a european government official briefed on the investigation told cnn on tuesday .", "while flying was `` a big part of his life , '' the source said , it 's only one theory being considered .", "another source , a law enforcement official briefed on the investigation , also told cnn that authorities believe the primary motive for lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems .", "lubitz 's girlfriend told investigators he had seen an eye doctor and a neuropsychologist , both of whom deemed him unfit to work recently and concluded he had psychological issues , the european government official said .", "but no matter what details emerge about his previous mental health struggles , there 's more to the story , said brian russell , a forensic psychologist .", "`` psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they were n't going to keep doing their job and they 're upset about that and so they 're suicidal , '' he said .", "`` but there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person 's problems . 
''", "germanwings crash compensation : what we know .", "who was the captain of germanwings flight 9525 ?", "cnn 's margot haddad reported from marseille and pamela brown from dusseldorf , while laura smith-spark wrote from london .", "cnn 's frederik pleitgen , pamela boykoff , antonia mortensen , sandrine amiel and anna-maja rappard contributed to this report ."], "summary": ["marseille prosecutor says `` so far no videos were used in the crash investigation '' despite media reports .", "journalists at bild and paris match are `` very confident '' the video clip is real , an editor says .", "andreas lubitz had informed his lufthansa training school of an episode of severe depression , airline says ."], "publication": "cnndm", "compression": 22.283333333333335, "coverage": 0.8666666666666667, "density": 4.6} +{"label": [3, 5, 24], "text": ["-lrb- cnn -rrb- the palestinian authority officially became the 123rd member of the international criminal court on wednesday , a step that gives the court jurisdiction over alleged crimes in palestinian territories .", "the formal accession was marked with a ceremony at the hague , in the netherlands , where the court is based .", "the palestinians signed the icc 's founding rome statute in january , when they also accepted its jurisdiction over alleged crimes committed `` in the occupied palestinian territory , including east jerusalem , since june 13 , 2014 . 
''", "later that month , the icc opened a preliminary examination into the situation in palestinian territories , paving the way for possible war crimes investigations against israelis .", "as members of the court , palestinians may be subject to counter-charges as well .", "israel and the united states , neither of which is an icc member , opposed the palestinians ' efforts to join the body .", "but palestinian foreign minister riad al-malki , speaking at wednesday 's ceremony , said it was a move toward greater justice .", "`` as palestine formally becomes a state party to the rome statute today , the world is also a step closer to ending a long era of impunity and injustice , '' he said , according to an icc news release .", "`` indeed , today brings us closer to our shared goals of justice and peace . ''", "judge kuniko ozaki , a vice president of the icc , said acceding to the treaty was just the first step for the palestinians .", "`` as the rome statute today enters into force for the state of palestine , palestine acquires all the rights as well as responsibilities that come with being a state party to the statute .", "these are substantive commitments , which can not be taken lightly , '' she said .", "rights group human rights watch welcomed the development .", "`` governments seeking to penalize palestine for joining the icc should immediately end their pressure , and countries that support universal acceptance of the court 's treaty should speak out to welcome its membership , '' said balkees jarrah , international justice counsel for the group .", "`` what 's objectionable is the attempts to undermine international justice , not palestine 's decision to join a treaty to which over 100 countries around the world are members . 
''", "in january , when the preliminary icc examination was opened , israeli prime minister benjamin netanyahu described it as an outrage , saying the court was overstepping its boundaries .", "the united states also said it `` strongly '' disagreed with the court 's decision .", "`` as we have said repeatedly , we do not believe that palestine is a state and therefore we do not believe that it is eligible to join the icc , '' the state department said in a statement .", "it urged the warring sides to resolve their differences through direct negotiations .", "`` we will continue to oppose actions against israel at the icc as counterproductive to the cause of peace , '' it said .", "but the icc begs to differ with the definition of a state for its purposes and refers to the territories as `` palestine . ''", "while a preliminary examination is not a formal investigation , it allows the court to review evidence and determine whether to investigate suspects on both sides .", "prosecutor fatou bensouda said her office would `` conduct its analysis in full independence and impartiality . 
''", "the war between israel and hamas militants in gaza last summer left more than 2,000 people dead .", "the inquiry will include alleged war crimes committed since june .", "the international criminal court was set up in 2002 to prosecute genocide , crimes against humanity and war crimes .", "cnn 's vasco cotovio , kareem khadder and faith karimi contributed to this report ."], "summary": ["membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june .", "israel and the united states opposed the move , which could open the door to war crimes investigations against israelis ."], "publication": "cnndm", "compression": 17.57894736842105, "coverage": 0.8947368421052632, "density": 3.1052631578947367} +{"label": [0, 6], "text": ["-lrb- cnn -rrb- governments around the world are using the threat of terrorism -- real or perceived -- to advance executions , amnesty international alleges in its annual report on the death penalty .", "`` the dark trend of governments using the death penalty in a futile attempt to tackle real or imaginary threats to state security and public safety was stark last year , '' said salil shetty , amnesty 's secretary general in a release .", "`` it is shameful that so many states around the world are essentially playing with people 's lives -- putting people to death for ` terrorism ' or to quell internal instability on the ill-conceived premise of deterrence . 
''", "the report , `` death sentences and executions 2014 , '' cites the example of pakistan lifting a six-year moratorium on the execution of civilians following the horrific attack on a school in peshawar in december .", "china is also mentioned , as having used the death penalty as a tool in its `` strike hard '' campaign against terrorism in the restive far-western province of xinjiang .", "the annual report catalogs the use of state-sanctioned killing as a punitive measure across the globe , and this year 's edition contains some mixed findings .", "on one hand , the number of executions worldwide has gone down by almost 22 % on the previous year .", "at least 607 people were executed around the world in 2014 , compared to 778 in 2013 .", "amnesty 's figures do not include statistics on executions carried out in china , where information on the practice is regarded as a state secret .", "belarus and vietnam , too , do not release data on death penalty cases .", "`` the long-term trend is definitely positive -- we are seeing a decrease in the number of executions -lrb- worldwide -rrb- , '' audrey gaughran , amnesty 's director of global issues , told cnn .", "`` a number of countries are closer to abolition , and there are some signs that some countries will be abolitionist by 2015 .", "-lrb- there are -rrb- signals of a world that is nearing abolition . ''", "while the report notes some encouraging signs , it also highlights a marked increase in the number of people sentenced to death in 2014 .", "at least 2,466 people globally are confirmed to have been handed the sentence last year , an increase of 28 % compared with 2013 .", "the report notes that the spike in sentencing is attributable to mass-sentencing in countries including egypt and nigeria , `` against scores of people in some cases . 
''", "the organization found `` positive developments '' worldwide , with most regions seeming to show reductions in the number of executions .", "opinion : sharp spike in death sentences .", "sub-saharan africa , for example , saw a 28 % fall in reported cases , and executions recorded in the middle east and north africa were down 23 % compared to 2013 .", "`` even though we 've highlighted some of the negative developments ... i think we would always highlight that there are positive developments , '' gaughran said .", "`` across the board , with the exception of europe and central asia there were fewer reports of executions in every region . ''", "the resumption of the use of capital punishment in belarus -- the only country in europe and central asia to execute people -- after a two year hiatus spoiled an near-universal decrease in countries using the death penalty by region .", "the united states has the dubious distinction of being the only country in the americas to conduct executions , but the number of convicts put to death here fell slightly , from 39 in 2013 to 35 in 2014 .", "the state of washington also imposed a moratorium on executions last year .", "the u.s. remains one of the worst offenders for imposing capital punishment , with only iran -lrb- 289 + -rrb- , iraq -lrb- 61 + -rrb- , and saudi arabia -lrb- 90 + -rrb- executing more people in 2014 .", "while figures are not available , amnesty estimates that china also executes `` thousands '' of prisoners each year , `` more than the rest of the world put together . 
''", "the report also highlights the imperfections in the judiciary processes that lead to many sentenced to death .", "`` in the majority of countries where people were sentenced to death or executed , the death penalty was imposed after proceedings that did not meet international fair trial standards , '' the report stated .", "`` in 2014 amnesty international raised particular concerns in relation to court proceedings in afghanistan , bangladesh , china , egypt , iran , iraq , north korea , pakistan , saudi arabia and sri lanka . ''", "the united nations secretary-general , ban ki-moon , last year stressed the need to move toward abolition of capital punishment .", "`` the taking of life is too irreversible for one human being to inflict it on another , '' he said , in marking world day against death penalty in october .", "`` we must continue to argue strongly that the death penalty is unjust and incompatible with fundamental human rights . ''", "amnesty estimates that at least 19,094 people were believed to be on death row at the end of 2014 ."], "summary": ["amnesty 's annual death penalty report catalogs encouraging signs , but setbacks in numbers of those sentenced to death .", "organization claims that governments around the world are using the threat of terrorism to advance executions .", "the number of executions worldwide has gone down by almost 22 % compared with 2013 , but death sentences up by 28 % ."], "publication": "cnndm", "compression": 14.841269841269842, "coverage": 0.8888888888888888, "density": 5.079365079365079} +{"label": [8, 9, 34], "text": ["-lrb- cnn -rrb- on may 28 , 2014 , some 7,000 people gathered in a stadium in china 's northwestern xinjiang region .", "but they had not come to watch the local football team or any other grand sporting event .", "instead , the authorities paraded scores of prisoners dressed in orange jumpsuits .", "armed soldiers guarded the exits .", "in the patently unfair , open air trial that followed , 55 
people were found guilty of a range of offenses linked to violent attacks in the region and jailed .", "three were sentenced to death .", "the public mass sentencing was part a china 's `` strike hard '' campaign against unrest in xinjiang , a campaign the government claims was launched to combat `` terrorism '' and `` separatism . ''", "but it was also indicative of a trend that was starkly evident last year around the world -- governments using the death penalty in a misguided , and often cynical , attempt to tackle crime and terrorism .", "today , amnesty international releases its annual review of the death penalty worldwide .", "much of it makes for grim reading .", "in pakistan , the government lifted a six-year moratorium on the execution of civilians in the wake of the horrific taliban attack on a school in peshawar in december .", "more than 60 people have been put to death since , and the government has threatened to send thousands more death row prisoners to the gallows .", "iran and iraq executed people for `` terrorism , '' and other countries expanded the scope of capital crimes in their penal codes .", "in a year when abhorrent summary executions by armed groups were branded on the global consciousness as never before , governments are themselves resorting to more executions in a knee-jerk reaction to terrorism .", "other countries made use of executions in similarly flawed attempts to address -- or appear to address -- crime rates .", "jordan ended an eight-year moratorium in december , putting 11 murder convicts to death , with the government saying it was a move to end a surge in violent crime .", "in indonesia , authorities announced plans to execute mainly drug traffickers to tackle a public safety `` national emergency . 
''", "six people have already been executed this year .", "a sharp spike in death sentences recorded in 2014 -- up more than 500 on the previous year -- can also be attributed to governments using the death penalty as a political tool .", "the rise was largely because of developments in egypt and nigeria , where courts imposed hundreds of death sentences in the context of internal political instability or crime and armed conflict .", "the simple fact is that governments using the death penalty to tackle crime and security threats are deceiving themselves or the public or both .", "there is no evidence that the threat of execution is more of a deterrent to crime than a prison sentence , as united nations and other studies have repeatedly confirmed .", "it is high time that world leaders stop using the death penalty as an easy way out when times get tough .", "at amnesty international , we have campaigned for an end to the death penalty for decades .", "thankfully , most of the world now appears to agree with us .", "the numbers speak for themselves .", "in 1945 when the united nations was founded , only eight countries had abolished the death penalty .", "today , 140 states are abolitionist in law or practice .", "last year , we recorded executions in 22 countries , down by almost a half from 20 years ago .", "despite the troubling developments we recorded last year , there was still much good news to be found .", "the number of executions recorded around the world dropped significantly in 2014 compared with the previous year , from 778 to 607 .", "this number does not include china , where more people are put to death than the rest of the world put together , but with death penalty statistics treated as a state secret , the true figure is impossible to determine .", "executions were recorded in only three countries in sub-saharan africa -- equatorial guinea , somalia and sudan -- and the number of people put to death went down by more than a quarter .", "the 
americas continued to be execution-free , apart from the united states .", "those governments that still execute need to realize that they are on the wrong side of history .", "they must join the vast majority of countries which have dropped the ultimate cruel punishment .", "fighting for an end to the death penalty remains an uphill task , but all of us must try to make the world free of this punishment .", "with determination , i know that we can achieve this goal ."], "summary": ["amnesty international releases its annual review of the death penalty worldwide ; much of it makes for grim reading .", "salil shetty : countries that use executions to deal with problems are on the wrong side of history ."], "publication": "cnndm", "compression": 20.85, "coverage": 0.825, "density": 6.375} diff --git a/tests/data_for_tests/io/cnndm/test.label.jsonl b/tests/data_for_tests/io/cnndm/test.label.jsonl new file mode 100755 index 00000000..d74ebd9f --- /dev/null +++ b/tests/data_for_tests/io/cnndm/test.label.jsonl @@ -0,0 +1,4 @@ +{"label": [2, 3], "text": ["-lrb- cnn -rrb- the rev.", "robert h. 
schuller , california televangelist and founder of the television ministry `` hour of power , '' died thursday , according to his family .", "he was 88 years old .", "schuller , also the founder of crystal cathedral megachurch , had been diagnosed with esophageal cancer in august 2013 , a release from `` hour of power '' said .", "`` my father-in-law passed away peacefully early this morning .", "he was a great dad and a great man of god , '' said schuller 's daughter-in-law , donna schuller , in a twitter message .", "schuller 's life followed an almost shakespearean arc .", "he was born in a iowa farmhouse without running water and longed to preach from his earliest days .", "in his autobiography , `` prayer : my soul 's adventure with god , '' he described standing alone by a river and picturing himself delivering sermons to a rapt congregation .", "after attending a hope college and western theological seminary in michigan , he met his wife of more than 60 years , arvella , while preaching at her church -lrb- she was the organist -rrb- .", "with their young family in tow , the schullers caravanned west to california , where he rented a drive-in theater and preached from the roof of the snack bar .", "it was beneath the dignity of christian ministry , some local pastors huffed .", "the `` passion pits '' where teenagers necked was no place for the gospel .", "schuller was undeterred , and he quickly outgrew the drive-in .", "he called the explosive growth of his tiny congregation a `` miracle , '' though his many mainstream critics had other names for it .", "his confident , breezy version of christianity -- too breezy , by some estimations -- drew hordes of seekers and lapsed christians who were put off by the hellfire fulminations of many post-war american preachers .", "schuller sold a softer , gentler message , which borrowed heavily , he acknowledged , from the father of the feel-good gospel , norman vincent peale .", "he preached not to convert or condemn 
people , but to encourage them , a sentiment he called `` possibility thinking . ''", "people loved it .", "`` evangelicalism at its best wants to be innovative and reach people , '' said timothy larsen , a professor of christian thought at wheaton college in illinois .", "`` and schuller was a master at that . ''", "`` what he got right is that the gospel is good news , '' larsen continued .", "`` and he preached an uplifting message about personal transformation and uplift and hope . ''", "some of schuller 's favored phrases , though , struck others as cornpone christianity .", "`` turn your hurt into a halo ? ''", "said randall balmer , a professor of american religious history at dartmouth college , citing one such phrase .", "`` that 's pretty weak tea . ''", "still , balmer gives schuller some credit .", "`` it may be bad theology , but it 's brilliant marketing . ''", "in 1970 , schuller began broadcasting `` hour of power , '' believed to be one of the first , if not the very first , sunday service to be shown regularly on television .", "with his genial smile , priestly robes and gray hair , he looked and talked like a guy who wanted nothing more than to see his flock succeed .", "the show , which ran for decades , reached millions , making schuller a televangelist before the term became tarnished by the sins of his many successors .", "schuller 's crowning achievement , at least architecturally , still stands in orange county , california , though it is now owned by the roman catholic church .", "the crystal cathedral , a great gleaming edifice with 10,000 glass panels , gave worshipers a look at the clouds that house the heavens , while schuller preached in the pulpit below .", "the message was clear to many : the road to the former ran through the latter .", "during the 1980s and 1990s , schuller 's star continued to rise , with presidents stopping by the crystal cathedral -- often during campaigns , it should be said -- and future megachurch pastors 
like rick warren and bill hybels seeking his advice .", "as schuller aged , though , his family was beset by a succession scandal straight from the pages of `` king lear . ''", "he tried to install his only son , bobby jr. , as pastor of crystal cathedral .", "but the preaching styles of father and son were too different for the congregation -- measured at times at 10,000 strong -- to countenance .", "bobby schuller jr. left `` hour of power '' and the pulpit at crystal cathedral after a short time .", "as the family searched for a new successor and tussled over finances , viewers and donations to the church and its television show dropped precipitously .", "crystal cathedral ministries filed for bankruptcy in 2010 , citing debts of more than $ 43 million , according to the associated press .", "schuller 's empire , which once soared as high as his glassy cathedral , had fallen to dust .", "eventually , schuller 's grandson , also named bobby , took over `` hour of power , '' though at a different church .", "in a statement on thursday , the younger schuller recalled standing atop crystal cathedral 's 12-story tower of hope with his grandfather as they surveyed the surrounding landscape .", "`` you could see the whole world from there , '' he said .", "people we 've lost in 2015 .", "cnn 's stella chan reported from los angeles ."], "summary": ["the rev.", "robert schuller , 88 , had been diagnosed with esophageal cancer in 2013 .", "his tv show , `` hour of power , '' was enormously popular in the 1970s and 1980s ."], "publication": "cnndm", "compression": 26.342105263157894, "coverage": 0.8421052631578947, "density": 3.4210526315789473} +{"label": [4, 6], "text": ["-lrb- cnn -rrb- never mind cats having nine lives .", "a stray pooch in washington state has used up at least three of her own after being hit by a car , apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field -- only to survive .", "that 's according to 
washington state university , where the dog -- a friendly white-and-black bully breed mix now named theia -- has been receiving care at the veterinary teaching hospital .", "four days after her apparent death , the dog managed to stagger to a nearby farm , dirt-covered and emaciated , where she was found by a worker who took her to a vet for help .", "she was taken in by moses lake , washington , resident sara mellado .", "`` considering everything that she 's been through , she 's incredibly gentle and loving , '' mellado said , according to wsu news .", "`` she 's a true miracle dog and she deserves a good life . ''", "theia is only one year old but the dog 's brush with death did not leave her unscathed .", "she suffered a dislocated jaw , leg injuries and a caved-in sinus cavity -- and still requires surgery to help her breathe .", "the veterinary hospital 's good samaritan fund committee awarded some money to help pay for the dog 's treatment , but mellado has set up a fundraising page to help meet the remaining cost of the dog 's care .", "she 's also created a facebook page to keep supporters updated .", "donors have already surpassed the $ 10,000 target , inspired by theia 's tale of survival against the odds .", "on the fundraising page , mellado writes , `` she is in desperate need of extensive medical procedures to fix her nasal damage and reset her jaw .", "i agreed to foster her until she finally found a loving home . ''", "she is dedicated to making sure theia gets the medical attention she needs , mellado adds , and wants to `` make sure she gets placed in a family where this will never happen to her again ! 
''", "any additional funds raised will be `` paid forward '' to help other animals .", "theia is not the only animal to apparently rise from the grave in recent weeks .", "a cat in tampa , florida , found seemingly dead after he was hit by a car in january , showed up alive in a neighbor 's yard five days after he was buried by his owner .", "the cat was in bad shape , with maggots covering open wounds on his body and a ruined left eye , but remarkably survived with the help of treatment from the humane society ."], "summary": ["theia , a bully breed mix , was apparently hit by a car , whacked with a hammer and buried in a field .", "`` she 's a true miracle dog and she deserves a good life , '' says sara mellado , who is looking for a home for theia ."], "publication": "cnndm", "compression": 9.150943396226415, "coverage": 0.9433962264150944, "density": 4.7924528301886795} +{"label": [32, 36], "text": ["-lrb- cnn -rrb- if you 've been following the news lately , there are certain things you doubtless know about mohammad javad zarif .", "he is , of course , the iranian foreign minister .", "he has been u.s. secretary of state john kerry 's opposite number in securing a breakthrough in nuclear discussions that could lead to an end to sanctions against iran -- if the details can be worked out in the coming weeks .", "and he received a hero 's welcome as he arrived in iran on a sunny friday morning .", "`` long live zarif , '' crowds chanted as his car rolled slowly down the packed street .", "you may well have read that he is `` polished '' and , unusually for one burdened with such weighty issues , `` jovial . 
''", "an internet search for `` mohammad javad zarif '' and `` jovial '' yields thousands of results .", "he certainly has gone a long way to bring iran in from the cold and allow it to rejoin the international community .", "but there are some facts about zarif that are less well-known .", "here are six : .", "in september 2013 , zarif tweeted `` happy rosh hashanah , '' referring to the jewish new year .", "that prompted christine pelosi , the daughter of house minority leader nancy pelosi , to respond with a tweet of her own : `` thanks .", "the new year would be even sweeter if you would end iran 's holocaust denial , sir . ''", "and , perhaps to her surprise , pelosi got a response .", "`` iran never denied it , '' zarif tweeted back .", "`` the man who was perceived to be denying it is now gone .", "happy new year . ''", "the reference was likely to former iranian president mahmoud ahmadinejad , who had left office the previous month .", "zarif was nominated to be foreign minister by ahmadinejad 's successor , hassan rouhami .", "his foreign ministry notes , perhaps defensively , that `` due to the political and security conditions of the time , he decided to continue his education in the united states . 
''", "that is another way of saying that he was outside the country during the demonstrations against the shah of iran , which began in 1977 , and during the iranian revolution , which drove the shah from power in 1979 .", "zarif left the country in 1977 , received his undergraduate degree from san francisco state university in 1981 , his master 's in international relations from the university of denver in 1984 and his doctorate from the university of denver in 1988 .", "both of his children were born in the united states .", "the website of the iranian foreign ministry , which zarif runs , can not even agree with itself on when he was born .", "the first sentence of his official biography , perhaps in a nod to the powers that be in tehran , says zarif was `` born to a religious traditional family in tehran in 1959 . ''", "later on the same page , however , his date of birth is listed as january 8 , 1960 .", "and the iranian diplomacy website says he was born in in 1961 .", "so he is 54 , 55 or maybe even 56 .", "whichever , he is still considerably younger than his opposite number , kerry , who is 71 .", "the feds investigated him over his alleged role in controlling the alavi foundation , a charitable organization .", "the u.s. justice department said the organization was secretly run on behalf of the iranian government to launder money and get around u.s. sanctions .", "but last year , a settlement in the case , under which the foundation agreed to give a 36-story building in manhattan along with other properties to the u.s. government , did not mention zarif 's name .", "early in the iranian revolution , zarif was among the students who took over the iranian consulate in san francisco .", "the aim , says the website iranian.com -- which cites zarif 's memoirs , titled `` mr. 
ambassador '' -- was to expel from the consulate people who were not sufficiently islamic .", "later , the website says , zarif went to make a similar protest at the iranian mission to the united nations .", "in response , the iranian ambassador to the united nations offered him a job .", "in fact , he has now spent more time with kerry than any other foreign minister in the world .", "and that amount of quality time will only increase as the two men , with help from other foreign ministers as well , try to meet a june 30 deadline for nailing down the details of the agreement they managed to outline this week in switzerland ."], "summary": ["mohammad javad zarif has spent more time with john kerry than any other foreign minister .", "he once participated in a takeover of the iranian consulate in san francisco .", "the iranian foreign minister tweets in english ."], "publication": "cnndm", "compression": 20.85, "coverage": 0.825, "density": 2.825} +{"label": [2], "text": ["-lrb- cnn -rrb- for the first time in eight years , a tv legend returned to doing what he does best .", "contestants told to `` come on down ! 
''", "on the april 1 edition of `` the price is right '' encountered not host drew carey but another familiar face in charge of the proceedings .", "instead , there was bob barker , who hosted the tv game show for 35 years before stepping down in 2007 .", "looking spry at 91 , barker handled the first price-guessing game of the show , the classic `` lucky seven , '' before turning hosting duties over to carey , who finished up .", "despite being away from the show for most of the past eight years , barker did n't seem to miss a beat ."], "summary": ["bob barker returned to host `` the price is right '' on wednesday .", "barker , 91 , had retired as host in 2007 ."], "publication": "cnndm", "compression": 5.346153846153846, "coverage": 0.8076923076923077, "density": 2.5} diff --git a/tests/data_for_tests/io/cnndm/train.cnndm.jsonl b/tests/data_for_tests/io/cnndm/train.cnndm.jsonl new file mode 100755 index 00000000..97719a61 --- /dev/null +++ b/tests/data_for_tests/io/cnndm/train.cnndm.jsonl @@ -0,0 +1,10 @@ +{"label": [1, 19, 25], "text": ["marseille , france -lrb- cnn -rrb- the french prosecutor leading an investigation into the crash of germanwings flight 9525 insisted wednesday that he was not aware of any video footage from on board the plane .", "marseille prosecutor brice robin told cnn that `` so far no videos were used in the crash investigation . ''", "he added , `` a person who has such a video needs to immediately give it to the investigators . 
''", "robin 's comments follow claims by two magazines , german daily bild and french paris match , of a cell phone video showing the harrowing final seconds from on board germanwings flight 9525 as it crashed into the french alps .", "all 150 on board were killed .", "paris match and bild reported that the video was recovered from a phone at the wreckage site .", "the two publications described the supposed video , but did not post it on their websites .", "the publications said that they watched the video , which was found by a source close to the investigation .", "`` one can hear cries of ` my god ' in several languages , '' paris match reported .", "`` metallic banging can also be heard more than three times , perhaps of the pilot trying to open the cockpit door with a heavy object .", "towards the end , after a heavy shake , stronger than the others , the screaming intensifies .", "then nothing . ''", "`` it is a very disturbing scene , '' said julian reichelt , editor-in-chief of bild online .", "an official with france 's accident investigation agency , the bea , said the agency is not aware of any such video .", "lt. col. jean-marc menichini , a french gendarmerie spokesman in charge of communications on rescue efforts around the germanwings crash site , told cnn that the reports were `` completely wrong '' and `` unwarranted . ''", "cell phones have been collected at the site , he said , but that they `` had n't been exploited yet . ''", "menichini said he believed the cell phones would need to be sent to the criminal research institute in rosny sous-bois , near paris , in order to be analyzed by specialized technicians working hand-in-hand with investigators .", "but none of the cell phones found so far have been sent to the institute , menichini said .", "asked whether staff involved in the search could have leaked a memory card to the media , menichini answered with a categorical `` no . 
''", "reichelt told `` erin burnett : outfront '' that he had watched the video and stood by the report , saying bild and paris match are `` very confident '' that the clip is real .", "he noted that investigators only revealed they 'd recovered cell phones from the crash site after bild and paris match published their reports .", "`` that is something we did not know before .", "... overall we can say many things of the investigation were n't revealed by the investigation at the beginning , '' he said .", "what was mental state of germanwings co-pilot ?", "german airline lufthansa confirmed tuesday that co-pilot andreas lubitz had battled depression years before he took the controls of germanwings flight 9525 , which he 's accused of deliberately crashing last week in the french alps .", "lubitz told his lufthansa flight training school in 2009 that he had a `` previous episode of severe depression , '' the airline said tuesday .", "email correspondence between lubitz and the school discovered in an internal investigation , lufthansa said , included medical documents he submitted in connection with resuming his flight training .", "the announcement indicates that lufthansa , the parent company of germanwings , knew of lubitz 's battle with depression , allowed him to continue training and ultimately put him in the cockpit .", "lufthansa , whose ceo carsten spohr previously said lubitz was 100 % fit to fly , described its statement tuesday as a `` swift and seamless clarification '' and said it was sharing the information and documents -- including training and medical records -- with public prosecutors .", "spohr traveled to the crash site wednesday , where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside .", "he saw the crisis center set up in seyne-les-alpes , laid a wreath in the village of le vernet , closer to the crash site , where grieving families have left flowers at a 
simple stone memorial .", "menichini told cnn late tuesday that no visible human remains were left at the site but recovery teams would keep searching .", "french president francois hollande , speaking tuesday , said that it should be possible to identify all the victims using dna analysis by the end of the week , sooner than authorities had previously suggested .", "in the meantime , the recovery of the victims ' personal belongings will start wednesday , menichini said .", "among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board .", "check out the latest from our correspondents .", "the details about lubitz 's correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and lubitz 's possible motive for downing the jet .", "a lufthansa spokesperson told cnn on tuesday that lubitz had a valid medical certificate , had passed all his examinations and `` held all the licenses required . 
''", "earlier , a spokesman for the prosecutor 's office in dusseldorf , christoph kumpa , said medical records reveal lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot 's license .", "kumpa emphasized there 's no evidence suggesting lubitz was suicidal or acting aggressively before the crash .", "investigators are looking into whether lubitz feared his medical condition would cause him to lose his pilot 's license , a european government official briefed on the investigation told cnn on tuesday .", "while flying was `` a big part of his life , '' the source said , it 's only one theory being considered .", "another source , a law enforcement official briefed on the investigation , also told cnn that authorities believe the primary motive for lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems .", "lubitz 's girlfriend told investigators he had seen an eye doctor and a neuropsychologist , both of whom deemed him unfit to work recently and concluded he had psychological issues , the european government official said .", "but no matter what details emerge about his previous mental health struggles , there 's more to the story , said brian russell , a forensic psychologist .", "`` psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they were n't going to keep doing their job and they 're upset about that and so they 're suicidal , '' he said .", "`` but there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person 's problems . 
''", "germanwings crash compensation : what we know .", "who was the captain of germanwings flight 9525 ?", "cnn 's margot haddad reported from marseille and pamela brown from dusseldorf , while laura smith-spark wrote from london .", "cnn 's frederik pleitgen , pamela boykoff , antonia mortensen , sandrine amiel and anna-maja rappard contributed to this report ."], "summary": ["marseille prosecutor says `` so far no videos were used in the crash investigation '' despite media reports .", "journalists at bild and paris match are `` very confident '' the video clip is real , an editor says .", "andreas lubitz had informed his lufthansa training school of an episode of severe depression , airline says ."], "publication": "CNN", "compression": 22.283333333333335, "coverage": 0.8666666666666667, "density": 4.6} +{"label": [3, 5, 24], "text": ["-lrb- cnn -rrb- the palestinian authority officially became the 123rd member of the international criminal court on wednesday , a step that gives the court jurisdiction over alleged crimes in palestinian territories .", "the formal accession was marked with a ceremony at the hague , in the netherlands , where the court is based .", "the palestinians signed the icc 's founding rome statute in january , when they also accepted its jurisdiction over alleged crimes committed `` in the occupied palestinian territory , including east jerusalem , since june 13 , 2014 . 
''", "later that month , the icc opened a preliminary examination into the situation in palestinian territories , paving the way for possible war crimes investigations against israelis .", "as members of the court , palestinians may be subject to counter-charges as well .", "israel and the united states , neither of which is an icc member , opposed the palestinians ' efforts to join the body .", "but palestinian foreign minister riad al-malki , speaking at wednesday 's ceremony , said it was a move toward greater justice .", "`` as palestine formally becomes a state party to the rome statute today , the world is also a step closer to ending a long era of impunity and injustice , '' he said , according to an icc news release .", "`` indeed , today brings us closer to our shared goals of justice and peace . ''", "judge kuniko ozaki , a vice president of the icc , said acceding to the treaty was just the first step for the palestinians .", "`` as the rome statute today enters into force for the state of palestine , palestine acquires all the rights as well as responsibilities that come with being a state party to the statute .", "these are substantive commitments , which can not be taken lightly , '' she said .", "rights group human rights watch welcomed the development .", "`` governments seeking to penalize palestine for joining the icc should immediately end their pressure , and countries that support universal acceptance of the court 's treaty should speak out to welcome its membership , '' said balkees jarrah , international justice counsel for the group .", "`` what 's objectionable is the attempts to undermine international justice , not palestine 's decision to join a treaty to which over 100 countries around the world are members . 
''", "in january , when the preliminary icc examination was opened , israeli prime minister benjamin netanyahu described it as an outrage , saying the court was overstepping its boundaries .", "the united states also said it `` strongly '' disagreed with the court 's decision .", "`` as we have said repeatedly , we do not believe that palestine is a state and therefore we do not believe that it is eligible to join the icc , '' the state department said in a statement .", "it urged the warring sides to resolve their differences through direct negotiations .", "`` we will continue to oppose actions against israel at the icc as counterproductive to the cause of peace , '' it said .", "but the icc begs to differ with the definition of a state for its purposes and refers to the territories as `` palestine . ''", "while a preliminary examination is not a formal investigation , it allows the court to review evidence and determine whether to investigate suspects on both sides .", "prosecutor fatou bensouda said her office would `` conduct its analysis in full independence and impartiality . 
''", "the war between israel and hamas militants in gaza last summer left more than 2,000 people dead .", "the inquiry will include alleged war crimes committed since june .", "the international criminal court was set up in 2002 to prosecute genocide , crimes against humanity and war crimes .", "cnn 's vasco cotovio , kareem khadder and faith karimi contributed to this report ."], "summary": ["membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june .", "israel and the united states opposed the move , which could open the door to war crimes investigations against israelis ."], "publication": "CNN", "compression": 17.57894736842105, "coverage": 0.8947368421052632, "density": 3.1052631578947367} +{"label": [0, 6], "text": ["-lrb- cnn -rrb- governments around the world are using the threat of terrorism -- real or perceived -- to advance executions , amnesty international alleges in its annual report on the death penalty .", "`` the dark trend of governments using the death penalty in a futile attempt to tackle real or imaginary threats to state security and public safety was stark last year , '' said salil shetty , amnesty 's secretary general in a release .", "`` it is shameful that so many states around the world are essentially playing with people 's lives -- putting people to death for ` terrorism ' or to quell internal instability on the ill-conceived premise of deterrence . 
''", "the report , `` death sentences and executions 2014 , '' cites the example of pakistan lifting a six-year moratorium on the execution of civilians following the horrific attack on a school in peshawar in december .", "china is also mentioned , as having used the death penalty as a tool in its `` strike hard '' campaign against terrorism in the restive far-western province of xinjiang .", "the annual report catalogs the use of state-sanctioned killing as a punitive measure across the globe , and this year 's edition contains some mixed findings .", "on one hand , the number of executions worldwide has gone down by almost 22 % on the previous year .", "at least 607 people were executed around the world in 2014 , compared to 778 in 2013 .", "amnesty 's figures do not include statistics on executions carried out in china , where information on the practice is regarded as a state secret .", "belarus and vietnam , too , do not release data on death penalty cases .", "`` the long-term trend is definitely positive -- we are seeing a decrease in the number of executions -lrb- worldwide -rrb- , '' audrey gaughran , amnesty 's director of global issues , told cnn .", "`` a number of countries are closer to abolition , and there are some signs that some countries will be abolitionist by 2015 .", "-lrb- there are -rrb- signals of a world that is nearing abolition . ''", "while the report notes some encouraging signs , it also highlights a marked increase in the number of people sentenced to death in 2014 .", "at least 2,466 people globally are confirmed to have been handed the sentence last year , an increase of 28 % compared with 2013 .", "the report notes that the spike in sentencing is attributable to mass-sentencing in countries including egypt and nigeria , `` against scores of people in some cases . 
''", "the organization found `` positive developments '' worldwide , with most regions seeming to show reductions in the number of executions .", "opinion : sharp spike in death sentences .", "sub-saharan africa , for example , saw a 28 % fall in reported cases , and executions recorded in the middle east and north africa were down 23 % compared to 2013 .", "`` even though we 've highlighted some of the negative developments ... i think we would always highlight that there are positive developments , '' gaughran said .", "`` across the board , with the exception of europe and central asia there were fewer reports of executions in every region . ''", "the resumption of the use of capital punishment in belarus -- the only country in europe and central asia to execute people -- after a two year hiatus spoiled an near-universal decrease in countries using the death penalty by region .", "the united states has the dubious distinction of being the only country in the americas to conduct executions , but the number of convicts put to death here fell slightly , from 39 in 2013 to 35 in 2014 .", "the state of washington also imposed a moratorium on executions last year .", "the u.s. remains one of the worst offenders for imposing capital punishment , with only iran -lrb- 289 + -rrb- , iraq -lrb- 61 + -rrb- , and saudi arabia -lrb- 90 + -rrb- executing more people in 2014 .", "while figures are not available , amnesty estimates that china also executes `` thousands '' of prisoners each year , `` more than the rest of the world put together . 
''", "the report also highlights the imperfections in the judiciary processes that lead to many sentenced to death .", "`` in the majority of countries where people were sentenced to death or executed , the death penalty was imposed after proceedings that did not meet international fair trial standards , '' the report stated .", "`` in 2014 amnesty international raised particular concerns in relation to court proceedings in afghanistan , bangladesh , china , egypt , iran , iraq , north korea , pakistan , saudi arabia and sri lanka . ''", "the united nations secretary-general , ban ki-moon , last year stressed the need to move toward abolition of capital punishment .", "`` the taking of life is too irreversible for one human being to inflict it on another , '' he said , in marking world day against death penalty in october .", "`` we must continue to argue strongly that the death penalty is unjust and incompatible with fundamental human rights . ''", "amnesty estimates that at least 19,094 people were believed to be on death row at the end of 2014 ."], "summary": ["amnesty 's annual death penalty report catalogs encouraging signs , but setbacks in numbers of those sentenced to death .", "organization claims that governments around the world are using the threat of terrorism to advance executions .", "the number of executions worldwide has gone down by almost 22 % compared with 2013 , but death sentences up by 28 % ."], "publication": "CNN", "compression": 14.841269841269842, "coverage": 0.8888888888888888, "density": 5.079365079365079} +{"label": [8, 9, 34], "text": ["-lrb- cnn -rrb- on may 28 , 2014 , some 7,000 people gathered in a stadium in china 's northwestern xinjiang region .", "but they had not come to watch the local football team or any other grand sporting event .", "instead , the authorities paraded scores of prisoners dressed in orange jumpsuits .", "armed soldiers guarded the exits .", "in the patently unfair , open air trial that followed , 55 people 
were found guilty of a range of offenses linked to violent attacks in the region and jailed .", "three were sentenced to death .", "the public mass sentencing was part a china 's `` strike hard '' campaign against unrest in xinjiang , a campaign the government claims was launched to combat `` terrorism '' and `` separatism . ''", "but it was also indicative of a trend that was starkly evident last year around the world -- governments using the death penalty in a misguided , and often cynical , attempt to tackle crime and terrorism .", "today , amnesty international releases its annual review of the death penalty worldwide .", "much of it makes for grim reading .", "in pakistan , the government lifted a six-year moratorium on the execution of civilians in the wake of the horrific taliban attack on a school in peshawar in december .", "more than 60 people have been put to death since , and the government has threatened to send thousands more death row prisoners to the gallows .", "iran and iraq executed people for `` terrorism , '' and other countries expanded the scope of capital crimes in their penal codes .", "in a year when abhorrent summary executions by armed groups were branded on the global consciousness as never before , governments are themselves resorting to more executions in a knee-jerk reaction to terrorism .", "other countries made use of executions in similarly flawed attempts to address -- or appear to address -- crime rates .", "jordan ended an eight-year moratorium in december , putting 11 murder convicts to death , with the government saying it was a move to end a surge in violent crime .", "in indonesia , authorities announced plans to execute mainly drug traffickers to tackle a public safety `` national emergency . 
''", "six people have already been executed this year .", "a sharp spike in death sentences recorded in 2014 -- up more than 500 on the previous year -- can also be attributed to governments using the death penalty as a political tool .", "the rise was largely because of developments in egypt and nigeria , where courts imposed hundreds of death sentences in the context of internal political instability or crime and armed conflict .", "the simple fact is that governments using the death penalty to tackle crime and security threats are deceiving themselves or the public or both .", "there is no evidence that the threat of execution is more of a deterrent to crime than a prison sentence , as united nations and other studies have repeatedly confirmed .", "it is high time that world leaders stop using the death penalty as an easy way out when times get tough .", "at amnesty international , we have campaigned for an end to the death penalty for decades .", "thankfully , most of the world now appears to agree with us .", "the numbers speak for themselves .", "in 1945 when the united nations was founded , only eight countries had abolished the death penalty .", "today , 140 states are abolitionist in law or practice .", "last year , we recorded executions in 22 countries , down by almost a half from 20 years ago .", "despite the troubling developments we recorded last year , there was still much good news to be found .", "the number of executions recorded around the world dropped significantly in 2014 compared with the previous year , from 778 to 607 .", "this number does not include china , where more people are put to death than the rest of the world put together , but with death penalty statistics treated as a state secret , the true figure is impossible to determine .", "executions were recorded in only three countries in sub-saharan africa -- equatorial guinea , somalia and sudan -- and the number of people put to death went down by more than a quarter .", "the 
americas continued to be execution-free , apart from the united states .", "those governments that still execute need to realize that they are on the wrong side of history .", "they must join the vast majority of countries which have dropped the ultimate cruel punishment .", "fighting for an end to the death penalty remains an uphill task , but all of us must try to make the world free of this punishment .", "with determination , i know that we can achieve this goal ."], "summary": ["amnesty international releases its annual review of the death penalty worldwide ; much of it makes for grim reading .", "salil shetty : countries that use executions to deal with problems are on the wrong side of history ."], "publication": "CNN", "compression": 20.85, "coverage": 0.825, "density": 6.375} +{"label": [2, 3], "text": ["-lrb- cnn -rrb- seventy years ago , anne frank died of typhus in a nazi concentration camp at the age of 15 .", "just two weeks after her supposed death on march 31 , 1945 , the bergen-belsen concentration camp where she had been imprisoned was liberated -- timing that showed how close the jewish diarist had been to surviving the holocaust .", "but new research released by the anne frank house shows that anne and her older sister , margot frank , died at least a month earlier than previously thought .", "researchers re-examined archives of the red cross , the international training service and the bergen-belsen memorial , along with testimonies of survivors .", "they concluded that anne and margot probably did not survive to march 1945 -- contradicting the date of death which had previously been determined by dutch authorities .", "in 1944 , anne and seven others hiding in the amsterdam secret annex were arrested and sent to the auschwitz-birkenau concentration camp .", "anne frank 's final entry .", "that same year , anne and margot were separated from their mother and sent away to work as slave labor at the bergen-belsen camp in germany .", "days at the 
camp were filled with terror and dread , witnesses said .", "the sisters stayed in a section of the overcrowded camp with no lighting , little water and no latrine .", "they slept on lice-ridden straw and violent storms shredded the tents , according to the researchers .", "like the other prisoners , the sisters endured long hours at roll call .", "her classmate , nannette blitz , recalled seeing anne there in december 1944 : `` she was no more than a skeleton by then .", "she was wrapped in a blanket ; she could n't bear to wear her clothes anymore because they were crawling with lice . ''", "listen to anne frank 's friends describe her concentration camp experience .", "as the russians advanced further , the bergen-belsen concentration camp became even more crowded , bringing more disease .", "a deadly typhus outbreak caused thousands to die each day .", "typhus is an infectious disease caused by lice that breaks out in places with poor hygiene .", "the disease causes high fever , chills and skin eruptions .", "`` because of the lice infesting the bedstraw and her clothes , anne was exposed to the main carrier of epidemic typhus for an extended period , '' museum researchers wrote .", "they concluded that it 's unlikely the sisters survived until march , because witnesses at the camp said the sisters both had symptoms before february 7 .", "`` most deaths caused by typhus occur around twelve days after the first symptoms appear , '' wrote authors erika prins and gertjan broek .", "the exact dates of death for anne and margot remain unclear .", "margot died before anne .", "`` anne never gave up hope , '' said blitz , her friend .", "`` she was absolutely convinced she would survive . 
''", "her diary endures as one of the world 's most popular books .", "read more about anne frank 's cousin , a keeper of her legacy ."], "summary": ["museum : anne frank died earlier than previously believed .", "researchers re-examined archives and testimonies of survivors .", "anne and older sister margot frank are believed to have died in february 1945 ."], "publication": "CNN", "compression": 14.864864864864865, "coverage": 0.8378378378378378, "density": 2.189189189189189} +{"label": [1, 2, 10, 14, 19], "text": ["it is a week which has seen him in deep water - both on and off the pitch .", "just days after dallas cowboys ' greg hardy was suspended from 10 nfl games he appeared to get into trouble when he drove his luxury car through flash floods in dallas , getting stuck when the car could not make it through the rising , fast flowing waters .", "the 25-year-old was forced to abandon his bentley , leaving it stranded until the waters receded and the car could be towed away .", "it took the tow truck several hours to successfully remove the car and hardy was later seen returning to the vehicle to collect some of his possessions .", "he left in another luxury car , a white ferrari .", "scroll down for video .", "greg hardy found himself in more deep water when he was forced to abandon his bentley in flash floods .", "the problem with his car comes as more bad news for hardy who was suspended by the nfl just days ago after an incident of domestic abuse that allegedly occurred last year .", "hardy , who signed with the dallas cowboys last month , will be forced to sit out the first 10 games of the season and will not receive his salary for these games .", "last year hardy , 25 , was convicted by a judge in charlotte , north carolina of beating , strangling and threatening to kill his ex-girlfriend , nicki holder .", "those charges were later dropped on an appeal when holder could not be located to testify .", "a two month investigation by the nfl followed and 
officials decided he had to be suspended .", "hardy was informed in a letter from nfl commissioner roger goodell that the probe determined there was ` sufficient credible evidence that hardy engaged in conduct that violated nfl policies in multiple respects . '", "hardy was dropped by his previous team , the carolina panthers , because of these charges last season , but was still able to collect his salary during that time , which was roughly $ 770,000 a week .", "hardy previously played for the carolina panthers but was dropped after allegations of domestic abuse emerged and was then signed by dallas cowboys and suspended for 10 games by the nfl .", "hardy is seen talking to officials after his bentley got stuck in flash floods in dallas this week . '", "i understand that i need to step away from football right now and take care of this legal matter , ' hardy said in a statement after he was cut from the panthers .", "the panthers had originally agreed to wait to take action until hardy had a jury trial regarding the incident in may .", "his previous conviction was the result of a bench trial .", "a jury trial ultimately led to all charges being dropped .", "holder told police that hardy choked her , slammed her against a bathtub , threw her to the floor and threatened to kill her after a fight at his charlotte condo .", "the dallas cowboys star was seen attempting to drive his bentley during the floods , but had to abandon it .", "it took officials and a tow truck several hours to pull the luxury bentley free from dallas flood waters .", "this all came at a time when the league was under heavy scrutiny in the wake of two abuse scandals involving stars ray rice and adrian peterson .", "many were upset with the punishments those two received , feeling the nfl was too lenient .", "video of rice punching then-fianc\u00e9e janay palmer went public last monday , and peterson was indicted on charges of reckless or negligent injury to a child on friday for an incident in 
which he hit his son with a switch back in may .", "hardy -lrb- above -rrb- was convicted by a judge last july of beating , strangling and threatening to kill ex-girlfriend nicki holder .", "the nfl announced that hardy would be suspended without pay for 10 games at the start of the 2015 season .", "holder -lrb- above with hardy -rrb- told police that he choked her , slammed her against a bathtub , threw her to the floor and threatened to kill her after a fight at his condo .", "rice was definitely suspended from the nfl and had his contract terminated by the baltimore ravens , while peterson , who was sidelined by the minnesota vikings last sunday , has now been suspended by the team .", "both men are expected by many to return to play in the 2015 , with peterson back on the vikings after an nfl decision and rice winning a wrongful termination suit during the off-season .", "rice even pocketed roughly $ 1.6 million in back pay ."], "summary": ["hardy was convicted of domestic abuse against ex-girlfriend nicki holder and was suspended from the dallas cowboys for 10 days by the nfl .", "charges were eventually dropped after holder could not be located when hardy 's lawyers appealed the decision and asked for a jury trial .", "this week he got stuck in his bentley in deep flash flood waters in dallas .", "hardy was forced to abandon his car and it was towed away hours later ."], "publication": "DailyMail", "compression": 9.845238095238095, "coverage": 0.9047619047619048, "density": 2.3333333333333335} +{"label": [1, 2], "text": ["an hiv self-testing kit is on sale for the first time in the uk .", "the 99.7 per cent accurate biosure hiv self test enables people to test themselves when and where they like .", "an estimated 26,000 people in the uk have hiv but are unaware of it and may be transmitting the disease to others .", "the 99.7 per cent accurate biosure hiv self test enables people to test themselves when and where they like .", "the testing kit , on sale 
online , uses a small amount of blood from a finger-prick sample to detect the presence of hiv antibodies , giving a result in just 15 minutes .", "treatments available mean hiv is now a manageable disease -- but late diagnosis can have a devastating impact on health and life expectancy .", "the national aids trust warns that 40 per cent of those living with hiv remain undiagnosed for at least four years , with those diagnosed late 11 times more likely to die in the first year after diagnosis .", "the testing kit , on sale online , uses a small amount of blood from a finger-prick sample to detect the presence of hiv antibodies , giving a result in just 15 minutes .", "biosure founder brigette bard said it is a significant step towards normalising hiv testing , adding : ` knowing your hiv status is critical and the launch of this product will empower people to discreetly test themselves when it is convenient to them and in a place where they feel comfortable . '", "positive test results need to be confirmed by a healthcare professional and those in high-risk groups are recommended to be tested every three months .", "the only alternative currently available is ` home sampling ' , which involves collecting a blood sample 160 times larger than that for the self-test and posting it to a laboratory , with results given five days later .", "biosure founder brigette bard said it is a significant step towards normalising hiv testing ."], "summary": ["the 99.7 per cent accurate biosure hiv self test enables people to test themselves when and where they like .", "an estimated 26,000 people in the uk have hiv but are unaware of it .", "treatments available mean hiv is now a manageable disease ."], "publication": "DailyMail", "compression": 7.468085106382978, "coverage": 0.9574468085106383, "density": 14.446808510638299} +{"label": [4, 10, 15], "text": ["everyone knows the tortoise beat the hare , but this little fellow has gone one better and beaten two cheetahs .", "these 
pictures capture the amazing moment when one of the notoriously slow-moving reptiles escaped becoming big cat fast food by retreating into its shell before scuttling off across desert sands .", "the baffled cheetahs surrounded the tortoise and attempted to scare it out of its shell with snarls but the reptile kept well tucked up inside its tough exterior forcing the big cats to wander off in search of another snack .", "hard target : the tortoise attempts a quick getaway under the watchful eye of one of the curious cheetahs .", "confused : the two cheetahs exchange glances as they move in to size up their potential meal .", "the intriguing scene was captured by john mullineux , a chemical engineer from secunda , south africa .", "he said : ` while driving on the sandy tracks of the kalahari desert in south africa , i came across two cheetahs lying in the shade near the road .", "` shortly after i stopped , they got up and slowly headed to the dunes .", "` halfway up the red sandy dune the younger one stopped to inspect a tortoise , the older one also stopped and tried to bite the shell but could n't manage it .", "now you see me : the tortoise retreats into its shell as the big cats get too close for comfort .", "snarl : one of the cheetahs gets up close and personal to the little reptile and tries to scare it out of its shell .", "` by the time the older cheetah had made it to the top of the dune , the younger one decided to run off and follow rather than spend more time at the hard meal .", "` the tortoise then casually moved on as if nothing unusual had happened .", "from a young age i have loved cheetahs for their elegance and speed - seeing two so close was dream but seeing them size up their lunch was unique .", "` it was something that was both exciting and naturally beautiful at the same time . 
'", "slow and steady : the tortoise continues his escape across the sands of the kalahari desert in south africa .", "john mullineux , a chemical engineer from secunda , south africa , spotted the scene while driving along a desert track .", "one of the cheetahs appears to admit defeat and wander off throwing a last glance of its shoulder at the lucky tortoise ."], "summary": ["amazing scene captured on film in south africa 's kalahari desert .", "two of the big cats approach the little reptile as it scuttled across the sands .", "but they were denied their meal and forced to wander off disappointed ."], "publication": "DailyMail", "compression": 10.209302325581396, "coverage": 0.7674418604651163, "density": 1.4651162790697674} +{"label": [4, 9, 33], "text": ["angus hawley 's brother has spoken of his shock after his brother , the ex-husband of antonia kidman , died of a suspected heart attack , age 46 , in new york on saturday .", "speaking to daily mail australia on monday , david hawley said : ` it 's a real shock , he was one of the fittest men i 've ever met -- he 's swimming everyday . '", "responding to a question about whether angus had a history of heart problems , david answered : ` no , no , not that we know of ' , adding : ` he 's so fit , i do n't understand . '", "scroll down for video .", "` he did n't have heart problems ' angus hawley 's brother reveals shock after ex-husband of antonia kidman dies from a suspected heart attack in new york after ` returning from a swim ' .", "angus and antonia pictured together in 2005 at the chuan spa opening in the langham hotel .", "mr hawley , who was in new york attending a business conference at the time , collapsed after returning from a swim .", "` he did go for a big swim in the morning , he trains very hard , ' david said of his brother , who he described as a ` bit of a fitness fanatic ' and was known to lead a healthy and active lifestyle . 
'", "i think his body clock was round the wrong way and it just got everything round the wrong way and he 's over done it . '", "mr hawley was a father to four children , lucia , 16 , hamish , 14 , james , 12 , and sybella , eight , all of whom he shared with nicole kidman 's sister antonia before their 2007 split .", "the children are reportedly set to join the family in sydney as they rally around david 's second wife prue fisher , who he married in palm beach in 2011 .", "sad news : antonia kidman 's former husband angus hawley has died of a suspected heart attack aged 46 in new york .", "the pair are seen here in 2003 .", "fitness fanatic : mr hawley 's brother says he does n't ` understand ' the death of his fit and healthy brother , pictured with his wife prue fisher in 2011 .", "led an active lifestyle : mr hawley , 46 , is believed to have suffered a heart attack after returning from a swim .", "the former couple are pictured above with antonia 's parents janelle and the late dr. antony kidman .", "david described his brother , a business development manager at valor private wealth , as ` one of the most beautiful men that i have ever known .", "` he is absolutely adored by everybody , he made everybody feel like he 's their best friend and that 's why everybody loved him .", "and he loved everybody else , it 's just a really emotional time . 
'", "prue is being comforted by her family in sydney , after they traveled from orange in new south wales to be by her side .", "she was reportedly seen at the bondi icebergs public pool , a place her late husband often frequented , on sunday .", "moved on : both antonia and mr hawley remarried following their divorce in 2007 - she to businessman craig marran -lrb- l -rrb- in 2010 , and he to sydney fashion boutique manager prue the following year -lrb- r -rrb- .", "david described prue as ` devastated ' saying she 's ` terrible , terrible ' , adding , ` it 's a huge hole in our lives .", "` they were absolutely devoted to each other and prue 's relationship with angus 's children was fantastic , ' said david of his late brother 's wife .", "` his wife adores him , and he adored her , his four children , it 's just so sad .", "it 's a tragic loss to our family and to his family , it 's just a nightmare .", "` no matter what happens for the rest of her life , she 'll still be my sister-in-law . '", "on saturday another of angus 's brothers phillip released a statement , describing his death as ` sudden ' and ` very unexpected ' to news.com.au .", "wedding day : antonia and angus wed in 1996 , they were together for 11 years before their divorced was finalised in 2007 .", "legacy : the 46-year-old was a father to four children in lucia , 16 , hamish , 14 , james , 12 , and sybella , eight , all of whom he shared with nicole kidman 's sister antonia , pictured .", "` there are no further details at this time as it only occurred last night , our time , ' the statement read .", "reports about his death have as yet been mixed , with news.com.au saying that mr hawley went to dinner with a friend in new york and then went into cardiac arrest .", "he is said to have later passed away in the ambulance on the way to hospital .", "mr hawley 's death comes less than seven months after the sudden passing of nicole and antonia 's father dr. 
antony kidman , who also suffered a suspected heart attack , in singapore .", "family tragedy : mr hawley 's death comes less than seven months after the sudden passing of nicole and antonia 's father dr. antony , who also suffered a heart attack , in singapore .", "both 44-years-old antonia and her ex husband both remarried following their divorce in 2007 - she to businessman craig marran in 2010 , and he to sydney fashion boutique manager prue , the following year .", "he has kept himself largely out of the spotlight following his split from antonia and a battle with depression .", "the father of four checked himself into a sydney rehab clinic in 2007 following a period of mental health issues .", "tragic : antonia 's second husband craig marran accompanied her , her sister nicole and husband keith urban to dr. antony 's funeral in september last year .", "he told woman 's day in 2009 : ' i was depressed , out of control and full of self-loathing , and i resorted to drugs to get through it . '", "i was n't in a happy place and it was an appalling thing , but i was sick , and at least i was big enough to do something about it . '", "merivale hotel founder justin hemmes , has paid tribute to his good friend angus , explaining to the daily telegraph that the pair became friends at just four years old .", "family man : dr. antony kidman was visiting antonia and her family in singapore when he passed away .", "day of mourning : antonia 's six children lucia , hamish , james , sybella , nicholas , two , and alexander , one , attended the funeral along with nicole 's daughters sunday rose and faith .", "support : keith and craig acted as pallbearers at the funeral , as did family friends russell crowe and channel nine newsreader peter overton .", "` he was my next door neighbour but quickly became a best friend , one i was fortunate enough to have by my side ever since , ' he said , describing mr hawley as ` the most caring , thoughtful and loving man . 
'", "` the most loving father to his four wonderful children and adoring wife .", "his family was his treasure .", "his kids were his life , ' he continued .", "mr hawley 's death is the second devastating loss the kidman family has suffered in the past seven months , after dr. antony kidman sadly collapsed and died in a singapore hotel last september at the age of 75 .", "family photo : antonia , janelle , dr. antony and nicole are seen here in 1990 .", "nicole said at his funeral she was ` so lucky ' to be her father 's daughter .", "close knit : nicole and antonia are pictured here with their late father in 1990 .", "a respected sydney psychologist , dr. antony was in the country visiting antonia and his six grandchildren .", "antonia , a journalist and writer , is currently based in singapore with her second husband with whom she shares two sons , nicholas , two , and alexander , one .", "she remembered the close relationship she had with her father at his funeral last year and said they were ` similar in many ways ' .", "new home : antonia resides in singapore with second husband craig .", "she 's pictured here with nicole , who lives in nashville with keith urban , in 2005 .", "` i 'm so lucky to be his daughter , ' 47-year-old nicole said , ` and that he chose my mother to make me with . 
'", "appearing on ellen last october , nicole said husband keith urban had to carry her , sometimes literally , because she was ` so devastated ' by the loss .", "daily mail australia has contacted the kidman family 's management .", "tribute : a good friend of mr hawley , merivale founder justin hemmes has described him as ` the most caring , thoughtful and loving man '"], "summary": ["angus hawley 's brother said his late sibling ` did n't have heart problems ' he is reported to have had a suspected heart attack in new york .", "angus was a father of four children - lucia , hamish , james and sybella .", "he had all four with nicole kidman 's sister antonia before their 2007 split .", "both 44-year-old antonia and angus , 46 , remarried following their divorce .", "angus ' death comes seven months after dr. antony kidman 's death .", "nicole and antonia 's father also died of a heart attack in singapore ."], "publication": "DailyMail", "compression": 15.157407407407407, "coverage": 0.9259259259259259, "density": 3.740740740740741} +{"label": [7, 17], "text": ["despite the hype surrounding its first watch , the iphone is still the engine behind apple 's phenomenal success , its latest figures have revealed .", "the results far surpassed most analysts ' expectations for the first three months of the year , when sales traditionally fall from their holiday-season peak .", "apple sold more than 61 million iphones in the quarter , accounting for more than two-thirds of its $ 58 billion in revenue for the quarter and the lion 's share of its $ 13.6 billion in profit - and up 40 % from a year ago .", "sales of iphones in china were also revealed to have outstripped those in the us .", "apple sold more than 61 million iphones in the quarter , accounting for more than two-thirds of its $ 58 billion in revenue for the quarter and the lion 's share of its $ 13.6 billion in profit .", "$ 58 billion in revenue , $ 13.6 billion in profit .", "$ 200 billion in cash , up from 
around $ 150 billion a year ago .", "more than 61 million iphones sole .", "ipad revenue fell 29 % to $ 5.4 billion .", "revenue from mac computers rose 2 % from a year earlier , to $ 5.6 billion .", "` we are thrilled by the continued strength of iphone , mac and the app store , which drove our best march quarter results ever , ' said tim cook , apple 's ceo .", "` we 're seeing a higher rate of people switching to iphone than we 've experienced in previous cycles , and we 're off to an exciting start to the june quarter with the launch of apple watch . '", "as expected , the numbers were down from the previous quarter , when holiday shoppers snapped up a record 74 million of apple 's new iphone 6 , 6 plus and older models .", "but it was a 40 percent increase over the number of iphones sold in the first three months of 2014 .", "` we 're seeing great results all over the world , ' apple chief financial officer luca maestri told the associated press , adding that iphone sales grew 72 percent in china , where the company has big hopes for expansion .", "other products played a much smaller role .", "revenue from mac computers rose 2 percent from a year earlier , to $ 5.6 billion , while ipad revenue fell 29 percent , to $ 5.4 billion -- continuing a steady decline in tablet sales .", "apple did n't report any results for the new apple watch , which it began selling this month , after the quarter ended .", "maestri said customer response had been ` positive . '", "analysts estimate about 2 million have sold to date , suggesting early demand is healthy but not of blockbuster proportions .", "apple shares have gained more than 50 percent over the last year , making it the world 's most valuable company .", "` it 's been really great to see the reaction of customers , ' said cook .", "` the response has been overwhelming .", "we ca n't wait to see more of the inspiring apps developers dream up . 
'", "the iphone is another story .", "since it began offering models with bigger screens last fall , apple has vied with south korea 's samsung for the no.", "1 position in the global smartphone market .", "by some estimates , apple outsold samsung in the quarter that ended in december , and analysts will be watching closely when samsung reports its latest results this week .", "apple also announced an expansion of its effort to return more of its sizable cash war chest to investors .", "the company said it will raise its quarterly dividend by 11 percent , to 52 cents a share , and has increased a $ 90 billion stock buyback program to $ 140 billion .", "apple did n't report any results for the new apple watch , which it began selling this month , after the quarter ended .", "in total , the company said the program will return $ 200 billion to investors by the end of march 2017 .", "as iphone sales have surged , so has apple 's stock .", "apple shares have gained more than 50 percent over the last year , making it the world 's most valuable company .", "the stock closed monday at $ 132.65 , up 1.8 percent for the day , and was rising in late trading .", "the iphone is n't just apple 's ` dominant product , ' said frank gillett , a tech industry analyst at forrester research .", "` it 's more than anything else what 's driving the success of their company . 
'", "market researchers , however , expect growth in the world smartphone market will slow this year , particularly at the higher price range where apple competes , as most consumers in developed countries have already bought one .", "that could make it difficult for apple to maintain its recent pace .", "` they 're extremely dependent on the iphone , ' said investment colin gillis at bgc partners .", "` at some point , the market dynamics change , ' he said , adding that ` the question is what could replace the iphone ' if sales begin to slow .", "customers looking at apple iphones in an apple store in shanghai , china , on january 14 , 2014 .", "apple ceo tim cook has said he 's optimistic about new markets such as china , where apple has made a strong showing against samsung and china 's xiaomi .", "and even if apple is increasingly selling new iphones to people who are simply upgrading older models , ` that 's still a pretty healthy market , ' said gartner analyst van baker , noting that more than 700 million iphones have been sold since the first model was introduced in 2007 .", "maestri also stressed the potential for new products like apple watch and apple pay , the company 's mobile payment service .", "while these currently provide minimal revenue , analysts say they have big potential .", "and they are designed to work closely with the iphone , which means each may bolster the other 's popularity in the future , gillett said ."], "summary": ["apple sold more than 61 million iphones in the quarter .", "apple did n't report any results for the new apple watch .", "believed around 2 million watches have been sold , according to estimates ."], "publication": "DailyMail", "compression": 28.657894736842106, "coverage": 0.868421052631579, "density": 6.342105263157895} diff --git a/tests/data_for_tests/io/cnndm/vocab b/tests/data_for_tests/io/cnndm/vocab new file mode 100755 index 00000000..26e83ade --- /dev/null +++ b/tests/data_for_tests/io/cnndm/vocab @@ -0,0 
+1,100 @@ +. 12172211 +the 11896296 +, 9609022 +to 5751102 +a 5100569 +and 4892246 +of 4867879 +in 4431149 +'s 2202754 +was 2086001 +for 1995054 +that 1944328 +' 1880335 +on 1858606 +` 1821696 +is 1797908 +he 1678396 +it 1603145 +with 1497568 +said 1348297 +: 1344327 +his 1302056 +at 1260578 +as 1230256 +i 1089458 +by 1064355 +have 1016505 +from 1015625 +has 969042 +her 935151 +be 932950 +'' 904149 +`` 898933 +but 884494 +are 865728 +she 850971 +they 816011 +an 766001 +not 738121 +had 725375 +who 722127 +this 721027 +after 669231 +were 655187 +been 647432 +their 645014 +we 625684 +will 577581 +when 506811 +-rrb- 501827 +n't 499765 +-lrb- 497508 +one 490666 +which 465040 +you 461359 +-- 460450 +up 437177 +more 433177 +out 432343 +about 428037 +would 400420 +- 399113 +or 399001 +there 389590 +people 386121 +new 380970 +also 380041 +all 350670 +two 343787 +can 341110 +him 338345 +do 330166 +into 319067 +last 315857 +so 308507 +than 306701 +just 305759 +time 302071 +police 301341 +could 298919 +told 298384 +over 297568 +if 297292 +what 293759 +years 288999 +first 283683 +no 274488 +my 273829 +year 272392 +them 270715 +its 269566 +now 262011 +before 260991 +mr 250970 +other 247663 +some 245191 +being 243458 +home 229570 +like 229425 +did 227833 diff --git a/tests/data_for_tests/io/conll2003/dev.txt b/tests/data_for_tests/io/conll2003/dev.txt new file mode 100755 index 00000000..90834721 --- /dev/null +++ b/tests/data_for_tests/io/conll2003/dev.txt @@ -0,0 +1,49 @@ +-DOCSTART- -X- -X- O + +CRICKET NNP B-NP O +- : O O +LEICESTERSHIRE NNP B-NP B-ORG +TAKE NNP I-NP O +OVER IN B-PP O +AT NNP B-NP O +TOP NNP I-NP O +AFTER NNP I-NP O +INNINGS NNP I-NP O +VICTORY NN I-NP O +. . 
O O + +LONDON NNP B-NP B-LOC +1996-08-30 CD I-NP O + +Phil NNP B-NP B-PER +Simmons NNP I-NP I-PER +took VBD B-VP O +four CD B-NP O +for IN B-PP O +38 CD B-NP O +on IN B-PP O +Friday NNP B-NP O +as IN B-PP O +Leicestershire NNP B-NP B-ORG +beat VBD B-VP O +Somerset NNP B-NP B-ORG +by IN B-PP O +an DT B-NP O +innings NN I-NP O +and CC O O +39 CD B-NP O +runs NNS I-NP O +in IN B-PP O +two CD B-NP O +days NNS I-NP O +to TO B-VP O +take VB I-VP O +over IN B-PP O +at IN B-PP O +the DT B-NP O +head NN I-NP O +of IN B-PP O +the DT B-NP O +county NN I-NP O +championship NN I-NP O +. . O O diff --git a/tests/data_for_tests/io/conll2003/test.txt b/tests/data_for_tests/io/conll2003/test.txt new file mode 100755 index 00000000..b5b3aef0 --- /dev/null +++ b/tests/data_for_tests/io/conll2003/test.txt @@ -0,0 +1,51 @@ +-DOCSTART- -X- -X- O + +SOCCER NN B-NP O +- : O O +JAPAN NNP B-NP B-LOC +GET VB B-VP O +LUCKY NNP B-NP O +WIN NNP I-NP O +, , O O +THE NP B-NP B-PER +CHINA NNP I-NP I-PER +IN IN B-PP O +SURPRISE DT B-NP O +DEFEAT NN I-NP O +. . O O + +Nadim NNP B-NP B-PER +Ladki NNP I-NP I-PER + +AL-AIN NNP B-NP B-LOC +, , O O +United NNP B-NP B-LOC +Arab NNP I-NP I-LOC +Emirates NNPS I-NP I-LOC +1996-12-06 CD I-NP O + +Japan NNP B-NP B-LOC +began VBD B-VP O +the DT B-NP O +defence NN I-NP O +of IN B-PP O +their PRP$ B-NP O +Asian JJ I-NP B-MISC +Cup NNP I-NP I-MISC +title NN I-NP O +with IN B-PP O +a DT B-NP O +lucky JJ I-NP O +2-1 CD I-NP O +win VBP B-VP O +against IN B-PP O +Syria NNP B-NP B-LOC +in IN B-PP O +a DT B-NP O +Group NNP I-NP O +C NNP I-NP O +championship NN I-NP O +match NN I-NP O +on IN B-PP O +Friday NNP B-NP O +. . 
O O diff --git a/tests/data_for_tests/io/conll2003/train.txt b/tests/data_for_tests/io/conll2003/train.txt new file mode 100755 index 00000000..4f0c4bf2 --- /dev/null +++ b/tests/data_for_tests/io/conll2003/train.txt @@ -0,0 +1,48 @@ +-DOCSTART- -X- -X- O + +EU NNP B-NP B-ORG +rejects VBZ B-VP O +German JJ B-NP B-MISC +call NN I-NP O +to TO B-VP O +boycott VB I-VP O +British JJ B-NP B-MISC +lamb NN I-NP O +. . O O + +Peter NNP B-NP B-PER +Blackburn NNP I-NP I-PER + +BRUSSELS NNP B-NP B-LOC +1996-08-22 CD I-NP O + +The DT B-NP O +European NNP I-NP B-ORG +Commission NNP I-NP I-ORG +said VBD B-VP O +on IN B-PP O +Thursday NNP B-NP O +it PRP B-NP O +disagreed VBD B-VP O +with IN B-PP O +German JJ B-NP B-MISC +advice NN I-NP O +to TO B-PP O +consumers NNS B-NP O +to TO B-VP O +shun VB I-VP O +British JJ B-NP B-MISC +lamb NN I-NP O +until IN B-SBAR O +scientists NNS B-NP O +determine VBP B-VP O +whether IN B-SBAR O +mad JJ B-NP O +cow NN I-NP O +disease NN I-NP O +can MD B-VP O +be VB I-VP O +transmitted VBN I-VP O +to TO B-PP O +sheep NN B-NP O +. . 
O O diff --git a/tests/data_for_tests/io/cws_as/dev.txt b/tests/data_for_tests/io/cws_as/dev.txt new file mode 100755 index 00000000..f4c96e9e --- /dev/null +++ b/tests/data_for_tests/io/cws_as/dev.txt @@ -0,0 +1,6 @@ +時間 : +三月 十日 ( 星期四 ) 上午 十時 。 +並 辦理 加州 大學 退休 等 手續 。 +包括 一九七八年 獲有 數學 諾貝爾 之 稱 的 費爾茲獎 , +在 台大 的 四 年 裡 , +他 語重心長 的 勉勵 同學 們 一 番 話 , diff --git a/tests/data_for_tests/io/cws_as/test.txt b/tests/data_for_tests/io/cws_as/test.txt new file mode 100755 index 00000000..a61009b2 --- /dev/null +++ b/tests/data_for_tests/io/cws_as/test.txt @@ -0,0 +1,6 @@ +許多 社區 長青 學苑 多 開設 有 書法 、 插花 、 土風舞班 , +文山區 長青 學苑 則 有 個 十分 特別 的 「 英文 歌唱班 」 , +成員 年齡 均 超過 六十 歲 , +這 群 白髮蒼蒼 , +爺爺 、 奶奶級 的 學員 唱起 英文 歌 來 字正腔圓 , +有模有樣 。 diff --git a/tests/data_for_tests/io/cws_as/train.txt b/tests/data_for_tests/io/cws_as/train.txt new file mode 100755 index 00000000..b6eab6a3 --- /dev/null +++ b/tests/data_for_tests/io/cws_as/train.txt @@ -0,0 +1,6 @@ +地點 : +學術 活動 中心 一樓 簡報室 。 +主講 : +民族所 所長 莊英章 先生 。 +講題 : +閩 、 台 漢人 社會 研究 的 若干 考察 。 diff --git a/tests/data_for_tests/io/cws_cityu/dev.txt b/tests/data_for_tests/io/cws_cityu/dev.txt new file mode 100755 index 00000000..eac550f2 --- /dev/null +++ b/tests/data_for_tests/io/cws_cityu/dev.txt @@ -0,0 +1,6 @@ +立會 選情 告一段落 民主 進程 還 看 明天 +所謂 「 左 」 的 勢力 , 是 指 以 鄭經翰 、 梁國雄 ( 長毛 ) 為 代表 的 激進 民主 勢力 , 他們 尖銳 批評 中央 和 特區 政府 , 積極 為 基層 勞工 爭取 福利 , 可能 會 為 民主派 與 中央 和解 增加 困難 , 牽制 民主黨 走 中產 溫和 路線 。 +特區 政府 應該 積極 與 民主派 改善 關係 , 尤其 要 爭取 中間 及 「 右 」 翼 的 民主 勢力 , 因為 這些 人 背後 反映 的 是 香港 的 主流 民意 , 除了 民主 步伐 和 涉及 中央 的 敏感 政治 議題 , 他們 和 建制派 的 溫和 力量 沒有 基本 不同 , 很 容易 達成 跨 黨派 的 共識 , 令 特區 政府 處於 不得不 從 的 被動 位置 , 23 條 立法 撤回 、 追究 SARS 責任 等 , 都 是 記憶猶新 的 例子 。 +為 何秀蘭 喝彩 為 香港 人 神傷 +單說 立法會 , 自 91 年 以來 , 經歷 5 次 類似 的 地區 直選 。 +點票 過程 出現 的 笑話 更 多 。 diff --git a/tests/data_for_tests/io/cws_cityu/test.txt b/tests/data_for_tests/io/cws_cityu/test.txt new file mode 100755 index 00000000..aa838fe2 --- /dev/null +++ b/tests/data_for_tests/io/cws_cityu/test.txt @@ -0,0 +1,6 @@ +「 練 得 銅皮鐵骨 」 露宿 早 慣 蚊叮 +本 港 約 有 450 至 600 名 露宿者 , 
其中 近 四分之一 , 即 約 150 人 露宿 在 深水埗 。 +有 外展 社工 稱 , 露宿者 日間 多 到 商場 等 冷氣 場所 避暑 , 流連 至 晚上 11 、 12 時 , 才 用 紙皮 在 公園 外 「 打地鋪 」 , 他們 早已 「 練 得 一 身 銅皮鐵骨 」 , 徹夜 被 蚊 叮 也 習以為常 , 但 社工 在 炎夏 仍 會 頻頻 給 他們 派發 蚊香 。 +基督教 關懷 無家者 協會 的 外展 社工 , 過去 一直 有 探訪 李鄭屋 遊樂場 外 的 露宿者 , 該 會 總幹事 賴淑芬 說 , 該 處 的 露宿者 只 有 數 人 , 且 流動性 很 大 。 +不管 被 多少 蚊 叮 也 沒 什 感覺 +她 指 這些 露宿者 日間 都 會 流連 於 冷氣 場所 , 晚上 才 到 遊樂場 露宿 , 但 礙於 遊樂場 晚上 關門 , 他們 只 可 在 外圍 「 打地鋪 」 。 diff --git a/tests/data_for_tests/io/cws_cityu/train.txt b/tests/data_for_tests/io/cws_cityu/train.txt new file mode 100755 index 00000000..6338621c --- /dev/null +++ b/tests/data_for_tests/io/cws_cityu/train.txt @@ -0,0 +1,6 @@ +立法會 選舉 出現 了 戲劇性 的 結果 , 儘管 投票率 創下 新高 , 而 過去 經驗 顯示 高 投票率 對 民主派 較 有利 , 但 由於 名單 協調 不當 及 配票 策略 失誤 , 加上 醜聞 影響 選情 , 民主黨 的 議席 比 上 一 屆 減少 , 由 第 一 大 黨 跌 至 第 三 ; +而 泛民主派 在 30 席 普選 中 亦 只能 取得 18 席 , 比 選前 預期 的 20 席 少 ; +但 在 功能 組別 選舉 卻 有 意外 收穫 , 除 保住 原有 的 5 個 議席 , 還 搶佔 了 醫學 和 會計 兩 個 專業 界別 , 令 議席 總數 達到 25 席 , 比 上 一 屆 多 了 3 席 。 +更 值得 注意 的 是 , 泛民主派 候選人 在 普選 中 合共 取得 110萬 張 選票 , 佔 178萬 選票 總數 的 62 % , 顯示 多數 市民 認同 早日 實現 全面 普選 的 民主 訴求 , 這 一 點 應 為 政府 及 各 黨派 人士 所 尊重 。 +須 為 2012 全面 普選 創造 條件 +親 建制 陣營 方面 , 民建聯 和 自由黨 都 取得 佳績 , 分別 取得 12 席 和 11 席 , 成為 立法會 內 的 第 一 及 第 二 大 黨 。 diff --git a/tests/data_for_tests/io/cws_msra/dev.txt b/tests/data_for_tests/io/cws_msra/dev.txt new file mode 100755 index 00000000..9c6b34ee --- /dev/null +++ b/tests/data_for_tests/io/cws_msra/dev.txt @@ -0,0 +1,2 @@ +“ 人们 常 说 生活 是 一 部 教科书 , 而 血 与 火 的 战争 更 是 不可多得 的 教科书 , 她 确实 是 名副其实 的 ‘ 我 的 大学 ’ 。 +他 “ 严格要求 自己 , 从 一个 科举 出身 的 进士 成为 一个 伟大 的 民主主义 者 , 进而 成为 一 位 杰出 的 党外 共产主义 战士 , 献身 于 崇高 的 共产主义 事业 。 diff --git a/tests/data_for_tests/io/cws_msra/test.txt b/tests/data_for_tests/io/cws_msra/test.txt new file mode 100755 index 00000000..8d5c6b3c --- /dev/null +++ b/tests/data_for_tests/io/cws_msra/test.txt @@ -0,0 +1,2 @@ +扬帆 远东 做 与 中国 合作 的 先行 +希腊 的 经济 结构 较 特殊 。 diff --git a/tests/data_for_tests/io/cws_msra/train.txt b/tests/data_for_tests/io/cws_msra/train.txt new file mode 100755 index 00000000..35c2cad0 --- 
/dev/null +++ b/tests/data_for_tests/io/cws_msra/train.txt @@ -0,0 +1,3 @@ +“ 心 静 渐 知 春 似 海 , 花 深 每 觉 影 生 香 。 +“ 吃 屎 的 东西 , 连 一 捆 麦 也 铡 不 动 呀 ? +复旦大学 百年 校庆 。 \ No newline at end of file diff --git a/tests/data_for_tests/io/cws_pku/dev.txt b/tests/data_for_tests/io/cws_pku/dev.txt new file mode 100755 index 00000000..df77c5ca --- /dev/null +++ b/tests/data_for_tests/io/cws_pku/dev.txt @@ -0,0 +1,6 @@ +在 十五大 精神 指引 下 胜利 前进 —— 元旦 献辞 +我们 即将 以 丰收 的 喜悦 送 走 牛年 , 以 昂扬 的 斗志 迎来 虎年 。 我们 伟大 祖国 在 新 的 一 年 , 将 是 充满 生机 、 充满 希望 的 一 年 。 +李 鹏 在 北京 考察 企业 +李 鹏 说 : “ 作为 首都 的 电力 工作者 , 你们 为 首都 的 各项 重大 活动 的 顺利 进行 , 为 保障 人民 群众 的 工作 、 生活 和 学习 , 为 促进 首都 经济 的 发展 作出 了 自己 的 贡献 。 明天 就 是 元旦 , 你们 还有 许多 同志 要 坚守 岗位 , 我 向 你们 、 向 全体 电力 工作者 表示 感谢 。 现在 , 我们 的 首都 已经 结束 了 拉 闸 限 电 的 历史 , 希望 依靠 大家 , 使 拉 闸 限 电 的 历史 永远 不再 重演 。 同时 , 也 希望 你们 安全 生产 、 经济 调度 , 实现 经济 增长 方式 的 转变 。 ” 李 鹏 最后 向 电业 职工 , 向 全 北京市 的 人民 拜年 , 向 大家 致以 新春 的 问候 , 祝愿 电力 事业 取得 新 的 成绩 , 祝愿 北京市 在 改革 、 发展 和 稳定 的 各项 工作 中 取得 新 的 成就 。 +( 附 图片 1 张 ) +据 介绍 , 播音员 、 主持人 持证 上岗 工作 , 是 在 1996年 全国 广播 影视 系统 语言 工作 会议 上 提 出来 的 , 它 是 加强 宣传 队伍 建设 , 促进 语言 文字 走向 标准化 、 规范化 的 重要 举措 。 播音员 、 主持人 只有 通过 汉语 普通话 水平 测试 和 政治 、 业务 考核 后 才 能 获得 上岗 资格 证书 。 diff --git a/tests/data_for_tests/io/cws_pku/test.txt b/tests/data_for_tests/io/cws_pku/test.txt new file mode 100755 index 00000000..c7ad3e85 --- /dev/null +++ b/tests/data_for_tests/io/cws_pku/test.txt @@ -0,0 +1,6 @@ +共同 创造 美好 的 新 世纪 —— 二○○一年 新年 贺词 +( 二○○○年 十二月 三十一日 ) ( 附 图片 1 张 ) +女士 们 , 先生 们 , 同志 们 , 朋友 们 : +2001年 新年 钟声 即将 敲响 。 人类 社会 前进 的 航船 就要 驶入 21 世纪 的 新 航程 。 中国 人民 进入 了 向 现代化 建设 第三 步 战略 目标 迈进 的 新 征程 。 +在 这个 激动人心 的 时刻 , 我 很 高兴 通过 中国 国际 广播 电台 、 中央 人民 广播 电台 和 中央 电视台 , 向 全国 各族 人民 , 向 香港 特别 行政区 同胞 、 澳门 特别 行政区 同胞 和 台湾 同胞 、 海外 侨胞 , 向 世界 各国 的 朋友 们 , 致以 新 世纪 第一 个 新年 的 祝贺 ! 
+过去 的 一 年 , 是 我国 社会主义 改革 开放 和 现代化 建设 进程 中 具有 标志 意义 的 一 年 。 在 中国 共产党 的 领导 下 , 全国 各族 人民 团结 奋斗 , 国民经济 继续 保持 较 快 的 发展 势头 , 经济 结构 的 战略性 调整 顺利 部署 实施 。 西部 大 开发 取得 良好 开端 。 精神文明 建设 和 民主 法制 建设 进一步 加强 。 我们 在 过去 几 年 取得 成绩 的 基础 上 , 胜利 完成 了 第九 个 五年计划 。 我国 已 进入 了 全面 建设 小康 社会 , 加快 社会主义 现代化 建设 的 新 的 发展 阶段 。 diff --git a/tests/data_for_tests/io/cws_pku/train.txt b/tests/data_for_tests/io/cws_pku/train.txt new file mode 100755 index 00000000..94ee7c93 --- /dev/null +++ b/tests/data_for_tests/io/cws_pku/train.txt @@ -0,0 +1,9 @@ +迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) +中共中央 总书记 、 国家 主席 江 泽民 +( 一九九七年 十二月 三十一日 ) +12月 31日 , 中共中央 总书记 、 国家 主席 江 泽民 发表 1998年 新年 讲话 《 迈向 充满 希望 的 新 世纪 》 。 ( 新华社 记者 兰 红光 摄 ) +同胞 们 、 朋友 们 、 女士 们 、 先生 们 : +在 1998年 来临 之际 , 我 十分 高兴 地 通过 中央 人民 广播 电台 、 中国 国际 广播 电台 和 中央 电视台 , 向 全国 各族 人民 , 向 香港 特别 行政区 同胞 、 澳门 和 台湾 同胞 、 海外 侨胞 , 向 世界 各国 的 朋友 们 , 致以 诚挚 的 问候 和 良好 的 祝愿 ! +占 比 57.8% > 40% +占 比 57.8% < 40% +占 比 57.8% < < 40% > \ No newline at end of file diff --git a/tests/data_for_tests/io/dbpedia/test.csv b/tests/data_for_tests/io/dbpedia/test.csv new file mode 100755 index 00000000..4e50b3fb --- /dev/null +++ b/tests/data_for_tests/io/dbpedia/test.csv @@ -0,0 +1,5 @@ +1,"TY KU"," TY KU /taɪkuː/ is an American alcoholic beverage company that specializes in sake and other spirits. The privately-held company was founded in 2004 and is headquartered in New York City New York. While based in New York TY KU's beverages are made in Japan through a joint venture with two sake breweries. Since 2011 TY KU's growth has extended its products into all 50 states." +1,"Odd Lot Entertainment"," OddLot Entertainment founded in 2001 by longtime producers Gigi Pritzker and Deborah Del Prete (The Wedding Planner) is a film production and financing company based in Culver City California.OddLot produced the film version of Orson Scott Card's sci-fi novel Ender's Game. 
A film version of this novel had been in the works in one form or another for more than a decade by the time of its release." +1,"Henkel"," Henkel AG & Company KGaA operates worldwide with leading brands and technologies in three business areas: Laundry & Home Care Beauty Care and Adhesive Technologies. Henkel is the name behind some of America’s favorite brands." +1,"GOAT Store"," The GOAT Store (Games Of All Type Store) LLC is one of the largest retro gaming online stores and an Independent Video Game Publishing Label. Additionally they are one of the primary sponsors for Midwest Gaming Classic." +1,"RagWing Aircraft Designs"," RagWing Aircraft Designs (also called the RagWing Aeroplane Company and RagWing Aviation) was an American aircraft design and manufacturing company based in Belton South Carolina." diff --git a/tests/data_for_tests/io/dbpedia/train.csv b/tests/data_for_tests/io/dbpedia/train.csv new file mode 100755 index 00000000..d3698589 --- /dev/null +++ b/tests/data_for_tests/io/dbpedia/train.csv @@ -0,0 +1,14 @@ +1,"Boneau/Bryan-Brown"," Boneau/Bryan-Brown Inc. is a public relations company based in Manhattan New York USA largely supporting Broadway theatre productions as a theatrical press agency.The company was formed by the partnership of Chris Boneau and Adrian Bryan-Brown in 1991. Broadway productions supported include among hundreds the musical Guys and Dolls in 1992. The company initially represented the rock musical Spider-Man: Turn Off the Dark which finally opened on Broadway in 2011." +2,"Dubai Gem Private School & Nursery"," Dubai Gem Private School (DGPS) is a British school located in the Oud Metha area of Dubai United Arab Emirates. Dubai Gem Nursery is located in Jumeirah. Together the institutions enroll almost 1500 students aged 3 to 18." +3,"Shahar Marcus"," Shahar Marcus (born 1971 in Petach Tikva) is an Israeli performance artist." 
+4,"Martin McKinnon"," Martin Marty McKinnon (born 5 July 1975 in Adelaide) is a former Australian rules footballer who played with Adelaide Geelong and the Brisbane Lions in the Australian Football League (AFL).McKinnon was recruited by Adelaide in the 1992 AFL Draft with their first ever national draft pick. He was the youngest player on Adelaide's list at the time and played for Central District in the SANFL when not appearing with Adelaide." +5,"Steve Howitt"," Steven S. Howitt is the current member of the Massachusetts House of Representatives for the 4th Bristol district." +6,"Wedell-Williams XP-34"," The Wedell-Williams XP-34 was a fighter aircraft design submitted to the United States Army Air Corps (USAAC) before World War II by Marguerite Clark Williams widow of millionaire Harry P. Williams former owner and co-founder of the Wedell-Williams Air Service Corporation." +7,"Nationality Rooms"," The Nationality Rooms are a collection of 29 classrooms in the University of Pittsburgh's Cathedral of Learning depicting and donated by the ethnic groups that helped build the city of Pittsburgh." +8,"Duruitoarea River"," The Duruitoarea River is a tributary of the Camenca River in Romania." +9,"Shirvan Shahlu"," Shirvan Shahlu (Persian: شيروان شاهلو‎ also Romanized as Shīrvān Shāhlū; also known as Shīravān Shāmnū) is a village in Gavdul-e Sharqi Rural District in the Central District of Malekan County East Azerbaijan Province Iran. At the 2006 census its population was 137 in 35 families." +10,"Oenopota impressa"," Oenopota impressa is a species of sea snail a marine gastropod mollusk in the family Mangeliidae." +11,"Utricularia simulans"," Utricularia simulans the fringed bladderwort is a small to medium-sized probably perennial carnivorous plant that belongs to the genus Utricularia. U. simulans is native to tropical Africa and the Americas. It grows as a terrestrial plant in damp sandy soils in open savanna at altitudes from near sea level to 1575 m (5167 ft). U. 
simulans was originally described and published by Robert Knud Friedrich Pilger in 1914." +12,"Global Chillage"," Global Chillage is the second album by The Irresistible Force released in 1994 through Rising High Records." +13,"The Nuisance (1933 film)"," The Nuisance is a 1933 film starring Lee Tracy as a lawyer Madge Evans as his love interest (with a secret) and Frank Morgan as his accomplice." +14,"Razadarit Ayedawbon"," Razadarit Ayedawbon (Burmese: ရာဇာဓိရာဇ် အရေးတော်ပုံ) is a Burmese chronicle covering the history of Ramanya from 1287 to 1421. The chronicle consists of accounts of court intrigues rebellions diplomatic missions wars etc. About half of the chronicle is devoted to the reign of King Razadarit (r." diff --git a/tests/data_for_tests/io/imdb/dev.txt b/tests/data_for_tests/io/imdb/dev.txt new file mode 100755 index 00000000..423e158b --- /dev/null +++ b/tests/data_for_tests/io/imdb/dev.txt @@ -0,0 +1,6 @@ +neg You can never have seen either film and still know that The Jerk Too is a disaster. The question is not, "How did it get made," because if you throw money at anyone and tell them to make a film, they will do so.

No. The question is "Why, oh why, did Steve Martin allow it to be made?" I think he needed the money to fight a nuisance lawsuit and was determined it not cost him anything. He knew the sequel was going to be so frightful, that out of pride, he wouldn't even count it's royalties as income.

The only way this sequel could not be an embarrassment is to have had Carl Gottlieb and Steve Martin revive the nation's favorite poor black family.

And "dcreasy2001" (aka Mark Blankfield?): It's just transparently obvious that you worked on this film in some sad capacity, and the only way you can feel better about your involvement is to be the sequel's lone cheerleader as an IMDb user comment. I was praying for you to veer over into satire, but alas, you were really making an effort at spin. Why not 10 stars? +neg The Hazing is confused mumbo-jumbo that wants so hard to be The Evil Dead that it even references Bruce Campbell several times. The problem is, it is simply not in the same league as that terrific movie. This movie is nowhere near as original. The plot has been used before, by Kevin Tenney in Night of the Demons, and that was a lot more fun. This flick wastes too much time with complicated exposition before getting the kids into the spooky mansion and starting the demonic happenings.

Brad Dourif is, as usual, not given much to do here, but when he is on screen he puts in another over-the-top performance that would make Christopher Walken jealous. As for the acting of the kids, it's passable but by no means good. The shaky camera work is more annoying than clever or atmospheric. There are a few good moments when the first guy gets possessed and throws around some deadly one liners while dispatching his victims, but it was never scary for a second. The gore level is mid-range to low, but the director tries to make up for it by showing the actresses topless a few times. All in all, just okay if you have 87 minutes to waste. +neg I have seen bad movies before, but this one takes the "Worst Movie of a Lifetime" award by far !! Anthony Hopkins has to be completely mentally ill to have his name attached to this one - anywhere ! I will never see another movie with him in it, directing it, etc., etc. ! I can't believe the other actors & actresses that I liked, (in this picture), that stooped so low to be a part of this disaster ! There must be some great drugs out there ! For anyone to not be embarrassed to be a part of such a film, is beyond me ! Save your money on this one ! HUGE FLOP from beginning to end ! Shame on you Mr. Hopkins ! Also, shame on Christian Slater ! I can't believe you put your reputations on the line for this one ! +neg You may want to know up front that I am not a Mormon, unlike a good number of those who have already reviewed this film. I mention this so you'll understand that the way I look at the film may differ greatly from those in the faith. For some, being critical of the film might be seen as being critical of the faith--and that is NOT my intention. So, my review is that of an outsider trying to look inside and learn more about who this man and his people were. Well, after seeing the film, I doubt if I have learned much at all. 
Since I have been a history teacher, I have a good basic understanding about Young as well as Joseph Smith as well as the teachings of the church. But anyone wanting to see this film to really learn anything will probably be disappointed because the film seems so gosh-darn nice--too nice and too unrealistic in its portrayal. Plus, you learn practically nothing about the church's beliefs other than they are nice people, work hard and some have many wives (and this latter part is only barely hinted at in the film). Instead, the people are almost cartoon-like in their simplistic portrayals. Joseph Smith and Brigham Young and their followers are angelic, the non-Mormons were all devils and Brian Donlevy (playing EXACTLY the same sort of role Edward G. Robinson later played in THE TEN COMMANDMENTS) is the trouble-maker who claims to be a Mormon but just comes along so the film can have a bad guy. It's all so very simple....too simple. Almost like an indoctrination film or infomercial.

Brigham Young especially was a very complex man--with many good points (an excellent organizer and visionary) as well as bad (don't even get me started on his views about Blacks within the church or intermarriage). To portray him in such vague terms is just plain silly. It's also a lot like how Gandhi was portrayed in the film with Ben Kingsley--only the facts that led to his being almost super-human were emphasized. Heck, now that I think about that, this is the trouble with most religious films--they often come off as one-dimensional, trite and bland. Let's have a full and more complete film of these men--one that will stick to facts and not emotional appeals.

Now if you can ignore the fact that you won't learn very much about the faith or its second leader, the film is enjoyable enough. It's obvious someone at 20th Century-Fox really cared about the film, as they had a wonderful cast of both premier actors (Tyrone Power), up and coming actors (Linda Darnell, Jane Darwell and Vincent Price) and wonderful character actors (Dean Jagger, John Carradine and Brian Donlevy). The film also had wonderful location shooting and lots of gloss. It just didn't have a lot to tell us other than they were all "swell". Plus, there were plenty of factual errors and a few just plain dumb scenes. A few of the mistakes include Young taking over the helm immediately after the death of Joseph Smith (it was three years later), no mention of the various Mormon denominations and splinter groups, talk of "gold in California"--even though it was 1847 and gold wouldn't be discovered until 1948, as well as no specific mention of polygamy or Smith's many wives. Just plain dumb scenes include Carradine pulling out a gun and waving it about in the courtroom scene--and no one seemed to care--even though it was a very hostile audience! Don't you think at least the judge would tell him to put it away and stop threatening people with it?!

One final comment. Do not, I repeat, do not watch this film when it's shown on American Movie Classics (a one great station that has sunk a lot in recent years). While I am critical of the film because of its simplistic message, I was horrified with the complete disrespect the station had for the church and its traditions. What I mean is this. The film was punctuated with ads for penis enlargement formulas as well as tons of pop-ups (some advertising a show that features the "sexiest cast"). Talk about disrespectful and gross and I would be just as offended if they did this for any other religious film. By doing this, they not only insult the faith but marginalize their market--after all, who is into hearing about these things AND the life of Brigham Young?! Is this a movie, in this form, that you can show to your kids or recommend to others?! +pos Fifteen years later and Paris Is Burning is still aflame. This is a classic in black gay films, right up there with the other honorary black gay films, The Color Purple and Mahoganoy. This seminal work captures underground and underclass (i.e."underserved) black and Latin gay culture and community like no other work before or since, including all the sentimental Harlem Rennaissance gay retrospectives and renderings. They're good, but this is the best (dare I say the only "real") film you'll find on the subject. It's Relentlessy Cunty (the classic house music invention)comes to Hollywood, non-stop, hilarious camp (like only we do it) and dead-on social critique. All this by a white female director (who obviously must have been a Sister Gurl or Mizz Thing in a former life.) I could go on, but I think you get the point by now: I love this movie! +pos I have been an admirer of Edward Burtynsky's work for years, and it was such a pleasure to be able to see the man at work, thanks to Jennifer Baichwal's documentary. 
The severe beauty of the ship-breaking yard in Bangladesh, the stone quarry in Vermont, the enormous assembly plant in China, the beleaguered old neighbourhoods in Shanghai that are just waiting to be torn down: these landscapes are captured so well by the photographer and the filmmaker.

At times I thought of old TV documentaries on abandoned coal mines and plastic-mold factories; the sort of stuff I grew up watching. Burtynsky's work has the great value of pointing out how the industrial activity has only shifted to Asia, it has not stopped. The strangest scene for me was the computer scrap-yard somewhere in China--the waste had a threatening air about it, while the workers were very jovial. diff --git a/tests/data_for_tests/io/imdb/test.txt b/tests/data_for_tests/io/imdb/test.txt new file mode 100755 index 00000000..68768ec6 --- /dev/null +++ b/tests/data_for_tests/io/imdb/test.txt @@ -0,0 +1,6 @@ +neg Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook. +neg I have seen this movie and I did not care for this movie anyhow. I would not think about going to Paris because I do not like this country and its national capital. I do not like to learn french anyhow because I do not understand their language. Why would I go to France when I rather go to Germany or the United Kingdom? Germany and the United Kingdom are the nations I tolerate. Apparently the Olsen Twins do not understand the French language just like me. Therefore I will not bother the France trip no matter what. I might as well stick to the United Kingdom and meet single women and play video games if there is a video arcade. That is all. 
+neg In Los Angeles, the alcoholic and lazy Hank Chinaski (Matt Dillon) performs a wide range of non-qualified functions just to get enough money to drink and gamble in horse races. His primary and only objective is writing and having sexy with dirty women.

"Factotum" is an uninteresting, pointless and extremely boring movie about an irresponsible drunken vagrant that works a couple of days or weeks just to get enough money to buy spirits and gamble, being immediately fired due to his reckless behavior. In accordance with IMDb, this character would be the fictional alter-ego of the author Charles Bukowski, and based on this story, I will certainly never read any of his novels. Honestly, if the viewer likes this theme of alcoholic couples, better off watching the touching and heartbreaking Hector Babenco's "Ironweed" or Marco Ferreri's "Storie di Ordinaria Follia" that is based on the life of the same writer. My vote is four.

Title (Brazil): "Factotum – Sem Destino" ("Factotum – Without Destiny") +neg This film is bundled along with "Gli fumavano le Colt... lo chiamavano Camposanto" and both films leave a lot to be desired in the way of their DVD prints. First, both films are very dark--occasionally making it hard to see exactly what's happening. Second, neither film has subtitles and you are forced to watch a dubbed film--though "Il Prezzo del Potere" does seem to have a better dub. Personally, I always prefer subtitles but for the non-purists out there this isn't a problem. These DVD problems, however, are not the fault of the original film makers--just the indifferent package being marketed four decades later.

As for the film, it's about the assassination of President Garfield. This is a MAJOR problem, as Van Johnson looks about as much like Garfield as Judy Garland. In no way whatsoever does he look like Garfield. He's missing the beard, has the wrong hair color and style and is just not even close in any way (trust me on this, I am an American History teacher and we are paid to know these sort of things!). The real life Garfield was a Civil War general and looked like the guys on the Smith Brothers cough drop boxes. Plus, using some other actor to provide the voice for Johnson in the dubbing is just surreal. Never before or since has Van Johnson sounded quite so macho!! He was a fine actor...but certainly not a convincing general or macho president.

In addition to the stupid casting, President Garfield's death was in no way like this film. It's obvious that the film makers are actually cashing in on the crazy speculation about conspiracies concerning the death of JFK, not Garfield. Garfield was shot in Washington, DC (not Dallas) by a lone gunman with severe mental problems--not a group of men with rifles. However, according to most experts, what actually killed Garfield (over two months later) were incompetent doctors--who probed and probed and probed to retrieve a bullet (to no avail) and never bothered cleaning their hands or implements in the process. In other words, like George Washington (who was basically killed by repeated bloodletting when suffering with pneumonia) he died due to malpractice. In the movie they got nothing right whatsoever...other than indeed President Garfield was shot.

Because the film bears almost no similarity to real history, it's like a history lesson as taught from someone from another planet or someone with a severe brain injury. Why not also include ninjas, fighting robots and the Greek gods while you're at it?!?! Aside from some decent acting and production values, because the script is utter cow crap, I don't recommend anyone watch it. It's just a complete and utter mess. +neg I only comment on really very good films and on utter rubbish. My aim is to help people who want to see great films to spend their time - and money - wisely.

I also want to stop people wasting their time on garbage, and want to publicize the fact that the director/producer of these garbage films can't get away with it for very long. We will find out who you are and will vote with out feet - and wallets.

This film clearly falls into the garbage category.

The director and writer is John Shiban. It's always a bad sign when the writer is also the director. Maybe he wants two pay cheques. He shouldn't get any. So remember the name - John SHIBAN. And if you see anything else by him, forget it.

I won't say anything about the plot - others have already. I am a little worried by how much the director likes to zoom in to the poor girl's face when she is crying and screaming. These long duration shots are a little worrying and may say something about the state of mind of Mr. Shiban. Maybe he should get psychiatric help.

Enough already. It's crap - don't waste your time on it. +neg When you look at the cover and read stuff about it an entirely different type of movie comes to mind than what you get here. Then again maybe I read the summary for the other movie called "Mausolem" instead as there were two movies of this title released about the same time with both featuring plots that had key elements in common. However, reading stuff about that movie here I know I saw this one and not that one and that movie is even less what one would imagine a movie with that title would be about. I will be honest, I expect more of a zombie type picture and you get that in this movie to some degree. However, there is more stuff involving the occult and strange powers as the opening scene of the people being taken away by the coroner at the beginning of the film will attest to. The movie also has the old theme of kids going somewhere they do not belong to have some crazy party, in this case it is in fact a mausoleum. The other movie I do not think really has that key feature playing that prominent role in the movie and I see the score for this one is higher too, still it was just not the movie I was expecting. diff --git a/tests/data_for_tests/io/imdb/train.txt b/tests/data_for_tests/io/imdb/train.txt new file mode 100755 index 00000000..bbf4d799 --- /dev/null +++ b/tests/data_for_tests/io/imdb/train.txt @@ -0,0 +1,6 @@ +neg The monster from Enemy Mine somehow made his way into a small mountain community, where he has taken up residence. He's being hunted by a female doctor-turned-vigilante who is out to exterminate him. This female assassin, who looks like a refugee from a Motley Crue video, rides around on a motorcycle and tries to save a bunch of kids who have chosen to have a Big Chill weekend right smack dab in the middle of the monster's turf. Decapitations and lots of blood are primarily in place to draw attention away from the story which limps along like a bad version of the Island of Dr. 
Moreau (and yes, it's worse than the one with Val Kilmer). +neg I'll try to use words to describe this on....

I saw the original, which was good in its own way, but back then I should have feared a sequel.

And I was 'afraid' when I picked this one up, but now that I've seen it, I have to say, it's even worse then I thought. Why these movies still get money still makes my mind spin.

Let's start with the actors;they aren't all that good, but it has to be said, some make heads turn by being just plain awful. But what can an actor do with a script like this one. It's trying to be a copy of the original only this time the places have changed, any form of story is gone and any attempt of actually coming up with something that hasn't been done before, fails miserably. In a futile attempt to get it up-to-date, they try to make it exciting by making use of the whole 'big-brother' theme , but that has been worn out ages ago and offers nothing but a filler for between the beginning and the end. An attempt was made to try to save the movie by making a ton of references to the '83 original, but it just ended up being plain funny and sometimes a bit sad. In conclusion, if you have nothing , and I mean nothing , to do... go watch it, or play Frisbee... with the DVD.... by yourself. It'll offer you the same amount of fun.. I promise +pos Most yeti pictures are fatally undermined by a grave paucity of energy and enthusiasm. Not so this gloriously bent, batty and berserk over-the-top Italian-made shot-in-Canada kitsch gut-buster: It's a wildly ripe and vigorously moronic ghastly marvel which reaches a stunning apotheosis of righteously over-baked "what the hell's going on?" crackpot excess and inanity.

A freighter ship crew discovers the body of a 30-foot yeti that resembles a hirsute 70's disco stud (complete with jumbo wavy afro) perfectly preserved in a large chunk of ice. They dethaw the beast, jolt him back to life with electric charges, grossly mistreat him, and keep the poor hairy Goliath in an enormous glass booth. Before you can say "Hey, the filmmakers are obviously ripping off 'King Kong'," our titanic abominable snowdude breaks free of his cage, grabs the first luscious nubile blonde Euro vixen (the gorgeous Pheonix Grant) he lays lustful eyes on, and storms away with his new lady love. The yeti gets recaptured and flown to Toronto to be showed off to a gawking audience. Of course, he breaks free again, nabs the vixen, and goes on the expected stomping around the city rampage.

The sublimely stupid dialogue (sample line: "Philosophy has no place in science, professor"), cheesy (far from) special effects (the horrendous transparent blue screen work and cruddy Tonka toy miniatures are especially uproarious in their very jaw-dropping awfulness), clunky (mis)direction, and a heavy-handed script that even attempts a clumsily sincere "Is the yeti a man or a beast?" ethical debate all combine together to create one of the single most delightfully ridiculous giant monster flicks to ever roar its absurd way across the big screen. Better still, we also have a few funky offbeat touches to add extra shoddy spice to the already succulently schlocky cinematic brew: the vixen accidentally brushes against one of the yeti's nipples, which causes it to harden and elicits a big, leering grin of approval from the lecherous behemoth (!); the vixen nurses the yeti's wounded hand while he makes goo-goo eyes at her, the yeti smashes windows with his feet while climbing a towering office building, and the furry fellow even breaks a man's neck with his toes (!!). Overall, this singularly screwball and shamefully unheralded should-be camp classic stands tall as a remarkable monolith of infectiously asinine celluloid lunacy that's eminently worthy of a substantial hardcore underground cult following. +pos One of the best movies I ever saw was an Irish movie titled Philadelphia,Here I Come. I read the play before I saw the movie and loved them both. It's the story of a young man preparing to leave Ireland to go to America because he can't earn a living in Ireland. It is told both from the perspective of the young man(whom the other characters in the film can see) and another young man representing his uncensored thoughts and feelings., but who cannot be seen by the other characters in the film. It is a very sad movie, but deeply touching, and I would recommend this film to anyone who wants something to think about. 
I love any Irish movie, or almost any movie about Ireland, and any film that has the late Irish actor Donal McCann in it gets my vote.I would watch that man chew gum for 2 hours on screen, and unfortunately,I have.Terrible shame to have lost him so young. +pos There is such rubbish on the cable movie channels that I hit a gem with this one. From beginning to end it had me gripped and deserves top marks.

Father of two sons hears messages from "God" to kill people who he is told are 'demons'.

When the opening credits showed the director as one of the cast that can often be a warning of a bad film; exceptionally it is the reverse here as the drama is non-stop from beginning to end.

And there is not one moment in the movie when one is not fully enthralled as there are no unnecessary or needless sub-plots, and the script is first class.

All the actors give wholly convincing performances especially the lead child actor who is exceptional.

This film is at least as good as the likes of 'Silence of the Lambs'. +pos This is a nice piece of work. Very sexy and engaging enough plot to keep my interest throughout. Its main disadvantage is that it seems like it was made-for-TV: Full screen, and though there were several sex scenes, there was absolutely no nudity (but boy did it come close!). Strange, too, since Netflix shows that it was rated R.

Nonetheless, very titillating, and I wish Alicia Silverstone made more movies like this.

One Netflix reviewer stated that it was part of a series, but I have been unable to find out what series that is. I'd like to find out, though, because this movie was THAT good.

Walt D in LV. 8/23/2005 diff --git a/tests/data_for_tests/io/mr/dev.csv b/tests/data_for_tests/io/mr/dev.csv new file mode 100755 index 00000000..a00e0b77 --- /dev/null +++ b/tests/data_for_tests/io/mr/dev.csv @@ -0,0 +1,6 @@ +1,"apesar de seus graves problemas , o filme consegue entreter" +0,"except as an acting exercise or an exceptionally dark joke , you wonder what anyone saw in this film that allowed it to get made" +0,"a real clunker a well made , thoughtful , well acted clunker , but a clunker nonetheless" +0,an ugly duckling tale so hideously and clumsily told it feels accidental +0,"unspeakable , of course , barely begins to describe the plot and its complications vulgar is too optimistic a title" +0,at least moore is a real charmer \ No newline at end of file diff --git a/tests/data_for_tests/io/mr/test.csv b/tests/data_for_tests/io/mr/test.csv new file mode 100755 index 00000000..f3804141 --- /dev/null +++ b/tests/data_for_tests/io/mr/test.csv @@ -0,0 +1,6 @@ +1,the animated sequences are well done and perfectly constructed to convey a sense of childhood imagination and creating adventure out of angst +1,a great companion piece to other napoleon films +1,spellbinding fun and deliciously exploitative +0,an ugly duckling tale so hideously and clumsily told it feels accidental +0,"unspeakable , of course , barely begins to describe the plot and its complications vulgar is too optimistic a title" +0,at least moore is a real charmer \ No newline at end of file diff --git a/tests/data_for_tests/io/mr/train.csv b/tests/data_for_tests/io/mr/train.csv new file mode 100755 index 00000000..82c01beb --- /dev/null +++ b/tests/data_for_tests/io/mr/train.csv @@ -0,0 +1,6 @@ +1,"'moore is like a progressive bull in a china shop , a provocateur crashing into ideas and special interest groups as he slaps together his own brand of liberalism '" +1,idiotic and ugly +1,"even if the naipaul original remains the real masterpiece , the movie possesses its own languorous charm" 
+1,"the movie is amateurish , but it 's a minor treat" +1,"some people march to the beat of a different drum , and if you ever wondered what kind of houses those people live in , this documentary takes a look at 5 alternative housing options" +1,the movie plays up the cartoon 's more obvious strength of snazziness while neglecting its less conspicuous writing strength diff --git a/tests/data_for_tests/io/ohsumed/dev.csv b/tests/data_for_tests/io/ohsumed/dev.csv new file mode 100755 index 00000000..7a26fb04 --- /dev/null +++ b/tests/data_for_tests/io/ohsumed/dev.csv @@ -0,0 +1,6 @@ +C23,"assessment biliary tract liver transplantation tube cholangiography iodida scanning biliary tract obstruction anastomotic leakage common problems following liver transplantation sequential study , 31 patients liver transplant investigated 99mtc iodida \( iodida \) scanning tube cholangiography \( ttc \) results compared clinical outcome seven patients extrahepatic biliary obstruction one patient biliary leak detection biliary complications ttc iodida scanning similar terms sensitivity \( 63 per cent \) ttc better specificity \( 79 per cent versus 60 per cent \) accuracy \( 74 per cent versus 60 per cent \) iodida scanning liver function taken account , diagnostic efficacy tests patients bilirubin levels less 200 mumol l similar levels greater 200 mumol l greater number false positive results iodida scanning \( 12 per cent versus 54 per cent \) significant biliary leak clearly detected ttc iodida scanning ttc remains effective way evaluating biliary tract transplantation iodida scanning limited value bilirubin levels elevated , may provide additional information blood supply , hepatocyte function intrahepatic cholestasis" +C23,"patterns dyspepsia patients clinical evidence organic diseases studied 2000 dyspeptic patients obvious signs organic disease first examination , order \( 1 \) verify many diagnoses idiopathic dyspepsia really made diagnostic procedures \( 2 \) evaluate 
diagnostic power symptoms distinguishing organic idiopathic dyspepsia latter considered structural abnormalities found cases , distinction made related associated organic dyspepsia according whether certain relationship abnormalities dyspeptic symptoms patients referred us follows \( 1 \) spontaneously , \( 2 \) sent physicians us , \( 3 \) referred open access endoscopic service results show frequency idiopathic dyspepsia 26 , whereas associated structural abnormalities present 45 4 obvious organic causes dyspepsia seen 28 6 \( 24 benign 4 6 malignant diseases \) considered separately , symptom alone allows correct diagnosis simultaneous evaluation symptoms linear discriminant analysis distinguishes idiopathic organic dyspeptic patients 70 cases higher discrimination percentage 70 cases higher discrimination percentage could probably obtained using wider range clinical parameters complex statistical analysis interrelationships exist clinical symptoms final diagnosis" +C23,"evaluation 13c urea breath test detection helicobacter pylori monitoring effect non ulcer dyspepsia sixty nine patients non ulcer dyspepsia studied endoscopy , biopsy , quick urease \( \) test , helicobacter pylori culture , 13c urea breath test treatment \( \) two tablets twice daily four weeks symptoms non ulcer dyspepsia recorded using standard questionnaire using h pylori culture gold standard , sensitivity 13c urea breath test 90 , specificity 98 6 , accuracy 94 8 positive predictive value 98 2 negative predictive value 92 5 conversion rate h pylori positive negative status treatment 17 9 symptoms non ulcer dyspepsia improved appreciably treatment irrespective h pylori status 13c urea breath test accurate research tool suitable serial testing population surveys" +C23,"demonstration area slow conduction human atrial flutter ten patients chronic atrial flutter studied prospectively using electrophysiologic mapping pacing techniques assess mechanism atrial flutter presence area slow conduction 
atria electrograms recorded greater equal 30 right atrial sites patient atrial flutter demonstrated right atrial free wall activation interatrial septum activation , consistent reentrant circuit involving right atrium six patients , slow conduction occurred atrial flutter inferior right atrium spatially associated fractionated recordings four patients , missing interval electrical activity occurred inferior right atrium average 40 atrial flutter cycle transient criteria demonstrated patient rapid high right atrial pacing mean activation time high right atrial pacing site coronary sinus \( inferior left atrial \) recording site long \( 228 ms \) consistent activation area slow conduction rapid pacing atrial flutter coronary sinus site , transient criteria could demonstrated mean activation time coronary sinus pacing site high right atrial recording site relatively short \( 134 ms \) consistent activation high right atrium area slow conduction high right atrial pacing sinus rhythm rates similar atrial flutter demonstrated short activation time coronary sinus low right atrial sites \( mean 169 88 ms , respectively \) , indicating activation area slow conduction coronary sinus pacing sinus rhythm demonstrated phenomena low right atrial electrograms recorded sinus rhythm rapid pacing sinus rhythm fractionated , although atrial flutter thus , atrial mapping pacing data complementary , indicating human atrial flutter patients studied generated reentrant circuit right atrium , area slow conduction low right atrium present atrial flutter" +C23,"analysis base station morphine orders assessment physician consistency paramedic contact base station consistent recommendations reflecting consensus base station physician care urban ems system , paramedics must contact single base station provide morphine sulfate \( ms \) patient chest pain performed retrospective cohort analysis prehospital ms requests chest pain determine consistency circumstances paramedic team refused ms ms 
requests represented 123 1 , \( 7 \) line physician consultations 6 month study 15 123 \( 12 \) ms requests refused neither mean patient age , sex distribution , presenting vital signs correlated ms refusal maximum estimate transport time hospital less equal 5 minutes noted 7 15 \( 47 \) medication compared 11 96 \( 11 \) documented estimated transport times \( p less equal 0 005 \) simultaneous request nitroglycerin \( \) noted 6 15 \( 40 \) medication 15 108 \( 14 \) \( p less 0 05 \) found refusal ms administration uncommon physicians tended ms transport time short requested concomitant administration also noted physician inconsistencies refusal findings guide physician consensus development avoid mixed paramedics" +C23,"predictors smoking nhanes followup experience published prospective studies predictors spontaneously cigarette smoking nationally representative u population paper describes study , using cohort taken first national health nutrition examination survey \( nhanes , 1971 1975 \) traced nhanes epidemiologic followup survey \( 1982 1984 \) successful \( least 1 year time followup \) ascertained among adults \( age 25 74 years \) smokers time nhanes disabled followup independent predictors \( proportional hazards multiple regression \) \( 1 \) older age \( 2 \) white race \( 3 \) fewer cigarettes smoked day \( 4 \) higher household income \( 5 \) hospitalization followup period predictors relapse \( ex smokers nhanes smoking time followup \) \( 1 \) younger age \( 2 \) urban residence \( 3 \) female gender findings implications intervention strategies , public health projections research" diff --git a/tests/data_for_tests/io/ohsumed/test.csv b/tests/data_for_tests/io/ohsumed/test.csv new file mode 100755 index 00000000..553af66a --- /dev/null +++ b/tests/data_for_tests/io/ohsumed/test.csv @@ -0,0 +1,6 @@ +C23,"development small caliber biologic vascular graft evaluation antithrombogenicity early healing process authors previously showed small caliber 
xenograft using crosslinking technique applicable aortocoronary bypass grafting study graft , antithrombogenicity healing process evaluated early stage implantation fresh sheep carotid artery \( id \) obtained cross linked compounds , used small caliber vascular graft graft white soft six cm segments graft implanted carotid arteries bilaterally nine dogs sodium heparin given surgery , anticoagulant used postoperatively fifteen grafts eight dogs removed 1 hr 30 days implantation , 13 15 grafts found patent two grafts , one 3 days , 14 days , occluded anastomotic area occluded grafts felt hard outside one dog , grafts shown angiographically patent 14 days implantation , dog kept long term observation macroscopically , thrombus observed patent grafts microscopically , inner surface near anastomotic lines covered endothelial cells , infiltration fibroblasts observed outside 7 days implantation foreign body reactions seen around graft 30 days implantation , thin layer plasma protein middle graft observed scanning electron microscopy \( sem \) observations , concluded grafts exhibited satisfactory early antithrombogenicity healing implantation" +C23,"proliferation substrate effects endothelial cell thrombogenicity effects cellular differentiation status adhesive substrate endothelial cell function cell culture measured enzyme based assay surface thrombogenicity solid plastic , microporous polymeric , fibronectin \( fn \) treated microporous polymeric used substrates growth endothelial cells microporous fn treated synthetic substrates shown aid induction cellular differentiation mechanisms cells studied proliferative growth conditions thrombogenicity surface created endothelial cell monolayers various experimental conditions determined using enzyme based assay fibrin deposition actively proliferating cells solid plastic substrate produced thrombogenic surface , confluent endothelial cell monolayers grown fn treated microporous substrate least thrombogenic surfaces data 
suggest endothelial cell surface thrombogenicity substrate control , also related cellular differentiation status findings used design novel approach small diameter synthetic vascular graft problem" +C23,"effect complement arachidonic acid pathway inhibition white blood cell count deposition vascular grafts determine role complement arachidonic acid metabolites decrease peripheral white blood cell count \( pwbc \) observed graft implantation , dacron aortic grafts implanted control rabbits \( group , n 13 \) , rabbits pretreated venom factor \( 80 u kg \) complement \( group ii , n 13 \) , indomethacin \( 2 5 mg kg \) inhibit cyclooxygenase \( group iii , n 7 \) , diethylcarbamazine \( dec , 90 mg kg \) inhibit leukotriene synthesis \( group iv , n 7 \) pwbc measured 15 min 1 hr graft implantation graft removal , wbc count grafts \( gwbc \) determined light microscopy \( \) scanning electron microscopy \( sem \) one hr graft implantation , pwbc decreased significantly groups iv 46 , 52 , 40 , 45 preoperative pwbc , respectively significant difference among groups revealed gwbc per field 8 0 , 12 3 , 5 8 , 6 8 groups iv , respectively similarly , sem showed gwbc per field 2 5 , 5 6 , 0 7 , 1 5 groups iv , respectively sem gwbc significantly greater group ii \( p less 0 01 \) , significantly less group iii \( p less 0 05 \) results suggested complement arachidonic acid pathways alone affect fall pwbc , may influence gwbc" +C23,"total perinatal wastage clarification priorities pregnancy outcome 16 , women carrying 17 , living fetuses 16 weeks gestation studied well recording perinatal deaths , losses 28 weeks one year delivery recorded give total perinatal wastage rate 21 6 per 1000 fetuses alive 16 weeks compared perinatal mortality rate \( plus early neonatal deaths \) 7 8 per 1000 births deaths classified according pathological sub groups concept perinatal care using perinatal mortality compared using total perinatal wastage" +C23,"magnetic resonance imaging 
idiopathic retroperitoneal fibrosis measurement t1 relaxation time magnetic resonance imaging 0 08 performed nine patients proven idiopathic retroperitoneal fibrosis total 11 scans performed three patients scanned diagnosis one also two follow scans six patients scanned variable time diagnosis treatment scan , soft tissue mass readily identified , distribution corresponding seen computed tomography difference mean t1 relaxation time mass patients scanned diagnosis scanned treatment however , patient followed serial scans showed progressive reduction t1 value mass time comparison results obtained patients lymphoma suggests t1 values retroperitoneal fibrosis lower lymphoma , particularly non hodgkin 's lymphoma" +C23,"development reversibility lymphocyte dysfunction experimental obstructive jaundice study evaluates effect experimental biliary obstruction bile duct ligation \( \) biliary drainage cell mediated immunity wistar rats immune status assessed mitogen stimulation test lymphocytes animals followed 35 days regression analysis showed significant negative correlation lymphocyte function period jaundice \( correlation coefficient 0 57 , p less 0 001 \) following 21 days , groups animals internal biliary drainage 7 , 14 28 days , external drainage 14 days compared obstructed animals , 14 days internal drainage required improve lymphocyte function \( p less 0 05 \) animals 14 days external drainage significantly lower lymphocyte stimulation internal drainage animals \( p less 0 05 \) results demonstrate obstructive jaundice produces progressive reduction lymphocyte function reversed biliary drainage , internal drainage effective external drainage" diff --git a/tests/data_for_tests/io/ohsumed/train.csv b/tests/data_for_tests/io/ohsumed/train.csv new file mode 100755 index 00000000..7a6cfba7 --- /dev/null +++ b/tests/data_for_tests/io/ohsumed/train.csv @@ -0,0 +1,6 @@ +C23,"role membrane proteins monosodium urate crystal membrane interactions ii effect erythrocyte 
membranes membrane permeable impermeable protein crosslinking agents intact , human erythrocytes pretreated membrane permeable , dimethyl \( \) dimethyl \( \) membrane impermeable 3 , 3' \( \) \( \) protein crosslinking agents incubated monosodium urate monohydrate \( \) crystals percent inhibition lysis values pretreated cells relative untreated cells determined 3 agents caused concentration dependent inhibition induced hemolysis due decrease binding pretreated membranes proposed inhibition lysis due crosslinking integral cytoskeletal membrane proteins , resulting reduced mobility proteins , inhibition integral proteins aggregates decreased pore formation membrane" +C23,"biliary gut function following shock aim study characterize alterations gallbladder intestinal function hemorrhagic shock blood reperfusion animals subjected shock 30 mm hg arterial blood pressure 60 minutes resuscitated blood reinfusion gallbladder epithelial ion transport , gallbladder motility vitro vivo , gastrointestinal motility , flora stomach small bowel studied 2 24 hours shock changes 2 hours included decreased gallbladder contractility vitro decreased emptying vivo , loss coordination intestinal motor activity , decrease frequency intestinal electrical slow waves , reduced duration intestinal migrating motor complex cycle 24 hours , gallbladder epithelial permeability increased vitro contractility remained reduced vivo functions showed partial recovery gastrointestinal flora affected changes data demonstrate hemorrhagic shock reperfusion affect digestive motility early timing alterations observed partial recovery 24 hours post shock suggest ischemia hypoxia mechanism injury" +C23,"short term long term changes renal function donor nephrectomy retrospectively examined effect nephrectomy renal function 55 living related donors renal function measured scans patients studied preoperatively , 1 week 1 year postoperatively 20 patients 10 year followup available compensatory hypertrophy 
complete 1 week postoperatively effective renal plasma flow remaining kidney 32 5 higher preoperatively increase remained stable least year degree compensatory hypertrophy significantly greater male patients \( 46 9 1 week \) female patients \( 26 7 \) compensatory hypertrophy occurred age groups studied pronounced patients less 30 years old patients followed 10 years effective renal plasma flow decreased 387 7 ml per minute 1 week nephrectomy 4 ml per minute 10 years result similar decrease seen normal population according results , renal donation living related persons lead long term decrease renal function" +C23,treatment idiopathic retroperitoneal fibrosis immunosuppression idiopathic retroperitoneal fibrosis exceedingly uncommon childhood etiology uncertain support immunological basis disease given report 14 year old girl severe retroperitoneal fibrosis causing progressive azathioprine prednisolone used successfully case supports efficacy immunotherapy treatment idiopathic retroperitoneal fibrosis +C23,en bloc transplantation kidneys donors weighing less 15 kg adult recipients en bloc transplantation kidneys donors weighed less 15 kg 20 adult patients described medial kidney allowed adequate renal positioning growth graft venous thrombosis occurred 1 patient irreversible graft rejection occurred 4 patients graft survival 65 excellent function mean followup 8 8 months en bloc transplantation pediatric cadaver kidney grafts adults acceptable procedure +C23,"afferent nipple valve malfunction caused anchoring collar unexpected late complication kock continent ileal reservoir construction kock continent ileal reservoir urinary diversion , significantly high rates late postoperative complications regarding nipple valves , efferent limb particular , reported reports afferent nipple valve malfunction total 42 patients underwent kock pouch operation observed 12 months \( mean 38 months \) evaluated terms afferent nipple valve malfunction late afferent nipple valve 
complications observed 10 42 patients \( 24 \) complications included erosion fiber fabric used collar \( 5 patients \) , stenosis afferent limb \( 2 \) obstruction afferent nipple mucous plug fungus ball \( 3 \) latter 2 complications due mechanical dynamic obstruction urine flow caused nonabsorbable collar none 10 patients problems efferent nipple valve function results suggest peristaltic direction intestine use nonabsorbable material collar primarily responsible late afferent nipple valve complications modifications needed produce stable nipple valve otherwise , simpler reliable alternative techniques anastomosis considered" diff --git a/tests/data_for_tests/io/peopledaily/dev.txt b/tests/data_for_tests/io/peopledaily/dev.txt new file mode 100755 index 00000000..4769eb79 --- /dev/null +++ b/tests/data_for_tests/io/peopledaily/dev.txt @@ -0,0 +1,7 @@ +中 B-ORG +共 I-ORG +中 I-ORG +央 I-ORG + +致 O +中 B-ORG diff --git a/tests/data_for_tests/io/peopledaily/test.txt b/tests/data_for_tests/io/peopledaily/test.txt new file mode 100755 index 00000000..1a983ebd --- /dev/null +++ b/tests/data_for_tests/io/peopledaily/test.txt @@ -0,0 +1,41 @@ +美 B-LOC +国 I-LOC + +的 O +华 B-PER + +莱 B-PER +士 B-PER + +中 B-ORG +共 I-ORG + +中 I-ORG +央 I-ORG + +举 O +办 O + +《 O +“ O + +一 O +国 O + +两 O +制 O + +” O +与 O + +香 B-LOC +港 I-LOC + +基 O +本 O + +法 O +》 O + +讲 O +座 O diff --git a/tests/data_for_tests/io/peopledaily/train.txt b/tests/data_for_tests/io/peopledaily/train.txt new file mode 100755 index 00000000..4fb5f61b --- /dev/null +++ b/tests/data_for_tests/io/peopledaily/train.txt @@ -0,0 +1,46 @@ +我 O +们 O + +收 O +藏 O + +北 B-LOC +京 I-LOC + +史 O +料 O + +历 B-LOC +博 I-LOC + +、 O +古 B-ORG +研 I-ORG +所 I-ORG + +、 O +北 B-LOC + +大 I-LOC +清 I-LOC + +华 I-LOC +图 I-LOC + +书 I-LOC +馆 I-LOC + +我 O +们 O + +是 O +受 O + +到 O +郑 B-PER + +振 I-PER +铎 I-PER + +先 O +生 O diff --git a/tests/data_for_tests/io/weibo_NER/dev.conll b/tests/data_for_tests/io/weibo_NER/dev.conll new file mode 100755 index 
00000000..11db48f8 --- /dev/null +++ b/tests/data_for_tests/io/weibo_NER/dev.conll @@ -0,0 +1,21 @@ +老 B-PER.NOM +百 I-PER.NOM +姓 I-PER.NOM + +心 O + +新 B-GPE.NAM +乡 I-GPE.NAM + +年 O + +大 B-ORG.NOM +学 I-ORG.NOM + +同 O + +宿 B-LOC.NOM +舍 I-LOC.NOM + +三 O +年 O diff --git a/tests/data_for_tests/io/weibo_NER/test.conll b/tests/data_for_tests/io/weibo_NER/test.conll new file mode 100755 index 00000000..b92e7efa --- /dev/null +++ b/tests/data_for_tests/io/weibo_NER/test.conll @@ -0,0 +1,17 @@ +感 O +动 O + +了 O + +李 B-PER.NAM +开 I-PER.NAM +复 I-PER.NAM + +小 B-ORG.NOM +学 I-ORG.NOM + +美 O +术 O + +新 O +课 O \ No newline at end of file diff --git a/tests/data_for_tests/io/weibo_NER/train.conll b/tests/data_for_tests/io/weibo_NER/train.conll new file mode 100755 index 00000000..6d6182c0 --- /dev/null +++ b/tests/data_for_tests/io/weibo_NER/train.conll @@ -0,0 +1,69 @@ +坏 O +男 B-PER.NOM +人 I-PER.NOM + +男 B-PER.NOM +人 I-PER.NOM +帮 I-PER.NOM + + +不 O + +南 B-GPE.NAM +都 I-GPE.NAM + +南 B-GPE.NAM +方 I-GPE.NAM +都 I-GPE.NAM +市 I-GPE.NAM + +的 O + +那 B-LOC.NOM +座 I-LOC.NOM + +来 O + +学 B-ORG.NOM +校 I-ORG.NOM + +的 O + +卫 B-ORG.NAM +生 I-ORG.NAM +部 I-ORG.NAM + +台 B-GPE.NAM +灣 I-GPE.NAM + +火 B-LOC.NAM +焰 I-LOC.NAM +山 I-LOC.NAM + +的 O + +成 O +李 B-PER.NAM +力 I-PER.NAM +帆 I-PER.NAM + +我 O + +南 B-GPE.NAM +都 I-GPE.NAM + +深 B-GPE.NAM +圳 I-GPE.NAM + +一 O +个 O + +国 B-GPE.NOM +家 I-GPE.NOM + +以 O + +民 B-PER.NOM + +为 O +本 O diff --git a/tests/data_for_tests/io/yelp_review_full/dev.csv b/tests/data_for_tests/io/yelp_review_full/dev.csv new file mode 100755 index 00000000..ecc93b0b --- /dev/null +++ b/tests/data_for_tests/io/yelp_review_full/dev.csv @@ -0,0 +1,6 @@ +"2","Two meals, on the recommendation of a friend who lives near the place, and after the second trip, I was compelled to write. 'Rocky' would definitely describe the experiences.\n\nOn the first trip, I went to try their (at that time)raved about Reuben. 
And YET to find a true good Reuben in da burgh, I tried it.\n\nWell, they were out of the proper bread, and the guy had to run to the store to buy the closest thing he could find, which was not the proper bread, and instead of one of their 'raved about' Reubens, I received two mini-Reubens, which basically took the guts from one Reuben, and spread it out onto two sandwiches on regular sized bread. I ate it. It wasn't great, but they swore it was because they'd run out of the bread. Bread or not, it still wasn't great. The atmosphere was pleasant in that 'blue collar bar' kind of way, and the staff was very nice, but not a winning pitch on the Reuben.\n\nThe second trip was after a long day of moving furniture with the same friend. Sat in the back room, instead of the bar, which felt more like a restaurant, of course, with the big screen TV covering the sports of the moment.\n\nI was in the mood for dinner this time, and after a scan, decided on fried chicken and mashed potatoes with the salad bar. My friend ordered one of her faves, the breaded pork chops.\n\nWe hit the salad bar, which was uber-basic. Three soups (mostly vegetable loaded, which left me out), basic iceberg lettuce mix (very probably out of a bag), a few veggie toppings, and three or four dressings. It was a basic salad, no big deal. More or less an appetizer filler before the meal.\n\nThe mind-blower in this trip was the ordering of the fried chicken dinner. Our waiter looked like a 19 year old gas station attendant, skinny little blonde guy with a sweet but incredibly naive face, and an air of vapidity, which was confirmed when I placed my order. I asked what chicken pieces came in the dinner, and asked if it was possible to only get dark meat. I never imagined how confusing a question that could possibly be. It literally took him two trips back to the kitchen to 'ask', and the child honestly had no clue what 'white meat' and 'dark meat' meant. 
The first answer he came back with was that the chicken came in a pre-portioned prepared bag, kind of Kentucky Fried Chicken style...which didn't answer my question, thus prompting the second trip. \n\nAfter the second trip back I heard the cook holler 'Tell him I'll fix him up'. \n\nWell, the chicken was prepackaged dreck like you'd find in the freezer case of Walmart, tiny and not good, and the potatoes had that slight tinge of chem-spuds flavor, laden with some kind of chopped up green (parsley?), and a side of that basic brown gravy served up in 5 gallon buckets.\n\nThank goodness for the basic salad bar.\n\nEven my friend admitted that her pork chops were different and not what she'd expected. They also appeared to be from a freezer bag.\n\nThe irony was that the boy who didn't know white meat from dark meat, was chatting with some other customers...about baseball...and he was a genius about the mindless sport of baseball. Ahhhh da burgh.\n\nThird base? Nah...why bother when there are so many other options around. Go on in a grab a beer and chat black and gold if you happen to be in Carnegie...they can help you out all types of ways in that area. Just don't go hungry if you actually have tastebuds.\n\nFrom what I understand it 'used to be' really good homecooked food. But apparently, mama has left the kitchen." +"4","I belong to this gym... I live in the South section of Pittsburgh, and I find that this gym is not too far from me. The staff is friendly, the equipment is quite good. You get two free personal training sessions when you join. They have lots of weights (which my boyfriend uses) and a decent cardio room. The only thing I would say is to increase some of the cardio equipment. Water is only $1 a bottle!" 
+"3","I've been to Papa J's twice and had mixed experiences.\n\nBoth times I had the banana pepper appetizer, which is great and goes really well with the FRESH and delicious bread and cheese they give you at the start of your meal.\n\nFor entrees, me and my girlfriend have had mixed experience. I've had the fish sandwich (very good) and the eggplant parm sandwich (okay). My girlfriend got the salad with bread and basil on it, but the basil was over powering and the bread was soggy with the dressing. \n\nThe service is also a mixed bag. The first time our server went out of her way to take care of us and even MADE me cocktail sauce for my fish sandwich. The second time, the server was lackluster, didn't know anything about the menu and wasn't able to take proper care of us. \n\nI would return to Papa J's, but I my terrible experience last time isn't enough to say it would be my first pick of places to eat around Carnegie/Robinson." +"4","Yay, I'm a fan but sometimes service is a little slow, it was very good for us this visit. Go to Papa j's every once in a while but mostly for the White Pizza. It is the best white pizza I have ever had. Order the white pizza on our visit this weekend... it has garlic, spinach, feta cheese and we usually add some veggie on top. It was delicious! Order fried calamari and it was OK...note to self next time try the calamari roman style.\n\nLike the dinning room with the hardwood floors and bright lighting. \n\nThe bar was jumping thou never go to the bar." +"3","Had dinner at Papa J's with a group of 6. I loved how the restaurant is in a old brick building with large windows. It felt like a neighborhood restaurant. On a Saturday night, the restaurant was full but not crowded. We were seated in a room with poor acoustics. It was difficult to hear people at our table and the waitress. 
While she tried, I can see the asperation in her face when she had to repeat the specials to both sides of the table.\n\nPeople ordered bourbon on the rocks before dinner which seemed watered down, while my lemon drop was made nice. The bread was delicious! Can you describe it to be creamy? The fried zucchini was lightly breaded and not too oily. It was a large portion made up of 2 sliced zucchinis.\n\nWe ordered a variety of dishes. The pasta dish was dry with more pasta than sauce or meat. Those who ordered the fish special thought it was delicious. The shrimp dish was enjoyed as well. I had the chicken marsala which was pretty good. The marsala sauce wasn't too thick, and the chicken moist.\n\nHard to tell if the deserts were \""homemade.\"" The tiramisu and spumoni were small in portion and meant for one. \n\nOn the whole, I was on the fence with my overall impression of Papa J's. \""A-ok\"" probably is the best way to describe it." +"2","Rather typical SnS. Had a good lunch crowd. Milkshake was good but not as good as EnP down the street. It took to long to get the burger for some reason, 25 minutes, I realized cooked to order but this is a little long for SnS. Ordered the Guacamole Steakburger and it only had a small portion of Gauc...not your usual amount..kitchen was not up to speed on portion sizing for some reason. Definitely did not look like the picture on the website. Oh well!" diff --git a/tests/data_for_tests/io/yelp_review_full/test.csv b/tests/data_for_tests/io/yelp_review_full/test.csv new file mode 100755 index 00000000..63d84891 --- /dev/null +++ b/tests/data_for_tests/io/yelp_review_full/test.csv @@ -0,0 +1,6 @@ +"1","I got 'new' tires from them and within two weeks got a flat. I took my car to a local mechanic to see if i could get the hole patched, but they said the reason I had a flat was because the previous patch had blown - WAIT, WHAT? I just got the tire and never needed to have it patched? This was supposed to be a new tire. 
\nI took the tire over to Flynn's and they told me that someone punctured my tire, then tried to patch it. So there are resentful tire slashers? I find that very unlikely. After arguing with the guy and telling him that his logic was far fetched he said he'd give me a new tire \""this time\"". \nI will never go back to Flynn's b/c of the way this guy treated me and the simple fact that they gave me a used tire!" +"1","Don't waste your time. We had two different people come to our house to give us estimates for a deck (one of them the OWNER). Both times, we never heard from them. Not a call, not the estimate, nothing." +"1","All I can say is the worst! We were the only 2 people in the place for lunch, the place was freezing and loaded with kids toys! 2 bicycles, a scooter, and an electronic keyboard graced the dining room. A fish tank with filthy, slimy fingerprints smeared all over it is there for your enjoyment.\n\nOur food came... no water to drink, no tea, medium temperature food. Of course its cold, just like the room, I never took my jacket off! The plates are too small, you food spills over onto some semi-clean tables as you sit in your completely worn out booth seat. The fried noodles were out of a box and nasty, the shrimp was mushy, the fried rice was bright yellow.\n\nWe asked for water, they brought us 1 in a SOLO cup for 2 people. I asked for hot tea, they said 10 minutes. What Chinese restaurant does not have hot tea available upon request?\n\nOver all.... my first and last visit to this place. The only good point was that it was cheap, and deservingly so." +"1","I have been to this restaurant twice and was disappointed both times. I won't go back. The first time we were there almost 3 hours. It took forever to order and then forever for our food to come and the place was empty. When I complained the manager was very rude and tried to blame us for taking to long to order. 
It made no sense, how could we order when the waitress wasn't coming to the table? After arguing with me he ended up taking $6 off of our $200+ bill. Ridiculous. If it were up to me I would have never returned. Unfortunately my family decided to go here again tonight. Again it took a long time to get our food. My food was cold and bland, my kids food was cold. My husbands salmon was burnt to a crisp and my sister in law took one bite of her trout and refused to eat any more because she claims it was so disgusting. The wedding soup and bread were good, but that's it! My drink sat empty throughout my meal and never got refilled even when I asked. Bad food, slow service and rude managers. I'll pass on this place if my family decides to go again. Not worth it at all with all the other good Italian options around." +"1","Food was NOT GOOD at all! My husband & I ate here a couple weeks ago for the first time. I ordered a salad & basil pesto cream pasta & my husband ordered the spinach & feta pasta. The salad was just a huge plate of spring mix (nothing else in it) with WAY to much vinegar dressing. My lettuce was drowning in the vinegar. My pesto pasta had no flavor (did not taste like a cream sauce to me) & the pesto was so runny/watery & way too much sauce not enough noodles. My husband's pasta had even less flavor than mine. We ate about a quarter of the food & couldn't even finish it. We took it home & it was so bad I didn't even eat my leftovers. And I hate wasting food!! Plus the prices are expensive for the amount of food you get & of course the poor quality. Don't waste your time eating here. There are much better Italian restaurants in Pittsburgh." +"3","This is a tiny Starbucks and it locations like this (although cute) makes you wonder if your really meant to hang out or just grab your coffee and leave. 
Leaving is always a good idea at this location anyway since you have a nice fountain in the back with benches and it is a central part of the Waterfront Shopping. \n\nStarbuck isn't my favorite coffee chain by any means. Is it just me or do all Starbuck coffees taste a little burnt and bitter? No matter how trendy, cool and upscale their establishments are I can't get around the yicky tasting bitterness of Staryucks regular coffees. Talk about over roasting a bean...Maybe something has changed with their regular coffee but I have not drank it in about a year. I am not one for soy caramel latte foofy stuff. Still I'll give the establishment tres estrellas for the fact that their espresso is acceptable and doesn't taste half as bad as the regular coffee bean." diff --git a/tests/data_for_tests/io/yelp_review_full/train.csv b/tests/data_for_tests/io/yelp_review_full/train.csv new file mode 100755 index 00000000..032d423a --- /dev/null +++ b/tests/data_for_tests/io/yelp_review_full/train.csv @@ -0,0 +1,6 @@ +"5","dr. goldberg offers everything i look for in a general practitioner. he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first. really, what more do you need? i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank." +"2","Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff. It seems that his staff simply never answers the phone. It usually takes 2 hours of repeated calling to get an answer. Who has time for that or wants to deal with it? I have run into this problem with many other doctors and I just don't get it. 
You have office workers, you have patients with medical needs, why isn't anyone answering the phone? It's incomprehensible and not work the aggravation. It's with regret that I feel that I have to give Dr. Goldberg 2 stars." +"4","Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life." +"3","Got a letter in the mail last week that said Dr. Goldberg is moving to Arizona to take a new position there in June. He will be missed very much. \n\nI think finding a new doctor in NYC that you actually like might almost be as awful as trying to find a date!" +"1","I don't know what Dr. Goldberg was like before moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. 
You deserve better and they will not be there when you really need them. I have never felt compelled to write a bad review about anyone until I met this pathetic excuse for a doctor who is all about the money." +"5","Top notch doctor in a top notch practice. Can't say I am surprised when I was referred to him by another doctor who I think is wonderful and because he went to one of the best medical schools in the country. \nIt is really easy to get an appointment. There is minimal wait to be seen and his bedside manner is great." diff --git a/tests/data_for_tests/io/yelp_review_polarity/dev.csv b/tests/data_for_tests/io/yelp_review_polarity/dev.csv new file mode 100755 index 00000000..09228213 --- /dev/null +++ b/tests/data_for_tests/io/yelp_review_polarity/dev.csv @@ -0,0 +1,6 @@ +"1","Hoofah." +"1","Two meals, on the recommendation of a friend who lives near the place, and after the second trip, I was compelled to write. 'Rocky' would definitely describe the experiences.\n\nOn the first trip, I went to try their (at that time)raved about Reuben. And YET to find a true good Reuben in da burgh, I tried it.\n\nWell, they were out of the proper bread, and the guy had to run to the store to buy the closest thing he could find, which was not the proper bread, and instead of one of their 'raved about' Reubens, I received two mini-Reubens, which basically took the guts from one Reuben, and spread it out onto two sandwiches on regular sized bread. I ate it. It wasn't great, but they swore it was because they'd run out of the bread. Bread or not, it still wasn't great. The atmosphere was pleasant in that 'blue collar bar' kind of way, and the staff was very nice, but not a winning pitch on the Reuben.\n\nThe second trip was after a long day of moving furniture with the same friend. 
Sat in the back room, instead of the bar, which felt more like a restaurant, of course, with the big screen TV covering the sports of the moment.\n\nI was in the mood for dinner this time, and after a scan, decided on fried chicken and mashed potatoes with the salad bar. My friend ordered one of her faves, the breaded pork chops.\n\nWe hit the salad bar, which was uber-basic. Three soups (mostly vegetable loaded, which left me out), basic iceberg lettuce mix (very probably out of a bag), a few veggie toppings, and three or four dressings. It was a basic salad, no big deal. More or less an appetizer filler before the meal.\n\nThe mind-blower in this trip was the ordering of the fried chicken dinner. Our waiter looked like a 19 year old gas station attendant, skinny little blonde guy with a sweet but incredibly naive face, and an air of vapidity, which was confirmed when I placed my order. I asked what chicken pieces came in the dinner, and asked if it was possible to only get dark meat. I never imagined how confusing a question that could possibly be. It literally took him two trips back to the kitchen to 'ask', and the child honestly had no clue what 'white meat' and 'dark meat' meant. The first answer he came back with was that the chicken came in a pre-portioned prepared bag, kind of Kentucky Fried Chicken style...which didn't answer my question, thus prompting the second trip. \n\nAfter the second trip back I heard the cook holler 'Tell him I'll fix him up'. \n\nWell, the chicken was prepackaged dreck like you'd find in the freezer case of Walmart, tiny and not good, and the potatoes had that slight tinge of chem-spuds flavor, laden with some kind of chopped up green (parsley?), and a side of that basic brown gravy served up in 5 gallon buckets.\n\nThank goodness for the basic salad bar.\n\nEven my friend admitted that her pork chops were different and not what she'd expected. 
They also appeared to be from a freezer bag.\n\nThe irony was that the boy who didn't know white meat from dark meat, was chatting with some other customers...about baseball...and he was a genius about the mindless sport of baseball. Ahhhh da burgh.\n\nThird base? Nah...why bother when there are so many other options around. Go on in a grab a beer and chat black and gold if you happen to be in Carnegie...they can help you out all types of ways in that area. Just don't go hungry if you actually have tastebuds.\n\nFrom what I understand it 'used to be' really good homecooked food. But apparently, mama has left the kitchen." +"2","I've lived in Pittsburgh for 6 years, and in Carnegie for over 2 years, and by far, this is the best greasy spoon joint I've found. If you can stomach the wait (no reservations, naturally), you'll enjoy overflowing plates of goodness, thanks to the well-seasoned griddle where all of the food is made. \n\nHere are the highlights:\n\n-Cheap: Breakfast for two can be well under $10, with lunch around the same.\n-Crowded: Get there early and expect to wait. They close pretty early on the weekends too (oddly, at 12:45pm)\n-Cash only\n-Huge portions: When ordering fries or homefries, always get the half order, unless you're a lumberjack\n-About those homefries: They're often undercooked. I've had better, believe me. My favorite things to eat in life are potato products.\n-My favorite item: hot sausage sandwich on thick Italian toast, with cheese, lettuce, tomato and mayo" +"2","Classic breakfast joint. Grimy looking hole in the wall located on one end of a seedy looking strip mall. Window is opaque due to the grease so you can't hardly see inside. On the outside, there are about a dozen people waiting to get in. When you finally do get inside, you see that there are 15 tables and a counter, all occupied by people from all walks of life.\n\nWhat's the attraction behind this flea hole? The FOOD! Lots of it and dirt cheap. 
I sat at a vacant stool behind the formica counter and ordered the mixed grill. Potatoes, eggs, sausage, bacon and Italian toast. A giant mound of food guaranteed to sooth any hangover. I swear the full mixed grill had two pounds of food. Neat thing is that the grill is right in front of you so you can see your potatoes and eggs frying in a pool of fresh grease. All that food, plus coffee and tip for around ten bucks. Cash only, so put that plastic away.\n\nOnly bad thing that could happen is some douche bag from the Food Network or Travel Channel will make this place famous, and then I'll never be able to get in." +"1","Some of the worst pizza I've ever had. We used a coupon from the paper for a 2 topping 8 cut Sicilian. First of all the pizza wasn't even cut through, and the sad attempt at cutting was so uneven that 4 of the slices were about an inch wide, while the others were about 4\"" each. The toppings were scarce, they used mini pepperoni and put maybe 8 on the whole pizza. The onions were huge chunks and the mushrooms were straight from a can. The worst part though was the thick doughy crust that tasted more like a fishy sourdough roll. I'm serious... It was so noticeable that it made me wonder if the dough was bad or if they for some weird reason put fish sauce in it. It was gross. \n\nWe also ordered steak and Italian hoagies. The veggies were old and wilted, and there was no dressing on either. The Italian had deli meat that was clearly bottom of the line and not very generous. The \""steak\"" (if you an call it that) was greyish instead of brown and looked like it was a processed meat chopped into pieces. No flavor or seasoning and the texture was reminiscent of spam. It was so bad that I only ate 1/4 of it and tossed the rest. \n\nI have ordered from here in the past and always been disappointed. I thought I would give them another try since I'd never ordered a Sicilian pizza from there. What a mistake. I will never order from them again!" 
+"1","Terrible service. Food unremarkable. Waiter disappeared for 45 minutes to serve larger group due to staffing mismanagement. Saved his tip by discounting meal after I complained. All and all, a very crude and unpleasant dining experience for me and my guests. Not to be repeated, never again!" diff --git a/tests/data_for_tests/io/yelp_review_polarity/test.csv b/tests/data_for_tests/io/yelp_review_polarity/test.csv new file mode 100755 index 00000000..95ac34f3 --- /dev/null +++ b/tests/data_for_tests/io/yelp_review_polarity/test.csv @@ -0,0 +1,6 @@ +"2","Contrary to other reviews, I have zero complaints about the service or the prices. I have been getting tire service here for the past 5 years now, and compared to my experience with places like Pep Boys, these guys are experienced and know what they're doing. \nAlso, this is one place that I do not feel like I am being taken advantage of, just because of my gender. Other auto mechanics have been notorious for capitalizing on my ignorance of cars, and have sucked my bank account dry. But here, my service and road coverage has all been well explained - and let up to me to decide. \nAnd they just renovated the waiting room. It looks a lot better than it did in previous years." +"1","Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They \""fixed\"" it for free, and the very next morning I had the same issue. I called to complain, and the \""manager\"" didn't even apologize!!! So frustrated. Never going back. They seem overpriced, too." +"2","Friendly staff, same starbucks fair you get anywhere else. Sometimes the lines can get long." +"1","The food is good. Unfortunately the service is very hit or miss. 
The main issue seems to be with the kitchen, the waiters and waitresses are often very apologetic for the long waits and it's pretty obvious that some of them avoid the tables after taking the initial order to avoid hearing complaints." +"2","Even when we didn't have a car Filene's Basement was worth the bus trip to the Waterfront. I always find something (usually I find 3-4 things and spend about $60) and better still, I am always still wearing the clothes and shoes 3 months later. \n\nI kind of suspect this is the best shopping in Pittsburgh; it's much better than the usual department stores, better than Marshall's and TJ Maxx and better than the Saks downtown, even when it has a sale. Selection, bargains AND quality.\n\nI like this Filene's better than Gabriel Brothers, which are harder to get to. Gabriel Brothers are a real discount shopper's challenge and I'm afraid I didn't live in Pittsburgh long enough to develop the necessary skills . . . Filene's was still up and running in June 2007 when I left town." +"2","Picture Billy Joel's \""Piano Man\"" DOUBLED mixed with beer, a rowdy crowd, and comedy - Welcome to Sing Sing! A unique musical experience found in Homestead.\n\nIf you're looking to grab a bite to eat or a beer, come on in! Serving food and brews from Rock Bottom Brewery, Sing Sing keeps your tummy full while you listen to two (or more) amazingly talented pianists take your musical requests. They'll play anything you'd like, for tips of course. Wanting to hear Britney Spears? Toto? Duran Duran? Yep, they play that... new or old.\n\nThe crowd makes the show, so make sure you come ready for a good time. If the crowd is dead, it's harder for the Guys to get a reaction. If you're wanting to have some fun, it can be a GREAT time! It's the perfect place for Birthday parties - especially if you want to embarrass a friend. The guys will bring them up to the pianos and perform a little ditty. 
For being a good sport, you get the coveted Sing Sing bumper sticker. Now who wouldn't want that?\n\nDueling Pianos and brews... time to Shut Up & Sing Sing!" diff --git a/tests/data_for_tests/io/yelp_review_polarity/train.csv b/tests/data_for_tests/io/yelp_review_polarity/train.csv new file mode 100755 index 00000000..6b72a7d6 --- /dev/null +++ b/tests/data_for_tests/io/yelp_review_polarity/train.csv @@ -0,0 +1,6 @@ +"1","Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff. It seems that his staff simply never answers the phone. It usually takes 2 hours of repeated calling to get an answer. Who has time for that or wants to deal with it? I have run into this problem with many other doctors and I just don't get it. You have office workers, you have patients with medical needs, why isn't anyone answering the phone? It's incomprehensible and not work the aggravation. It's with regret that I feel that I have to give Dr. Goldberg 2 stars." +"2","Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life." +"1","I don't know what Dr. Goldberg was like before moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. 
Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there when you really need them. I have never felt compelled to write a bad review about anyone until I met this pathetic excuse for a doctor who is all about the money." +"1","I'm writing this review to give you a heads up before you see this Doctor. The office staff and administration are very unprofessional. I left a message with multiple people regarding my bill, and no one ever called me back. I had to hound them to get an answer about my bill. \n\nSecond, and most important, make sure your insurance is going to cover Dr. Goldberg's visits and blood work. He recommended to me that I get a physical, and he knew I was a student because I told him. I got the physical done. Later, I found out my health insurance doesn't pay for preventative visits. I received an $800.00 bill for the blood work. I can't pay for my bill because I'm a student and don't have any cash flow at this current time. I can't believe the Doctor wouldn't give me a heads up to make sure my insurance would cover work that wasn't necessary and was strictly preventative. The office can't do anything to help me cover the bill. In addition, the office staff said the onus is on me to make sure my insurance covers visits. Frustrating situation!" +"2","All the food is great here. But the best thing they have is their wings. Their wings are simply fantastic!! The \""Wet Cajun\"" are by the best & most popular. I also like the seasoned salt wings. 
Wing Night is Monday & Wednesday night, $0.75 whole wings!\n\nThe dining area is nice. Very family friendly! The bar is very nice is well. This place is truly a Yinzer's dream!! \""Pittsburgh Dad\"" would love this place n'at!!" +"1","Wing sauce is like water. Pretty much a lot of butter and some hot sauce (franks red hot maybe). The whole wings are good size and crispy, but for $1 a wing the sauce could be better. The hot and extra hot are about the same flavor/heat. The fish sandwich is good and is a large portion, sides are decent." diff --git a/tests/data_for_tests/people.txt b/tests/data_for_tests/people.txt new file mode 100755 index 00000000..9ef0de6d --- /dev/null +++ b/tests/data_for_tests/people.txt @@ -0,0 +1,307 @@ +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +中 B-nt +共 M-nt +中 M-nt +央 E-nt +总 B-n +书 M-n +记 E-n +、 S-w +国 B-n +家 E-n +主 B-n +席 E-n +江 B-nr +泽 M-nr +民 E-nr + +( S-w +一 B-t +九 M-t +九 M-t +七 M-t +年 E-t +十 B-t +二 M-t +月 E-t +三 B-t +十 M-t +一 M-t +日 E-t +) S-w + +1 B-t +2 M-t +月 E-t +3 B-t +1 M-t +日 E-t +, S-w +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +中 B-nt 
+共 M-nt +中 M-nt +央 E-nt +总 B-n +书 M-n +记 E-n +、 S-w +国 B-n +家 E-n +主 B-n +席 E-n +江 B-nr +泽 M-nr +民 E-nr + +( S-w +一 B-t +九 M-t +九 M-t +七 M-t +年 E-t +十 B-t +二 M-t +月 E-t +三 B-t +十 M-t +一 M-t +日 E-t +) S-w + +1 B-t +2 M-t +月 E-t +3 B-t +1 M-t +日 E-t +, S-w +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w \ No newline at end of file diff --git a/tests/data_for_tests/people_daily_raw.txt b/tests/data_for_tests/people_daily_raw.txt new file mode 100755 index 00000000..8255edb6 --- /dev/null +++ b/tests/data_for_tests/people_daily_raw.txt @@ -0,0 +1,27 @@ +19980101-01-001-001/m 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n ——/w 一九九八年/t 新年/t 讲话/n (/w 附/v 图片/n 1/m 张/q )/w +19980101-01-001-002/m 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr +19980101-01-001-003/m (/w 一九九七年/t 十二月/t 三十一日/t )/w +19980101-01-001-004/m 12月/t 31日/t ,/w 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr 发表/v 1998年/t 新年/t 讲话/n 《/w 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n 》/w 。/w (/w 新华社/nt 记者/n 兰/nr 红光/nr 摄/Vg )/w +19980101-01-001-005/m 同胞/n 们/k 、/w 朋友/n 们/k 、/w 女士/n 们/k 、/w 先生/n 们/k :/w +19980101-01-001-006/m 在/p 1998年/t 来临/v 之际/f ,/w 我/r 十分/m 高兴/a 地/u 通过/p [中央/n 人民/n 广播/vn 电台/n]nt 、/w [中国/ns 国际/n 广播/vn 电台/n]nt 和/c [中央/n 电视台/n]nt ,/w 向/p 全国/n 各族/r 人民/n ,/w 向/p [香港/ns 特别/a 行政区/n]ns 同胞/n 、/w 澳门/ns 和/c 台湾/ns 同胞/n 、/w 海外/s 侨胞/n ,/w 向/p 世界/n 各国/r 的/u 朋友/n 们/k ,/w 致以/v 诚挚/a 的/u 问候/vn 和/c 良好/a 的/u 祝愿/vn !/w +19980101-01-001-007/m 1997年/t ,/w 是/v 中国/ns 发展/vn 历史/n 上/f 非常/d 重要/a 的/u 很/d 不/d 平凡/a 的/u 一/m 年/q 。/w 中国/ns 人民/n 
决心/d 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 继续/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 事业/n 推向/v 前进/v 。/w [中国/ns 政府/n]nt 顺利/ad 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 并/c 按照/p “/w 一国两制/j ”/w 、/w “/w 港人治港/l ”/w 、/w 高度/d 自治/v 的/u 方针/n 保持/v 香港/ns 的/u 繁荣/an 稳定/an 。/w [中国/ns 共产党/n]nt 成功/a 地/u 召开/v 了/u 第十五/m 次/q 全国/n 代表大会/n ,/w 高举/v 邓小平理论/n 伟大/a 旗帜/n ,/w 总结/v 百年/m 历史/n ,/w 展望/v 新/a 的/u 世纪/n ,/w 制定/v 了/u 中国/ns 跨/v 世纪/n 发展/v 的/u 行动/vn 纲领/n 。/w +19980101-01-001-008/m 在/p 这/r 一/m 年/q 中/f ,/w 中国/ns 的/u 改革/vn 开放/vn 和/c 现代化/vn 建设/vn 继续/v 向前/v 迈进/v 。/w 国民经济/n 保持/v 了/u “/w 高/a 增长/vn 、/w 低/a 通胀/j ”/w 的/u 良好/a 发展/vn 态势/n 。/w 农业/n 生产/vn 再次/d 获得/v 好/a 的/u 收成/n ,/w 企业/n 改革/vn 继续/v 深化/v ,/w 人民/n 生活/vn 进一步/d 改善/v 。/w 对外/vn 经济/n 技术/n 合作/vn 与/c 交流/vn 不断/d 扩大/v 。/w 民主/a 法制/n 建设/vn 、/w 精神文明/n 建设/vn 和/c 其他/r 各项/r 事业/n 都/d 有/v 新/a 的/u 进展/vn 。/w 我们/r 十分/m 关注/v 最近/t 一个/m 时期/n 一些/m 国家/n 和/c 地区/n 发生/v 的/u 金融/n 风波/n ,/w 我们/r 相信/v 通过/p 这些/r 国家/n 和/c 地区/n 的/u 努力/an 以及/c 有关/v 的/u 国际/n 合作/vn ,/w 情况/n 会/v 逐步/d 得到/v 缓解/vn 。/w 总的来说/c ,/w 中国/ns 改革/v 和/c 发展/v 的/u 全局/n 继续/v 保持/v 了/u 稳定/an 。/w +19980101-01-001-009/m 在/p 这/r 一/m 年/q 中/f ,/w 中国/ns 的/u 外交/n 工作/vn 取得/v 了/u 重要/a 成果/n 。/w 通过/p 高层/n 互访/v ,/w 中国/ns 与/p 美国/ns 、/w 俄罗斯/ns 、/w 法国/ns 、/w 日本/ns 等/u 大国/n 确定/v 了/u 双方/n 关系/n 未来/t 发展/v 的/u 目标/n 和/c 指导/vn 方针/n 。/w 中国/ns 与/p 周边/n 国家/n 和/c 广大/b 发展中国家/l 的/u 友好/a 合作/vn 进一步/d 加强/v 。/w 中国/ns 积极/ad 参与/v [亚/j 太/j 经合/j 组织/n]nt 的/u 活动/vn ,/w 参加/v 了/u 东盟/ns —/w 中/j 日/j 韩/j 和/c 中国/ns —/w 东盟/ns 首脑/n 非正式/b 会晤/vn 。/w 这些/r 外交/n 活动/vn ,/w 符合/v 和平/n 与/c 发展/v 的/u 时代/n 主题/n ,/w 顺应/v 世界/n 走向/v 多极化/v 的/u 趋势/n ,/w 对于/p 促进/v 国际/n 社会/n 的/u 友好/a 合作/vn 和/c 共同/b 发展/vn 作出/v 了/u 积极/a 的/u 贡献/n 。/w +19980101-01-001-010/m 1998年/t ,/w 中国/ns 人民/n 将/d 满怀信心/l 地/u 开创/v 新/a 的/u 业绩/n 。/w 尽管/c 我们/r 在/p 经济/n 社会/n 发展/v 中/f 还/d 面临/v 不少/m 困难/an ,/w 但/c 我们/r 有/v 邓小平理论/n 的/u 指引/vn ,/w 有/v 改革/v 开放/v 近/a 20/m 年/q 来/f 取得/v 的/u 伟大/a 成就/n 和/c 积累/v 的/u 丰富/a 经验/n ,/w 还/d 有/v 其他/r 的/u 各种/r 有利/a 条件/n ,/w 我们/r 一定/d 能够/v 克服/v 这些/r 困难/an ,/w 继续/v 稳步前进/l 。/w 只要/c 我们/r 进一步/d 解放思想/i ,/w 实事求是/i 
,/w 抓住/v 机遇/n ,/w 开拓进取/l ,/w 建设/v 有/v 中国/ns 特色/n 社会主义/n 的/u 道路/n 就/c 会/v 越/d 走/v 越/d 宽广/a 。/w +19980101-01-001-011/m 实现/v 祖国/n 的/u 完全/a 统一/vn ,/w 是/v 海内外/s 全体/n 中国/ns 人/n 的/u 共同/b 心愿/n 。/w 通过/p 中/j 葡/j 双方/n 的/u 合作/vn 和/c 努力/an ,/w 按照/p “/w 一国两制/j ”/w 方针/n 和/c 澳门/ns 《/w 基本法/n 》/w ,/w 1999年/t 12月/t 澳门/ns 的/u 回归/vn 一定/d 能够/v 顺利/ad 实现/v 。/w +19980101-01-001-012/m 台湾/ns 是/v 中国/ns 领土/n 不可分割/l 的/u 一/m 部分/n 。/w 完成/v 祖国/n 统一/vn ,/w 是/v 大势所趋/i ,/w 民心所向/l 。/w 任何/r 企图/v 制造/v “/w 两/m 个/q 中国/ns ”/w 、/w “/w 一中一台/j ”/w 、/w “/w 台湾/ns 独立/v ”/w 的/u 图谋/n ,/w 都/d 注定/v 要/v 失败/v 。/w 希望/v 台湾/ns 当局/n 以/p 民族/n 大义/n 为重/v ,/w 拿/v 出/v 诚意/n ,/w 采取/v 实际/a 的/u 行动/vn ,/w 推动/v 两岸/n 经济/n 文化/n 交流/vn 和/c 人员/n 往来/vn ,/w 促进/v 两岸/n 直接/ad 通邮/v 、/w 通航/v 、/w 通商/v 的/u 早日/d 实现/v ,/w 并/c 尽早/d 回应/v 我们/r 发出/v 的/u 在/p 一个/m 中国/ns 的/u 原则/n 下/f 两岸/n 进行/v 谈判/vn 的/u 郑重/a 呼吁/vn 。/w +19980101-01-001-013/m 环顾/v 全球/n ,/w 日益/d 密切/a 的/u 世界/n 经济/n 联系/vn ,/w 日新月异/i 的/u 科技/n 进步/vn ,/w 正在/d 为/p 各国/r 经济/n 的/u 发展/vn 提供/v 历史/n 机遇/n 。/w 但是/c ,/w 世界/n 还/d 不/d 安宁/a 。/w 南北/f 之间/f 的/u 贫富/n 差距/n 继续/v 扩大/v ;/w 局部/n 冲突/vn 时有发生/l ;/w 不/d 公正/a 不/d 合理/a 的/u 旧/a 的/u 国际/n 政治/n 经济/n 秩序/n 还/d 没有/v 根本/a 改变/vn ;/w 发展中国家/l 在/p 激烈/a 的/u 国际/n 经济/n 竞争/vn 中/f 仍/d 处于/v 弱势/n 地位/n ;/w 人类/n 的/u 生存/vn 与/c 发展/vn 还/d 面临/v 种种/q 威胁/vn 和/c 挑战/vn 。/w 和平/n 与/c 发展/vn 的/u 前景/n 是/v 光明/a 的/u ,/w 21/m 世纪/n 将/d 是/v 充满/v 希望/n 的/u 世纪/n 。/w 但/c 前进/v 的/u 道路/n 不/d 会/v 也/d 不/d 可能/v 一帆风顺/i ,/w 关键/n 是/v 世界/n 各国/r 人民/n 要/v 进一步/d 团结/a 起来/v ,/w 共同/d 推动/v 早日/d 建立/v 公正/a 合理/a 的/u 国际/n 政治/n 经济/n 新/a 秩序/n 。/w +19980101-01-001-014/m [中国/ns 政府/n]nt 将/d 继续/v 坚持/v 奉行/v 独立自主/i 的/u 和平/n 外交/n 政策/n ,/w 在/p 和平共处/l 五/m 项/q 原则/n 的/u 基础/n 上/f 努力/ad 发展/v 同/p 世界/n 各国/r 的/u 友好/a 关系/n 。/w 中国/ns 愿意/v 加强/v 同/p 联合国/nt 和/c 其他/r 国际/n 组织/n 的/u 协调/vn ,/w 促进/v 在/p 扩大/v 经贸/j 科技/n 交流/vn 、/w 保护/v 环境/n 、/w 消除/v 贫困/an 、/w 打击/v 国际/n 犯罪/vn 等/u 方面/n 的/u 国际/n 合作/vn 。/w 中国/ns 永远/d 是/v 维护/v 世界/n 和平/n 与/c 稳定/an 的/u 重要/a 力量/n 。/w 中国/ns 人民/n 愿/v 与/p 世界/n 各国/r 人民/n 一道/d ,/w 为/p 开创/v 持久/a 和平/n 、/w 共同/d 发展/v 的/u 新/a 世纪/n 而/c 
不懈努力/l !/w +19980101-01-001-015/m 在/p 这/r 辞旧迎新/l 的/u 美好/a 时刻/n ,/w 我/r 祝/v 大家/r 新年/t 快乐/a ,/w 家庭/n 幸福/a !/w +19980101-01-001-016/m 谢谢/v !/w (/w 新华社/nt 北京/ns 12月/t 31日/t 电/n )/w + +19980101-01-002-001/m 在/p 十五大/j 精神/n 指引/vn 下/f 胜利/vd 前进/v ——/w 元旦/t 献辞/n +19980101-01-002-002/m 我们/r 即将/d 以/p 丰收/vn 的/u 喜悦/an 送/v 走/v 牛年/t ,/w 以/p 昂扬/a 的/u 斗志/n 迎来/v 虎年/t 。/w 我们/r 伟大/a 祖国/n 在/p 新/a 的/u 一/m 年/q ,/w 将/d 是/v 充满/v 生机/n 、/w 充满/v 希望/n 的/u 一/m 年/q 。/w +19980101-01-002-003/m 刚刚/d 过去/v 的/u 一/m 年/q ,/w 大气磅礴/i ,/w 波澜壮阔/i 。/w 在/p 这/r 一/m 年/q ,/w 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt ,/w 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 高举/v 邓小平理论/n 的/u 伟大/a 旗帜/n ,/w 领导/v 全党/n 和/c 全国/n 各族/r 人民/n 坚定不移/i 地/u 沿着/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 道路/n 阔步/d 前进/v ,/w 写/v 下/v 了/u 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 辉煌/a 篇章/n 。/w 顺利/a 地/u 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 胜利/v 地/u 召开/v 党/n 的/u 第十五/m 次/q 全国/n 代表大会/n ———/w 两/m 件/q 大事/n 办/v 得/u 圆满/a 成功/a 。/w 国民经济/n 稳中求进/l ,/w 国家/n 经济/n 实力/n 进一步/d 增强/v ,/w 人民/n 生活/vn 继续/v 改善/v ,/w 对外/vn 经济/n 技术/n 交流/vn 日益/d 扩大/v 。/w 在/p 国际/n 金融/n 危机/n 的/u 风浪/n 波及/v 许多/m 国家/n 的/u 情况/n 下/f ,/w 我国/n 保持/v 了/u 金融/n 形势/n 和/c 整个/b 经济/n 形势/n 的/u 稳定/a 发展/vn 。/w 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 取得/v 新/a 的/u 成绩/n ,/w 各项/r 社会/n 事业/n 全面/ad 进步/v 。/w 外交/n 工作/vn 取得/v 可喜/a 的/u 突破/vn ,/w 我国/n 的/u 国际/n 地位/n 和/c 国际/n 威望/n 进一步/d 提高/v 。/w 实践/v 使/v 亿万/m 人民/n 对/p 邓小平理论/n 更加/d 信仰/v ,/w 对/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 更加/d 信赖/v ,/w 对/p 伟大/a 祖国/n 的/u 光辉/n 前景/n 更加/d 充满/v 信心/n 。/w +19980101-01-002-004/m 1998年/t ,/w 是/v 全面/ad 贯彻/v 落实/v 党/n 的/u 十五大/j 提出/v 的/u 任务/n 的/u 第一/m 年/q ,/w 各/r 条/q 战线/n 改革/v 和/c 发展/v 的/u 任务/n 都/d 十分/m 繁重/a ,/w 有/v 许多/m 深/a 层次/n 的/u 矛盾/an 和/c 问题/n 有待/v 克服/v 和/c 解决/v ,/w 特别/d 是/v 国有/vn 企业/n 改革/vn 已经/d 进入/v 攻坚/vn 阶段/n 。/w 我们/r 必须/d 进一步/d 深入/ad 学习/v 和/c 掌握/v 党/n 的/u 十五大/j 精神/n ,/w 统揽全局/l ,/w 精心/ad 部署/v ,/w 狠抓/v 落实/v ,/w 团结/a 一致/a ,/w 艰苦奋斗/i ,/w 开拓/v 前进/v ,/w 为/p 夺取/v 今年/t 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 新/a 胜利/vn 而/c 奋斗/v 。/w +19980101-01-002-005/m 今年/t 是/v 党/n 的/u 十一/m 
届/q 三中全会/j 召开/v 20/m 周年/q ,/w 是/v 我们/r 党/n 和/c 国家/n 实现/v 伟大/a 的/u 历史/n 转折/vn 、/w 进入/v 改革/vn 开放/vn 历史/n 新/a 时期/n 的/u 20/m 周年/q 。/w 在/p 新/a 的/u 一/m 年/q 里/f ,/w 大力/d 发扬/v 十一/m 届/q 三中全会/j 以来/f 我们/r 党/n 所/u 恢复/v 的/u 优良/z 传统/n 和/c 在/p 新/a 的/u 历史/n 条件/n 下/f 形成/v 的/u 优良/z 作风/n ,/w 对于/p 完成/v 好/a 今年/t 的/u 各项/r 任务/n 具有/v 十分/m 重要/a 的/u 意义/n 。/w +19980101-01-002-006/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 解放思想/i 、/w 实事求是/i 的/u 思想/n 路线/n 。/w 解放思想/i 、/w 实事求是/i ,/w 是/v 邓小平理论/n 的/u 精髓/n 。/w 实践/v 证明/v ,/w 只有/c 解放思想/i 、/w 实事求是/i ,/w 才/c 能/v 冲破/v 各种/r 不/d 切合/v 实际/n 的/u 或者/c 过时/a 的/u 观念/n 的/u 束缚/vn ,/w 真正/d 做到/v 尊重/v 、/w 认识/v 和/c 掌握/v 客观/a 规律/n ,/w 勇于/v 突破/v ,/w 勇于/v 创新/v ,/w 不断/d 开创/v 社会主义/n 现代化/vn 建设/vn 的/u 新/a 局面/n 。/w 党/n 的/u 十五大/j 是/v 我们/r 党/n 解放思想/i 、/w 实事求是/i 的/u 新/a 的/u 里程碑/n 。/w 进一步/d 认真/ad 学习/v 和/c 掌握/v 十五大/j 精神/n ,/w 解放思想/i 、/w 实事求是/i ,/w 我们/r 的/u 各项/r 事业/n 就/d 能/v 结/v 出/v 更加/d 丰硕/a 的/u 成果/n 。/w +19980101-01-002-007/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 。/w 各项/r 工作/vn 必须/d 以/p 经济/n 建设/vn 为/v 中心/n ,/w 是/v 邓小平理论/n 的/u 基本/a 观点/n ,/w 是/v 党/n 的/u 基本/a 路线/n 的/u 核心/n 内容/n ,/w 近/a 20/m 年/q 来/f 的/u 实践/vn 证明/v ,/w 坚持/v 这个/r 中心/n ,/w 是/v 完全/ad 正确/a 的/u 。/w 今后/t ,/w 我们/r 能否/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 伟大/a 事业/n 全面/ad 推向/v 21/m 世纪/n ,/w 关键/n 仍然/d 要/v 看/v 能否/v 把/p 经济/n 工作/vn 搞/v 上去/v 。/w 各级/r 领导/n 干部/n 要/v 切实/ad 把/p 精力/n 集中/v 到/v 贯彻/v 落实/v 好/a 中央/n 关于/p 今年/t 经济/n 工作/vn 的/u 总体/n 要求/n 和/c 各项/r 重要/a 任务/n 上/f 来/v ,/w 不断/d 提高/v 领导/v 经济/n 建设/vn 的/u 能力/n 和/c 水平/n 。/w +19980101-01-002-008/m 我们/r 要/v 更/d 好/a 地/u 坚持/v “/w 两手抓/l 、/w 两手/m 都/d 要/v 硬/a ”/w 的/u 方针/n 。/w 在/p 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 的/u 同时/n ,/w 积极/ad 推进/v 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn ,/w 是/v 建设/v 富强/a 、/w 民主/a 、/w 文明/a 的/u 社会主义/n 现代化/vn 国家/n 的/u 重要/a 内容/n 。/w 实践/v 证明/v ,/w 经济/n 建设/vn 的/u 顺利/a 进行/vn ,/w 离/v 不/d 开/v 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 的/u 保证/vn 。/w 党/n 的/u 十五大/j 依据/p 邓小平理论/n 和/c 党/n 的/u 基本/a 路线/n 提出/v 的/u 党/n 在/p 社会主义/n 初级/b 阶段/n 经济/n 、/w 政治/n 、/w 文化/n 的/u 基本/a 纲领/n ,/w 为/p “/w 两手抓/l 、/w 两手/m 都/d 要/v 
硬/a ”/w 提供/v 了/u 新/a 的/u 理论/n 根据/n ,/w 提出/v 了/u 更/d 高/a 要求/n ,/w 现在/t 的/u 关键/n 是/v 认真/ad 抓好/v 落实/v 。/w +19980101-01-002-009/m 我们/r 要/v 更/d 好/a 地/u 发扬/v 求真务实/l 、/w 密切/ad 联系/v 群众/n 的/u 作风/n 。/w 这/r 是/v 把/p 党/n 的/u 方针/n 、/w 政策/n 落到实处/l ,/w 使/v 改革/v 和/c 建设/v 取得/v 胜利/vn 的/u 重要/a 保证/vn 。/w 在/p 当前/t 改革/v 进一步/d 深化/v ,/w 经济/n 不断/d 发展/v ,/w 同时/c 又/d 出现/v 一些/m 新/a 情况/n 、/w 新/a 问题/n 和/c 新/a 困难/an 的/u 形势/n 下/f ,/w 更/d 要/v 发扬/v 这样/r 的/u 好/a 作风/n 。/w 要/v 尊重/v 群众/n 的/u 意愿/n ,/w 重视/v 群众/n 的/u 首创/vn 精神/n ,/w 关心/v 群众/n 的/u 生活/vn 疾苦/n 。/w 江/nr 泽民/nr 同志/n 最近/t 强调/vd 指出/v ,/w 要/v 大力/d 倡导/v 说实话/l 、/w 办/v 实事/n 、/w 鼓/v 实劲/n 、/w 讲/v 实效/n 的/u 作风/n ,/w 坚决/ad 制止/v 追求/v 表面文章/i ,/w 搞/v 花架子/n 等/u 形式主义/n ,/w 坚决/ad 杜绝/v 脱离/v 群众/n 、/w 脱离/v 实际/n 、/w 浮躁/a 虚夸/v 等/u 官僚主义/n 。/w 这/r 是/v 非常/d 重要/a 的/u 。/w 因此/c ,/w 各级/r 领导/n 干部/n 务必/d 牢记/v 全心全意/i 为/p 人民/n 服务/v 的/u 宗旨/n ,/w 在/p 勤政廉政/l 、/w 艰苦奋斗/i 方面/n 以身作则/i ,/w 当/v 好/a 表率/n 。/w +19980101-01-002-010/m 1998/m ,/w 瞩目/v 中华/nz 。/w 新/a 的/u 机遇/n 和/c 挑战/vn ,/w 催/v 人/n 进取/v ;/w 新/a 的/u 目标/n 和/c 征途/n ,/w 催/v 人/n 奋发/v 。/w 英雄/n 的/u 中国/ns 人民/n 在/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 坚强/a 领导/vn 和/c 党/n 的/u 十五大/j 精神/n 指引/v 下/f ,/w 更/d 高/a 地/u 举起/v 邓小平理论/n 的/u 伟大/a 旗帜/n ,/w 团结/a 一致/a ,/w 扎实/ad 工作/v ,/w 奋勇/d 前进/v ,/w 一定/d 能够/v 创造/v 出/v 更加/d 辉煌/a 的/u 业绩/n !/w diff --git a/tests/data_for_tests/sample_mnli.tsv b/tests/data_for_tests/sample_mnli.tsv new file mode 100755 index 00000000..9a30b95b --- /dev/null +++ b/tests/data_for_tests/sample_mnli.tsv @@ -0,0 +1,12 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 label2 label3 label4 label5 gold_label +0 63735 63735n slate ( ( The ( new rights ) ) ( are ( nice enough ) ) ) ( Everyone ( really ( likes ( the ( newest benefits ) ) ) ) ) (ROOT (S (NP (DT The) (JJ new) (NNS rights)) (VP (VBP are) (ADJP (JJ nice) (RB enough))))) (ROOT (S (NP (NN Everyone)) (VP (ADVP (RB really)) (VBZ likes) (NP (DT the) (JJS newest) (NNS benefits))))) The new 
rights are nice enough Everyone really likes the newest benefits neutral entailment neutral neutral neutral neutral +1 91383 91383c government ( ( This site ) ( ( includes ( ( ( ( a list ) ( of ( all ( award winners ) ) ) ) and ) ( ( a ( searchable database ) ) ( of ( Government ( Executive articles ) ) ) ) ) ) . ) ) ( ( ( The ( Government ( Executive articles ) ) ) ( housed ( on ( the website ) ) ) ) ( ( ( are not ) ( able ( to ( be searched ) ) ) ) . ) ) (ROOT (S (NP (DT This) (NN site)) (VP (VBZ includes) (NP (NP (NP (DT a) (NN list)) (PP (IN of) (NP (DT all) (NN award) (NNS winners)))) (CC and) (NP (NP (DT a) (JJ searchable) (NN database)) (PP (IN of) (NP (NNP Government) (NNP Executive) (NNS articles)))))) (. .))) (ROOT (S (NP (NP (DT The) (NNP Government) (NNP Executive) (NNS articles)) (VP (VBN housed) (PP (IN on) (NP (DT the) (NN website))))) (VP (VBP are) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB be) (ADJP (JJ searched))))))) (. .))) This site includes a list of all award winners and a searchable database of Government Executive articles. The Government Executive articles housed on the website are not able to be searched. contradiction contradiction contradiction contradiction contradiction contradiction +2 755 755e telephone ( ( ( ( uh ( i ( ( do n't ) ( know ( ( i i ) ( have ( ( mixed emotions ) ( about ( him ( ( uh sometimes ) ( i ( like him ) ) ) ) ) ) ) ) ) ) ) ) but ) ( ( at ( the ( same times ) ) ) ( i ( love ( to ( see somebody ) ) ) ) ) ) ( beat him ) ) ( I ( ( ( ( ( ( like him ) ( for ( the ( most part ) ) ) ) , ) but ) ( ( would still ) ( enjoy ( seeing ( someone ( beat him ) ) ) ) ) ) . 
) ) (ROOT (SINV (S (S (INTJ (UH uh)) (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP have) (VP (VBN mixed) (NP (NNS emotions)) (PP (IN about) (S (NP (PRP him)) (VP (VBG uh) (ADVP (RB sometimes)) (NP (NP (FW i)) (PP (IN like) (NP (PRP him))))))))))))))) (CC but) (S (PP (IN at) (NP (DT the) (JJ same) (NNS times))) (NP (FW i)) (VP (VBP love) (S (VP (TO to) (VP (VB see) (NP (NN somebody)))))))) (VP (VBD beat)) (NP (PRP him)))) (ROOT (S (NP (PRP I)) (VP (VP (VBP like) (NP (PRP him)) (PP (IN for) (NP (DT the) (JJS most) (NN part)))) (, ,) (CC but) (VP (MD would) (ADVP (RB still)) (VP (VB enjoy) (S (VP (VBG seeing) (S (NP (NN someone)) (VP (VB beat) (NP (PRP him))))))))) (. .))) uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him I like him for the most part, but would still enjoy seeing someone beat him. entailment entailment entailment entailment entailment entailment +3 78013 78013c telephone ( yeah ( ( i i ) ( think ( ( my ( favorite restaurant ) ) ( ( is always ) ( been ( ( the ( one closest ) ) ( you ( ( know ( the closest ) ) ( ( as long ) ( as ( it ( 's ( it ( meets ( ( the ( minimum criteria ) ) ( you ( know ( of ( good food ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( My ( favorite restaurants ) ) ( ( ( ( are always ) ( ( ( ( ( at least ) a ) hundred ) miles ) away ) ) ( from ( my house ) ) ) . 
) ) (ROOT (S (VP (VB yeah) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP think) (SBAR (S (NP (PRP$ my) (JJ favorite) (NN restaurant)) (VP (VBZ is) (ADVP (RB always)) (VP (VBN been) (NP (NP (DT the) (CD one) (JJS closest)) (SBAR (S (NP (PRP you)) (VP (VBP know) (NP (DT the) (JJS closest)) (ADVP (ADVP (RB as) (RB long)) (SBAR (IN as) (S (NP (PRP it)) (VP (VBZ 's) (SBAR (S (NP (PRP it)) (VP (VBZ meets) (NP (NP (DT the) (JJ minimum) (NNS criteria)) (SBAR (S (NP (PRP you)) (VP (VBP know) (PP (IN of) (NP (JJ good) (NN food))))))))))))))))))))))))))))) (ROOT (S (NP (PRP$ My) (JJ favorite) (NNS restaurants)) (VP (VBP are) (ADVP (RB always)) (ADVP (NP (QP (IN at) (JJS least) (DT a) (CD hundred)) (NNS miles)) (RB away)) (PP (IN from) (NP (PRP$ my) (NN house)))) (. .))) yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food My favorite restaurants are always at least a hundred miles away from my house. contradiction contradiction contradiction contradiction contradiction contradiction +4 96377 96377c telephone ( i ( ( do n't ) ( know ( um ( do ( you ( do ( ( a lot ) ( of camping ) ) ) ) ) ) ) ) ) ( I ( ( know exactly ) . ) ) (ROOT (S (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (SBAR (S (NP (NN um)) (VP (VBP do) (SBAR (S (NP (PRP you)) (VP (VBP do) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN camping)))))))))))))) (ROOT (S (NP (PRP I)) (VP (VBP know) (ADVP (RB exactly))) (. .))) i don't know um do you do a lot of camping I know exactly. 
contradiction contradiction contradiction contradiction contradiction contradiction +5 139749 139749c telephone ( well ( that ( would ( be ( ( a help ) ( i ( wish ( they ( would ( do ( that ( ( ( here ( we ( have ( got ( so ( ( little ( landfill space ) ) ( left ( that ( we ( 're ( going ( to ( ( run out ) ( before ( ( the end ) ( of ( this decade ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) and ) ( it ( ( 's really ) ( going ( to be ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( We ( ( have ( plenty ( of ( space ( in ( the landfill ) ) ) ) ) ) . ) ) (ROOT (FRAG (ADVP (RB well)) (SBAR (WHNP (WDT that)) (S (VP (MD would) (VP (VB be) (NP (NP (DT a) (NN help)) (SBAR (S (NP (FW i)) (VP (VBP wish) (SBAR (S (NP (PRP they)) (VP (MD would) (VP (VB do) (SBAR (IN that) (S (S (ADVP (RB here)) (NP (PRP we)) (VP (VBP have) (VP (VBN got) (SBAR (IN so) (S (NP (JJ little) (NN landfill) (NN space)) (VP (VBD left) (SBAR (IN that) (S (NP (PRP we)) (VP (VBP 're) (VP (VBG going) (S (VP (TO to) (VP (VB run) (PRT (RP out)) (PP (IN before) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT this) (NN decade)))))))))))))))))) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (ADVP (RB really)) (VP (VBG going) (S (VP (TO to) (VP (VB be))))))))))))))))))))))) (ROOT (S (NP (PRP We)) (VP (VBP have) (NP (NP (RB plenty)) (PP (IN of) (NP (NP (NN space)) (PP (IN in) (NP (DT the) (NN landfill))))))) (. .))) well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be We have plenty of space in the landfill. 
contradiction contradiction contradiction contradiction contradiction contradiction +6 101415 101415c telephone ( yeah ( ( ( i know ) and ) ( i ( did ( that ( ( ( all ( through college ) ) and ) ( it ( worked too ) ) ) ) ) ) ) ) ( I ( ( ( did ( that all ) ) ( through college ) ) ( but ( it ( never worked ) ) ) ) ) (ROOT (S (VP (VB yeah) (S (S (NP (FW i)) (VP (VBP know))) (CC and) (S (NP (FW i)) (VP (VBD did) (SBAR (IN that) (S (S (NP (DT all)) (PP (IN through) (NP (NN college)))) (CC and) (S (NP (PRP it)) (VP (VBD worked) (ADVP (RB too)))))))))))) (ROOT (S (NP (PRP I)) (VP (VBD did) (ADVP (IN that) (DT all)) (PP (IN through) (NP (NN college))) (SBAR (CC but) (S (NP (PRP it)) (ADVP (RB never)) (VP (VBD worked))))))) yeah i know and i did that all through college and it worked too I did that all through college but it never worked contradiction contradiction contradiction contradiction contradiction contradiction +7 93958 93958n travel ( ( ( ( ( Calcutta ( seems ( to ( be ( ( the ( only ( other ( production center ) ) ) ) ( ( having ( any pretensions ) ) ( to ( ( artistic creativity ) ( at all ) ) ) ) ) ) ) ) ) , ) but ) ( ironically ( you ( ( 're actually ) ( ( more ( likely ( to ( see ( ( the works ) ( of ( ( ( Satyajit Ray ) or ) ( ( Mrinal Sen ) ( shown ( in ( Europe ( or ( North America ) ) ) ) ) ) ) ) ) ) ) ) ) ( than ( in ( India itself ) ) ) ) ) ) ) ) . ) ( ( Most ( of ( ( Mrinal ( Sen 's ) ) work ) ) ) ( ( can ( be ( found ( in ( European collections ) ) ) ) ) . 
) ) (ROOT (S (S (NP (NNP Calcutta)) (VP (VBZ seems) (S (VP (TO to) (VP (VB be) (NP (NP (DT the) (JJ only) (JJ other) (NN production) (NN center)) (VP (VBG having) (NP (DT any) (NNS pretensions)) (PP (TO to) (NP (NP (JJ artistic) (NN creativity)) (ADVP (IN at) (DT all))))))))))) (, ,) (CC but) (S (ADVP (RB ironically)) (NP (PRP you)) (VP (VBP 're) (ADVP (RB actually)) (ADJP (ADJP (RBR more) (JJ likely) (S (VP (TO to) (VP (VB see) (NP (NP (DT the) (NNS works)) (PP (IN of) (NP (NP (NNP Satyajit) (NNP Ray)) (CC or) (NP (NP (NNP Mrinal) (NNP Sen)) (VP (VBN shown) (PP (IN in) (NP (NNP Europe) (CC or) (NNP North) (NNP America)))))))))))) (ADVP (IN than) (PP (IN in) (S (VP (VBG India) (NP (PRP itself))))))))) (. .))) (ROOT (S (NP (NP (JJS Most)) (PP (IN of) (NP (NP (NNP Mrinal) (NNP Sen) (POS 's)) (NN work)))) (VP (MD can) (VP (VB be) (VP (VBN found) (PP (IN in) (NP (JJ European) (NNS collections)))))) (. .))) Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself. Most of Mrinal Sen's work can be found in European collections. neutral neutral entailment neutral neutral neutral +8 12567 12567c slate ( ( If ( ( that investor ) ( were ( willing ( to ( pay ( extra ( for ( ( the security ) ( of ( limited downside ) ) ) ) ) ) ) ) ) ) ) ( , ( she ( ( could ( ( buy ( put options ) ) ( with ( ( a ( strike price ) ) ( of ( ( ( $ 98 ) , ) ( which ( would ( ( ( lock ( in ( ( her profit ) ( on ( ( the shares ) ( at ( $ 18 ) ) ) ) ) ) ) , ) ( less ( whatever ( ( the options ) cost ) ) ) ) ) ) ) ) ) ) ) ) . ) ) ) ) ( ( THe ( strike price ) ) ( ( could ( be ( $ 8 ) ) ) . 
) ) (ROOT (S (SBAR (IN If) (S (NP (DT that) (NN investor)) (VP (VBD were) (ADJP (JJ willing) (S (VP (TO to) (VP (VB pay) (NP (NP (JJ extra)) (PP (IN for) (NP (NP (DT the) (NN security)) (PP (IN of) (NP (JJ limited) (NN downside))))))))))))) (, ,) (NP (PRP she)) (VP (MD could) (VP (VB buy) (NP (NN put) (NNS options)) (PP (IN with) (NP (NP (DT a) (NN strike) (NN price)) (PP (IN of) (NP (NP ($ $) (CD 98)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (MD would) (VP (VB lock) (PP (IN in) (NP (NP (PRP$ her) (NN profit)) (PP (IN on) (NP (NP (DT the) (NNS shares)) (PP (IN at) (NP ($ $) (CD 18))))))) (, ,) (ADVP (ADVP (RBR less)) (SBAR (WHNP (WDT whatever)) (S (NP (DT the) (NNS options)) (VP (VBD cost))))))))))))))) (. .))) (ROOT (S (NP (NNP THe) (NN strike) (NN price)) (VP (MD could) (VP (VB be) (NP ($ $) (CD 8)))) (. .))) If that investor were willing to pay extra for the security of limited downside, she could buy put options with a strike price of $98, which would lock in her profit on the shares at $18, less whatever the options cost. THe strike price could be $8. contradiction contradiction contradiction contradiction contradiction contradiction +9 117487 117487n slate ( ( 3 -RRB- ) ( ( Dare ( you ( ( ( rise ( to ( ( ( ( the occasion ) , ) ( like Raskolnikov ) ) , ) ) ) and ) ( reject ( ( the ( petty rules ) ) ( that ( govern ( lesser men ) ) ) ) ) ) ) ) ? ) ) ( ( ( Would you ) ( ( ( rise up ) and ) ( defeaat ( ( all ( evil lords ) ) ( in ( the town ) ) ) ) ) ) ? ) (ROOT (S (LST (LS 3) (-RRB- -RRB-)) (VP (VB Dare) (S (NP (PRP you)) (VP (VP (VB rise) (PP (TO to) (NP (NP (DT the) (NN occasion)) (, ,) (PP (IN like) (NP (NNP Raskolnikov))) (, ,)))) (CC and) (VP (VB reject) (NP (NP (DT the) (JJ petty) (NNS rules)) (SBAR (WHNP (WDT that)) (S (VP (VBP govern) (NP (JJR lesser) (NNS men)))))))))) (. ?))) (ROOT (SQ (MD Would) (NP (PRP you)) (VP (VP (VB rise) (PRT (RP up))) (CC and) (VP (VB defeaat) (NP (NP (DT all) (JJ evil) (NNS lords)) (PP (IN in) (NP (DT the) (NN town)))))) (. 
?))) 3) Dare you rise to the occasion, like Raskolnikov, and reject the petty rules that govern lesser men? Would you rise up and defeaat all evil lords in the town? neutral neutral neutral neutral neutral neutral +10 9616 9616c travel ( ( The ( ( most important ) directions ) ) ( ( ( are ( simply ( ( up and ) up ) ) ) ( ( ( ( ( ( ( ( leads eventually ) ( to ( the cathedral ) ) ) and ) ( fortress ( commanding ( the hilltop ) ) ) ) , ) and ) down ) ( inevitably ( ( leads ( to ( one ( of ( three gates ) ) ) ) ) ( through ( ( the wall ) ( to ( the ( new town ) ) ) ) ) ) ) ) ) . ) ) ( Go ( ( downwards ( to ( one ( of ( ( ( the gates ) , ) ( ( all ( of which ) ) ( will ( ( lead you ) ( into ( the cathedral ) ) ) ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (DT The) (ADJP (RBS most) (JJ important)) (NNS directions)) (VP (VBP are) (PRN (ADVP (RB simply)) (ADVP (RB up) (CC and) (RB up))) (VP (VP (VBZ leads) (ADVP (RB eventually)) (PP (TO to) (NP (DT the) (NN cathedral)))) (CC and) (VP (VBZ fortress) (NP (JJ commanding) (DT the) (NN hilltop))) (, ,) (CC and) (ADVP (RB down)) (VP (ADVP (RB inevitably)) (VBZ leads) (PP (TO to) (NP (NP (CD one)) (PP (IN of) (NP (CD three) (NNS gates))))) (PP (IN through) (NP (NP (DT the) (NN wall)) (PP (TO to) (NP (DT the) (JJ new) (NN town)))))))) (. .))) (ROOT (S (NP (NNP Go)) (VP (VBZ downwards) (PP (TO to) (NP (NP (CD one)) (PP (IN of) (NP (NP (DT the) (NNS gates)) (, ,) (SBAR (WHNP (DT all) (WHPP (IN of) (WHNP (WDT which)))) (S (VP (MD will) (VP (VB lead) (NP (PRP you)) (PP (IN into) (NP (DT the) (NN cathedral)))))))))))) (. .))) The most important directions are simply up and up leads eventually to the cathedral and fortress commanding the hilltop, and down inevitably leads to one of three gates through the wall to the new town. Go downwards to one of the gates, all of which will lead you into the cathedral. 
contradiction contradiction entailment contradiction contradiction contradiction diff --git a/tests/data_for_tests/sample_snli.jsonl b/tests/data_for_tests/sample_snli.jsonl new file mode 100755 index 00000000..e62856ac --- /dev/null +++ b/tests/data_for_tests/sample_snli.jsonl @@ -0,0 +1,3 @@ +{"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"} +{"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . 
) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"} +{"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"} \ No newline at end of file diff --git a/tests/data_for_tests/text_classify.txt b/tests/data_for_tests/text_classify.txt new file mode 100755 index 00000000..24a51ce9 --- /dev/null +++ b/tests/data_for_tests/text_classify.txt @@ -0,0 +1,100 @@ +entertainment 台 媒 预 测 周 冬 雨 金 马 奖 封 后 , 大 气 的 倪 妮 却 佳 作 难 出 +food 农 村 就 是 好 , 能 吃 到 纯 天 然 无 添 加 的 野 生 蜂 蜜 , 营 养 又 健 康 +fashion 1 4 款 知 性 美 装 , 时 尚 惊 艳 搁 浅 的 阳 光 轻 熟 的 优 雅 +history 火 焰 喷 射 器 1 0 0 0 度 火 焰 烧 死 鬼 子 4 连 拍 +society 1 8 岁 青 年 砍 死 8 8 岁 老 兵 +fashion 醋 洗 脸 的 正 确 方 法 洗 对 了 不 仅 美 容 肌 肤 还 能 收 缩 毛 孔 +game 大 家 都 说 说 除 了 这 1 0 个 英 雄 , L O L 还 有 哪 些 英 雄 可 以 单 挑 男 爵 +sports 王 仕 鹏 退 役 担 任 N B A 总 决 赛 现 场 解 说 嘉 宾 +regimen 天 天 吃 “ 洋 快 餐 ” , 5 岁 女 童 患 上 肝 炎 +food 汤 里 的 蛋 花 怎 样 才 能 如 花 朵 般 漂 亮 , 注 意 这 一 点 即 可 ! +tech 英 退 休 人 士 把 谷 歌 当 活 人 以 礼 貌 搜 索 请 求 征 服 整 个 互 联 网 +discovery N A S A 探 测 器 拍 摄 地 球 、 火 星 和 冥 王 星 合 影 +society 当 骗 子 遇 上 撒 贝 宁 ! 几 句 话 过 后 骗 子 赔 礼 道 歉 . . . . . +history 红 军 长 征 在 中 国 革 命 史 上 的 地 位 +world 实 拍 神 秘 之 国 , 带 你 走 进 真 实 的 朝 鲜 +tech 逼 格 爆 表 ! 
古 文 版 2 0 1 6 网 络 热 词 : 燃 尽 洪 荒 之 力 +story 因 为 一 样 东 西 这 个 后 娘 竟 然 给 孩 子 磕 头 +game L O L : 皮 肤 对 操 作 没 影 响 ? 细 数 那 些 有 加 成 效 果 的 皮 肤 +fashion 冬 天 想 穿 裙 子 又 怕 冷 ? 学 了 这 些 搭 配 就 能 好 看 又 温 暖 ! +entertainment 贾 建 军 少 林 三 光 剑 视 频 +food 再 也 不 用 出 去 吃 羊 肉 串 , 自 己 做 又 卫 生 又 健 康 +regimen 男 人 多 吃 这 几 道 菜 , 效 果 胜 “ 伟 哥 ” +baby 宝 贝 厨 房 丨 肉 类 辅 食 第 一 步 宝 宝 的 生 长 发 育 每 天 都 离 不 开 它 ! +travel 近 8 0 亿 的 顶 级 豪 华 邮 轮 上 到 底 有 什 么 ? +sports 厄 齐 尔 心 中 最 想 签 约 的 三 个 人 +food 东 北 的 粘 豆 包 啊 , 想 死 你 们 了 ! +military 强 军 足 音 +sports 奥 运 赛 场 上 , 被 喷 子 痛 批 的 十 大 知 名 运 动 员 +game 老 玩 家 分 享 对 2 0 1 6 L P L 夏 季 赛 R N G 的 分 析 +military 揭 秘 : 关 于 战 争 的 五 大 真 相 , 不 要 再 被 影 视 所 欺 骗 了 ! +food 小 丫 厨 房 : 夏 天 怎 么 吃 辣 不 长 痘 ? 告 诉 你 火 锅 鸡 、 香 辣 鱼 的 正 确 做 法 +travel 中 国 首 个 内 陆 城 市 群 上 的 9 座 城 市 , 看 看 有 你 的 家 乡 吗 +fashion 李 小 璐 做 榜 样 接 亲 吻 脚 大 流 行 新 娘 玉 足 怎 样 才 有 好 味 道 ? +game 黄 金 吊 打 钻 石 ? L O L 最 强 刷 钱 毒 瘤 打 法 诞 生 +history 奇 事 ! 上 万 只 青 蛙 拦 路 告 状 , 竟 然 牵 扯 出 一 桩 命 案 +baby 奶 奶 , 你 为 什 么 不 让 我 用 尿 不 湿 +game L O L 当 5 个 大 发 明 家 炮 台 围 住 泉 水 的 时 候 : 这 是 真 虐 泉 ! +essay 文 友 忠 告 暖 人 心 : 人 到 中 年 “ 不 交 五 友 ” +travel 这 一 年 , 我 们 去 日 本 +food 好 吃 早 饭 近 似 吃 补 药 +fashion 夏 天 太 热 , 唇 膏 化 了 如 何 办 ? +society 厂 里 面 的 9 0 后 打 工 妹 , 辛 苦 来 之 不 易 +history 罕 见 老 照 片 展 示 美 国 大 萧 条 时 期 景 象 +world 美 国 总 统 奥 巴 马 , 是 童 心 未 泯 的 温 情 奥 大 大 , 还 是 个 超 级 老 顽 童 +finance 脱 欧 公 投 前 一 天 抛 售 英 镑 这 一 次 索 罗 斯 也 被 “ 打 败 ” 了 . . . +history 翻 越 长 征 路 上 第 一 座 大 山 +world 朝 鲜 批 奥 巴 马 涉 朝 言 论 , 称 只 要 核 威 胁 存 在 将 继 续 强 化 核 武 力 量 +game 《 巫 师 3 : 狂 猎 》 不 良 因 素 解 析 攻 略 +travel 在 郑 州 有 个 地 方 , 时 光 仿 佛 在 那 儿 停 下 脚 步 +history 它 号 称 “ 天 下 第 一 团 ” , 走 出 过 1 4 位 共 和 国 将 军 以 及 一 位 著 名 作 家 +car 煤 老 板 去 黄 江 买 车 , 以 为 占 了 便 宜 没 想 被 坑 了 1 0 0 多 万 +society “ 试 管 婴 儿 之 母 ” 张 丽 珠 遗 体 告 别 仪 式 8 日 举 行 +sports 东 京 奥 运 会 , 中 国 女 排 卫 冕 的 几 率 有 多 大 ? +travel 成 都 我 们 永 远 依 恋 的 城 市 +tech 雷 布 斯 除 了 小 米 还 有 这 些 秘 密 , 你 知 道 吗 ? 
+world “ 仲 裁 庭 损 害 国 际 法 体 系 公 正 性 ” — — 访 武 汉 大 学 中 国 边 界 与 海 洋 研 究 院 首 席 专 家 易 显 河 +entertainment 上 海 观 众 和 欧 洲 三 大 影 展 之 间 的 距 离 : 零 时 差 +essay 关 系 好 , 一 切 便 好 +baby 刚 出 生 不 到 1 小 时 的 白 鲸 宝 宝 被 冲 上 岸 , 被 救 后 对 恩 人 露 出 微 笑 +tech 赚 足 眼 球 , 诺 基 亚 五 边 形 W i n 1 0 M o b i l e 概 念 手 机 : 棱 镜 +essay 2 4 句 经 典 语 录 : 穷 三 年 可 以 怨 命 , 穷 十 年 就 得 自 省 +food 这 道 菜 真 下 饭 ! 做 法 简 单 , 防 辐 射 、 抗 衰 老 , 关 键 还 便 宜 +entertainment 《 继 承 者 们 》 要 拍 中 国 版 , 众 角 色 你 期 待 谁 来 演 ? +game D N F 暴 走 改 版 后 怎 么 样 D N F 暴 走 改 版 红 眼 变 弱 了 吗 +entertainment 郑 佩 佩 自 曝 与 李 小 龙 的 过 去 他 是 个 “ 疯 子 ” +baby 女 性 只 有 8 4 次 最 佳 受 孕 机 会 +travel 月 初 一 个 人 去 了 日 本 . . +military 不 为 人 知 的 8 0 万 苏 联 女 兵 ! 最 后 一 张 很 美 ! +tech 网 络 商 家 提 供 小 米 5 运 存 升 级 服 务 : 3 G B 秒 变 6 G B +history 宋 太 祖 、 宋 太 宗 凌 辱 亡 国 皇 后 , 徽 钦 二 帝 后 宫 被 金 人 凌 辱 +history 人 有 三 面 最 “ 难 吃 ” ! 黑 帮 大 佬 杜 月 笙 论 江 湖 规 矩 ! 一 生 只 怕 这 一 人 +game 来 了 ! 索 尼 P S 4 独 占 大 作 《 战 神 4 》 正 式 公 布 +discovery 延 时 视 频 显 示 珊 瑚 如 何 “ 驱 逐 ” 共 生 藻 类 +car 传 祺 G A 8 和 东 风 A 9 谁 才 是 自 主 “ 豪 车 ” 大 佬 +fashion 娶 老 婆 就 要 娶 这 种 ! 蒋 欣 这 样 微 胖 的 女 人 好 看 又 实 用 +sports 黄 山 姑 娘 吕 秀 芝 勇 夺 奥 运 铜 牌 数 百 父 老 彻 夜 为 她 加 油 +military [ 每 日 军 图 ] 土 豪 补 仓 ! 沙 特 再 次 购 买 上 百 辆 美 国 M 1 A 2 主 战 坦 克 +military 美 军 这 款 武 器 号 称 能 让 半 个 中 国 陷 入 黑 暗 , 解 放 军 少 将 : 我 们 也 有 +world 邓 小 平 与 日 本 天 皇 的 历 史 性 会 谈 , 对 中 日 两 国 都 具 有 深 远 的 意 义 啊 ! +baby 为 什 么 有 人 上 个 厕 所 都 能 生 出 孩 子 ? +fashion 欣 宜 举 行 首 次 个 唱 十 万 颗 宝 仕 奥 莎 仿 水 晶 闪 耀 全 场 +food 小 两 口 上 周 的 晚 餐 +society 在 北 京 就 要 守 规 矩 +entertainment 知 情 人 曝 翰 爽 分 手 内 幕 : 郑 爽 想 结 婚 却 被 一 直 拖 着 +military 中 国 反 舰 导 弹 世 界 第 一 远 远 超 过 美 国 但 为 何 却 还 不 如 俄 罗 斯 ? +entertainment 他 除 了 是 《 我 歌 》 音 乐 总 监 , 还 曾 组 乐 队 玩 摇 滚 , 是 黄 家 驹 旧 日 知 己 +baby 长 鹅 口 疮 的 孩 子 怎 么 照 顾 ? 不 要 再 说 拿 他 没 办 法 了 ! +discovery 微 重 力 不 需 使 用 肌 肉 , 太 空 人 返 回 地 球 后 脊 椎 旁 肌 肉 萎 缩 约 1 9 % +regimen 这 6 种 人 将 来 会 得 老 年 痴 呆 ! 预 防 老 年 痴 呆 症 , 这 些 办 法 被 全 世 界 公 认 +society 2 0 1 6 年 上 海 即 将 发 生 哪 些 大 事 件 。 。 。 。 +car 北 汽 自 主 品 牌 亏 损 3 3 . 
4 1 亿 额 外 促 销 成 主 因 +car 在 那 山 的 那 边 海 的 那 边 , 有 一 群 自 由 侠 +history 一 个 小 城 就 屠 杀 了 4 0 0 0 苏 军 战 俘 , 希 特 勒 死 神 战 队 的 崛 起 与 覆 灭 +baby 给 孩 子 洗 澡 时 , 这 些 部 位 再 脏 也 不 要 碰 ! +essay 好 久 不 见 , 你 还 好 么 +baby 被 娃 误 伤 的 9 种 痛 , 数 一 数 你 中 了 几 枪 ? +food 初 秋 的 小 炖 品 放 冰 糖 就 比 较 滋 润 , 放 红 糖 就 补 血 又 不 燥 热 +game 佩 服 佩 服 ! 羊 驼 D e f t 单 排 重 回 韩 服 最 强 王 者 第 一 名 ! +game 三 个 时 代 的 标 志 炉 石 传 说 三 大 远 古 毒 瘤 卡 组 +discovery 2 0 世 纪 最 伟 大 科 学 发 现 — — 魔 术 般 的 超 导 材 料 ! \ No newline at end of file diff --git a/tests/data_for_tests/zh_sample.conllx b/tests/data_for_tests/zh_sample.conllx new file mode 100755 index 00000000..dee802ef --- /dev/null +++ b/tests/data_for_tests/zh_sample.conllx @@ -0,0 +1,100 @@ +1 上海 _ NR NR _ 3 nsubj _ _ +2 积极 _ AD AD _ 3 advmod _ _ +3 准备 _ VV VV _ 0 root _ _ +4 迎接 _ VV VV _ 3 ccomp _ _ +5 欧元 _ NN NN _ 6 nn _ _ +6 诞生 _ NN NN _ 4 dobj _ _ + +1 新华社 _ NR NR _ 7 dep _ _ +2 上海 _ NR NR _ 7 dep _ _ +3 十二月 _ NT NT _ 7 dep _ _ +4 三十日 _ NT NT _ 7 dep _ _ +5 电 _ NN NN _ 7 dep _ _ +6 ( _ PU PU _ 7 punct _ _ +7 记者 _ NN NN _ 0 root _ _ +8 潘清 _ NR NR _ 7 dep _ _ +9 ) _ PU PU _ 7 punct _ _ + +1 即将 _ AD AD _ 2 advmod _ _ +2 诞生 _ VV VV _ 4 rcmod _ _ +3 的 _ DEC DEC _ 2 cpm _ _ +4 欧元 _ NN NN _ 6 nsubj _ _ +5 , _ PU PU _ 6 punct _ _ +6 引起 _ VV VV _ 0 root _ _ +7 了 _ AS AS _ 6 asp _ _ +8 上海 _ NR NR _ 14 nn _ _ +9 这 _ DT DT _ 14 det _ _ +10 个 _ M M _ 9 clf _ _ +11 中国 _ NR NR _ 13 nn _ _ +12 金融 _ NN NN _ 13 nn _ _ +13 中心 _ NN NN _ 14 nn _ _ +14 城市 _ NN NN _ 16 assmod _ _ +15 的 _ DEG DEG _ 14 assm _ _ +16 关注 _ NN NN _ 6 dobj _ _ +17 。 _ PU PU _ 6 punct _ _ + +1 上海 _ NR NR _ 2 nn _ _ +2 银行界 _ NN NN _ 4 nsubj _ _ +3 纷纷 _ AD AD _ 4 advmod _ _ +4 推出 _ VV VV _ 0 root _ _ +5 了 _ AS AS _ 4 asp _ _ +6 与 _ P P _ 8 prep _ _ +7 之 _ PN PN _ 6 pobj _ _ +8 相关 _ VA VA _ 15 rcmod _ _ +9 的 _ DEC DEC _ 8 cpm _ _ +10 外汇 _ NN NN _ 15 nn _ _ +11 业务 _ NN NN _ 15 nn _ _ +12 品种 _ NN NN _ 15 conj _ _ +13 和 _ CC CC _ 15 cc _ _ +14 服务 _ NN NN _ 15 nn _ _ +15 举措 _ NN NN _ 4 dobj _ _ +16 , _ PU PU _ 4 punct _ _ +17 
积极 _ AD AD _ 18 advmod _ _ +18 准备 _ VV VV _ 4 dep _ _ +19 启动 _ VV VV _ 18 ccomp _ _ +20 欧元 _ NN NN _ 21 nn _ _ +21 业务 _ NN NN _ 19 dobj _ _ +22 。 _ PU PU _ 4 punct _ _ + +1 一些 _ CD CD _ 8 nummod _ _ +2 热衷于 _ VV VV _ 8 rcmod _ _ +3 个人 _ NN NN _ 5 nn _ _ +4 外汇 _ NN NN _ 5 nn _ _ +5 交易 _ NN NN _ 2 dobj _ _ +6 的 _ DEC DEC _ 2 cpm _ _ +7 上海 _ NR NR _ 8 nn _ _ +8 市民 _ NN NN _ 13 nsubj _ _ +9 , _ PU PU _ 13 punct _ _ +10 也 _ AD AD _ 13 advmod _ _ +11 对 _ P P _ 13 prep _ _ +12 欧元 _ NN NN _ 11 pobj _ _ +13 表示 _ VV VV _ 0 root _ _ +14 出 _ VV VV _ 13 rcomp _ _ +15 极 _ AD AD _ 16 advmod _ _ +16 大 _ VA VA _ 18 rcmod _ _ +17 的 _ DEC DEC _ 16 cpm _ _ +18 兴趣 _ NN NN _ 13 dobj _ _ +19 。 _ PU PU _ 13 punct _ _ + +1 继 _ P P _ 38 prep _ _ +2 上海 _ NR NR _ 6 nn _ _ +3 大众 _ NR NR _ 6 nn _ _ +4 汽车 _ NN NN _ 6 nn _ _ +5 有限 _ JJ JJ _ 6 amod _ _ +6 公司 _ NN NN _ 13 nsubj _ _ +7 十八日 _ NT NT _ 13 tmod _ _ +8 在 _ P P _ 13 prep _ _ +9 中国 _ NR NR _ 10 nn _ _ +10 银行 _ NN NN _ 12 nn _ _ +11 上海 _ NR NR _ 12 nn _ _ +12 分行 _ NN NN _ 8 pobj _ _ +13 开立 _ VV VV _ 19 lccomp _ _ +14 上海 _ NR NR _ 16 dep _ _ +15 第一 _ OD OD _ 16 ordmod _ _ +16 个 _ M M _ 18 clf _ _ +17 欧元 _ NN NN _ 18 nn _ _ +18 帐户 _ NN NN _ 13 dobj _ _ +19 后 _ LC LC _ 1 plmod _ _ +20 , _ PU PU _ 38 punct _ _ +21 工商 _ NN NN _ 28 nn _ _ +22 银行 _ NN NN _ 28 conj _ _ diff --git a/tests/io/__init__.py b/tests/io/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/tests/io/loader/test_classification_loader.py b/tests/io/loader/test_classification_loader.py new file mode 100755 index 00000000..abc84dc5 --- /dev/null +++ b/tests/io/loader/test_classification_loader.py @@ -0,0 +1,51 @@ + + +import os +import pytest + +from fastNLP.io import DataBundle +from fastNLP.io.loader.classification import YelpFullLoader, YelpPolarityLoader, IMDBLoader, \ + SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, \ + MRLoader, R8Loader, R52Loader, OhsumedLoader, NG20Loader + + +class TestDownload: + 
@pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_download(self): + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader, ChnSentiCorpLoader]: + loader().download() + + @pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_load(self): + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader, ChnSentiCorpLoader]: + data_bundle = loader().load() + print(data_bundle) + + +class TestLoad: + def test_process_from_file(self): + data_set_dict = { + 'yelp.p': ('tests/data_for_tests/io/yelp_review_polarity', YelpPolarityLoader, (6, 6, 6), False), + 'yelp.f': ('tests/data_for_tests/io/yelp_review_full', YelpFullLoader, (6, 6, 6), False), + 'sst-2': ('tests/data_for_tests/io/SST-2', SST2Loader, (5, 5, 5), True), + 'sst': ('tests/data_for_tests/io/SST', SSTLoader, (6, 6, 6), False), + 'imdb': ('tests/data_for_tests/io/imdb', IMDBLoader, (6, 6, 6), False), + 'ChnSentiCorp': ('tests/data_for_tests/io/ChnSentiCorp', ChnSentiCorpLoader, (6, 6, 6), False), + 'THUCNews': ('tests/data_for_tests/io/THUCNews', THUCNewsLoader, (9, 9, 9), False), + 'WeiboSenti100k': ('tests/data_for_tests/io/WeiboSenti100k', WeiboSenti100kLoader, (6, 7, 6), False), + 'mr': ('tests/data_for_tests/io/mr', MRLoader, (6, 6, 6), False), + 'R8': ('tests/data_for_tests/io/R8', R8Loader, (6, 6, 6), False), + 'R52': ('tests/data_for_tests/io/R52', R52Loader, (6, 6, 6), False), + 'ohsumed': ('tests/data_for_tests/io/R52', OhsumedLoader, (6, 6, 6), False), + '20ng': ('tests/data_for_tests/io/R52', NG20Loader, (6, 6, 6), False), + } + for k, v in data_set_dict.items(): + path, loader, data_set, warns = v + data_bundle = loader().load(path) + + assert(isinstance(data_bundle, DataBundle)) + assert(len(data_set) == data_bundle.num_dataset) + for x, y in zip(data_set, data_bundle.iter_datasets()): + name, dataset = y + assert(x == len(dataset)) + diff --git 
a/tests/io/loader/test_conll_loader.py b/tests/io/loader/test_conll_loader.py new file mode 100755 index 00000000..4f6b49c7 --- /dev/null +++ b/tests/io/loader/test_conll_loader.py @@ -0,0 +1,43 @@ + +import pytest +import os +from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, \ + Conll2003Loader, ConllLoader + + +class TestMSRANER: + @pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_download(self): + MsraNERLoader().download(re_download=False) + data_bundle = MsraNERLoader().load() + print(data_bundle) + + +class TestPeopleDaily: + @pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_download(self): + PeopleDailyNERLoader().download() + + +class TestWeiboNER: + @pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_download(self): + WeiboNERLoader().download() + + +class TestConll2003Loader: + def test_load(self): + Conll2003Loader()._load('tests/data_for_tests/conll_2003_example.txt') + + +class TestConllLoader: + def test_conll(self): + db = Conll2003Loader().load('tests/data_for_tests/io/conll2003') + print(db) + + def test_sep(self): + headers = [ + 'raw_words', 'ner', + ] + db = ConllLoader(headers = headers, sep="\n").load('tests/data_for_tests/io/MSRA_NER') + print(db) diff --git a/tests/io/loader/test_cws_loader.py b/tests/io/loader/test_cws_loader.py new file mode 100755 index 00000000..6c19355c --- /dev/null +++ b/tests/io/loader/test_cws_loader.py @@ -0,0 +1,22 @@ +import pytest +import os +from fastNLP.io.loader import CWSLoader + + +class TestCWSLoader: + @pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_download(self): + dataset_names = ['pku', 'cityu', 'as', 'msra'] + for dataset_name in dataset_names: + data_bundle = CWSLoader(dataset_name=dataset_name).load() + print(data_bundle) + + +class TestRunCWSLoader: + def test_cws_loader(self): + dataset_names = ['msra', 'cityu', 'as', 
'msra'] + for dataset_name in dataset_names: + data_bundle = CWSLoader(dataset_name=dataset_name).load( + f'tests/data_for_tests/io/cws_{dataset_name}' + ) + print(data_bundle) diff --git a/tests/io/loader/test_matching_loader.py b/tests/io/loader/test_matching_loader.py new file mode 100755 index 00000000..5f56ee27 --- /dev/null +++ b/tests/io/loader/test_matching_loader.py @@ -0,0 +1,49 @@ + +import pytest + +import os + +from fastNLP.io import DataBundle +from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \ + BQCorpusLoader, CNXNLILoader, LCQMCLoader + + +@pytest.mark.skipif('download' not in os.environ, reason="Skip download") +class TestMatchingDownload: + def test_download(self): + for loader in [RTELoader, QNLILoader, SNLILoader, MNLILoader]: + loader().download() + with pytest.raises(Exception): + QuoraLoader().load() + + def test_load(self): + for loader in [RTELoader, QNLILoader, SNLILoader, MNLILoader]: + data_bundle = loader().load() + print(data_bundle) + + +class TestMatchingLoad: + def test_load(self): + data_set_dict = { + 'RTE': ('tests/data_for_tests/io/RTE', RTELoader, (5, 5, 5), True), + 'SNLI': ('tests/data_for_tests/io/SNLI', SNLILoader, (5, 5, 5), False), + 'QNLI': ('tests/data_for_tests/io/QNLI', QNLILoader, (5, 5, 5), True), + 'MNLI': ('tests/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True), + 'Quora': ('tests/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False), + 'BQCorpus': ('tests/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False), + 'XNLI': ('tests/data_for_tests/io/XNLI', CNXNLILoader, (6, 6, 8), False), + 'LCQMC': ('tests/data_for_tests/io/LCQMC', LCQMCLoader, (6, 5, 6), False), + } + for k, v in data_set_dict.items(): + path, loader, instance, warns = v + if warns: + data_bundle = loader().load(path) + else: + data_bundle = loader().load(path) + + assert(isinstance(data_bundle, DataBundle)) + assert(len(instance) == data_bundle.num_dataset) + for x, y in 
zip(instance, data_bundle.iter_datasets()): + name, dataset = y + assert(x == len(dataset)) + diff --git a/tests/io/loader/test_qa_loader.py b/tests/io/loader/test_qa_loader.py new file mode 100755 index 00000000..06ad10f9 --- /dev/null +++ b/tests/io/loader/test_qa_loader.py @@ -0,0 +1,12 @@ +from fastNLP.io.loader.qa import CMRC2018Loader + +class TestCMRC2018Loader: + def test__load(self): + loader = CMRC2018Loader() + dataset = loader._load('tests/data_for_tests/io/cmrc/train.json') + print(dataset) + + def test_load(self): + loader = CMRC2018Loader() + data_bundle = loader.load('tests/data_for_tests/io/cmrc/') + print(data_bundle) diff --git a/tests/io/pipe/test_classification.py b/tests/io/pipe/test_classification.py new file mode 100755 index 00000000..31174862 --- /dev/null +++ b/tests/io/pipe/test_classification.py @@ -0,0 +1,89 @@ +import pytest +import os + +from fastNLP.io import DataBundle +from fastNLP.io.pipe.classification import SSTPipe, SST2Pipe, IMDBPipe, YelpFullPipe, YelpPolarityPipe, \ + AGsNewsPipe, DBPediaPipe +from fastNLP.io.pipe.classification import ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe + + +@pytest.mark.skipif('download' not in os.environ, reason="Skip download") +class TestClassificationPipe: + def test_process_from_file(self): + for pipe in [YelpPolarityPipe, SST2Pipe, IMDBPipe, YelpFullPipe, SSTPipe]: + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) + + +class TestRunPipe: + def test_load(self): + for pipe in [IMDBPipe]: + data_bundle = pipe(tokenizer='raw').process_from_file('tests/data_for_tests/io/imdb') + print(data_bundle) + + +@pytest.mark.skipif('download' not in os.environ, reason="Skip download") +class TestCNClassificationPipe: + def test_process_from_file(self): + for pipe in [ChnSentiCorpPipe]: + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file() + print(data_bundle) + + +@pytest.mark.skipif('download' not in os.environ, reason="Skip download") 
+class TestRunClassificationPipe: + def test_process_from_file(self): + data_set_dict = { + 'yelp.p': ('tests/data_for_tests/io/yelp_review_polarity', YelpPolarityPipe, + {'train': 6, 'dev': 6, 'test': 6}, {'words': 1176, 'target': 2}, + False), + 'yelp.f': ('tests/data_for_tests/io/yelp_review_full', YelpFullPipe, + {'train': 6, 'dev': 6, 'test': 6}, {'words': 1166, 'target': 5}, + False), + 'sst-2': ('tests/data_for_tests/io/SST-2', SST2Pipe, + {'train': 5, 'dev': 5, 'test': 5}, {'words': 139, 'target': 2}, + True), + 'sst': ('tests/data_for_tests/io/SST', SSTPipe, + {'train': 354, 'dev': 6, 'test': 6}, {'words': 232, 'target': 5}, + False), + 'imdb': ('tests/data_for_tests/io/imdb', IMDBPipe, + {'train': 6, 'dev': 6, 'test': 6}, {'words': 1670, 'target': 2}, + False), + 'ag': ('tests/data_for_tests/io/ag', AGsNewsPipe, + {'train': 4, 'test': 5}, {'words': 257, 'target': 4}, + False), + 'dbpedia': ('tests/data_for_tests/io/dbpedia', DBPediaPipe, + {'train': 14, 'test': 5}, {'words': 496, 'target': 14}, + False), + 'ChnSentiCorp': ('tests/data_for_tests/io/ChnSentiCorp', ChnSentiCorpPipe, + {'train': 6, 'dev': 6, 'test': 6}, + {'chars': 529, 'bigrams': 1296, 'trigrams': 1483, 'target': 2}, + False), + 'Chn-THUCNews': ('tests/data_for_tests/io/THUCNews', THUCNewsPipe, + {'train': 9, 'dev': 9, 'test': 9}, {'chars': 1864, 'target': 9}, + False), + 'Chn-WeiboSenti100k': ('tests/data_for_tests/io/WeiboSenti100k', WeiboSenti100kPipe, + {'train': 6, 'dev': 6, 'test': 7}, {'chars': 452, 'target': 2}, + False), + } + for k, v in data_set_dict.items(): + path, pipe, data_set, vocab, warns = v + if 'Chn' not in k: + if warns: + data_bundle = pipe(tokenizer='raw').process_from_file(path) + else: + data_bundle = pipe(tokenizer='raw').process_from_file(path) + else: + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file(path) + + assert(isinstance(data_bundle, DataBundle)) + assert(len(data_set) == data_bundle.num_dataset) + for name, dataset in 
data_bundle.iter_datasets(): + assert(name in data_set.keys()) + assert(data_set[name] == len(dataset)) + + assert(len(vocab) == data_bundle.num_vocab) + for name, vocabs in data_bundle.iter_vocabs(): + assert(name in vocab.keys()) + assert(vocab[name] == len(vocabs)) diff --git a/tests/io/pipe/test_conll.py b/tests/io/pipe/test_conll.py new file mode 100755 index 00000000..e4000ae3 --- /dev/null +++ b/tests/io/pipe/test_conll.py @@ -0,0 +1,48 @@ +import pytest +import os +from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, Conll2003Pipe, Conll2003NERPipe, \ + OntoNotesNERPipe + + +@pytest.mark.skipif('download' not in os.environ, reason="Skip download") +class TestConllPipe: + def test_process_from_file(self): + for pipe in [MsraNERPipe, PeopleDailyPipe, WeiboNERPipe]: + print(pipe) + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file() + print(data_bundle) + data_bundle = pipe(encoding_type='bioes').process_from_file() + print(data_bundle) + + +class TestRunPipe: + def test_conll2003(self): + for pipe in [Conll2003Pipe, Conll2003NERPipe]: + print(pipe) + data_bundle = pipe().process_from_file('tests/data_for_tests/conll_2003_example.txt') + print(data_bundle) + + +class TestNERPipe: + def test_process_from_file(self): + data_dict = { + 'weibo_NER': WeiboNERPipe, + 'peopledaily': PeopleDailyPipe, + 'MSRA_NER': MsraNERPipe, + } + for k, v in data_dict.items(): + pipe = v + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file(f'tests/data_for_tests/io/{k}') + print(data_bundle) + data_bundle = pipe(encoding_type='bioes').process_from_file(f'tests/data_for_tests/io/{k}') + print(data_bundle) + + +class TestConll2003Pipe: + def test_conll(self): + data_bundle = Conll2003Pipe().process_from_file('tests/data_for_tests/io/conll2003') + print(data_bundle) + + def test_OntoNotes(self): + data_bundle = OntoNotesNERPipe().process_from_file('tests/data_for_tests/io/OntoNotes') + print(data_bundle) diff --git 
a/tests/io/pipe/test_cws.py b/tests/io/pipe/test_cws.py new file mode 100755 index 00000000..895234b2 --- /dev/null +++ b/tests/io/pipe/test_cws.py @@ -0,0 +1,39 @@ + +import pytest +import os +from fastNLP.io.pipe.cws import CWSPipe + + +class TestCWSPipe: + @pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_process_from_file(self): + dataset_names = ['pku', 'cityu', 'as', 'msra'] + for dataset_name in dataset_names: + data_bundle = CWSPipe(dataset_name=dataset_name).process_from_file() + print(data_bundle) + + def test_demo(self): + # related to issue https://github.com/fastnlp/fastNLP/issues/324#issue-705081091 + from fastNLP import DataSet, Instance + from fastNLP.io import DataBundle + data_bundle = DataBundle() + ds = DataSet() + ds.append(Instance(raw_words="截流 进入 最后 冲刺 ( 附 图片 1 张 )")) + data_bundle.set_dataset(ds, name='train') + data_bundle = CWSPipe().process(data_bundle) + assert('<' not in data_bundle.get_vocab('chars')) + + +class TestRunCWSPipe: + def test_process_from_file(self): + dataset_names = ['msra', 'cityu', 'as', 'pku'] + for dataset_name in dataset_names: + data_bundle = CWSPipe(bigrams=True, trigrams=True).\ + process_from_file(f'tests/data_for_tests/io/cws_{dataset_name}') + print(data_bundle) + + def test_replace_number(self): + data_bundle = CWSPipe(bigrams=True, replace_num_alpha=True).\ + process_from_file(f'tests/data_for_tests/io/cws_pku') + for word in ['<', '>', '']: + assert(data_bundle.get_vocab('chars').to_index(word) != 1) diff --git a/tests/io/pipe/test_matching.py b/tests/io/pipe/test_matching.py new file mode 100755 index 00000000..23c8fd70 --- /dev/null +++ b/tests/io/pipe/test_matching.py @@ -0,0 +1,104 @@ + +import pytest +import os + +from fastNLP.io import DataBundle +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \ + CNXNLIPipe, BQCorpusPipe, LCQMCPipe +from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, 
MNLIBertPipe, \ + CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe + + +@pytest.mark.skipif('download' not in os.environ, reason="Skip download") +class TestMatchingPipe: + def test_process_from_file(self): + for pipe in [SNLIPipe, RTEPipe, QNLIPipe, MNLIPipe]: + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) + + +@pytest.mark.skipif('download' not in os.environ, reason="Skip download") +class TestMatchingBertPipe: + def test_process_from_file(self): + for pipe in [SNLIBertPipe, RTEBertPipe, QNLIBertPipe, MNLIBertPipe]: + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) + + +class TestRunMatchingPipe: + def test_load(self): + data_set_dict = { + 'RTE': ('tests/data_for_tests/io/RTE', RTEPipe, RTEBertPipe, (5, 5, 5), (449, 2), True), + 'SNLI': ('tests/data_for_tests/io/SNLI', SNLIPipe, SNLIBertPipe, (5, 5, 5), (110, 3), False), + 'QNLI': ('tests/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True), + 'MNLI': ('tests/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True), + 'BQCorpus': ('tests/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False), + 'XNLI': ('tests/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 6, 8), (39, 3), False), + 'LCQMC': ('tests/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (6, 5, 6), (36, 2), False), + } + for k, v in data_set_dict.items(): + path, pipe1, pipe2, data_set, vocab, warns = v + if warns: + data_bundle1 = pipe1(tokenizer='raw').process_from_file(path) + data_bundle2 = pipe2(tokenizer='raw').process_from_file(path) + else: + data_bundle1 = pipe1(tokenizer='raw').process_from_file(path) + data_bundle2 = pipe2(tokenizer='raw').process_from_file(path) + + assert(isinstance(data_bundle1, DataBundle)) + assert(len(data_set) == data_bundle1.num_dataset) + print(k) + print(data_bundle1) + print(data_bundle2) + for x, y in zip(data_set, 
data_bundle1.iter_datasets()): + name, dataset = y + assert(x == len(dataset)) + assert(len(data_set) == data_bundle2.num_dataset) + for x, y in zip(data_set, data_bundle2.iter_datasets()): + name, dataset = y + assert(x == len(dataset)) + + assert(len(vocab) == data_bundle1.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + assert(x == len(vocabs)) + assert(len(vocab) == data_bundle2.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + assert(x + 1 if name == 'words' else x == len(vocabs)) + + @pytest.mark.skipif('download' not in os.environ, reason="Skip download") + def test_spacy(self): + data_set_dict = { + 'Quora': ('tests/data_for_tests/io/Quora', QuoraPipe, QuoraBertPipe, (2, 2, 2), (93, 2)), + } + for k, v in data_set_dict.items(): + path, pipe1, pipe2, data_set, vocab = v + + data_bundle1 = pipe1(tokenizer='spacy').process_from_file(path) + data_bundle2 = pipe2(tokenizer='spacy').process_from_file(path) + + assert(isinstance(data_bundle1, DataBundle)) + assert(len(data_set) == data_bundle1.num_dataset) + print(k) + print(data_bundle1) + print(data_bundle2) + for x, y in zip(data_set, data_bundle1.iter_datasets()): + name, dataset = y + assert(x == len(dataset)) + assert(len(data_set) == data_bundle2.num_dataset) + for x, y in zip(data_set, data_bundle2.iter_datasets()): + name, dataset = y + assert(x == len(dataset)) + + assert(len(vocab) == data_bundle1.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + assert(x == len(vocabs)) + assert(len(vocab) == data_bundle2.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + assert(x + 1 if name == 'words' else x == len(vocabs)) + diff --git a/tests/io/pipe/test_qa.py b/tests/io/pipe/test_qa.py new file mode 100755 index 00000000..35477939 --- /dev/null +++ b/tests/io/pipe/test_qa.py @@ -0,0 +1,24 @@ + +import pytest +from fastNLP.io.pipe.qa import CMRC2018BertPipe +from 
fastNLP.io.loader.qa import CMRC2018Loader
+
+
+class TestCMRC2018Pipe:  # 'Test' prefix so that default pytest discovery collects this class
+    def test_process(self):
+        data_bundle = CMRC2018Loader().load('tests/data_for_tests/io/cmrc/')
+        pipe = CMRC2018BertPipe()
+        data_bundle = pipe.process(data_bundle)
+        # every processed instance must keep its answer span and context length consistent
+        for name, dataset in data_bundle.iter_datasets():
+            for ins in dataset:
+                if 'target_start' in ins:
+                    # the span [target_start, target_end] must reproduce the first answer
+                    start_index = ins['target_start']
+                    end_index = ins['target_end']+1
+                    extract_answer = ''.join(ins['raw_chars'][start_index:end_index])
+                    assert(extract_answer == ins['answers'][0])
+                # context_len must equal the position of the first '[SEP]' in raw_chars
+                raw_chars = ins['raw_chars']
+                expect_len = raw_chars.index('[SEP]')
+                assert(expect_len == ins['context_len'])
diff --git a/tests/io/pipe/test_summary.py b/tests/io/pipe/test_summary.py
new file mode 100755
index 00000000..b8692791
--- /dev/null
+++ b/tests/io/pipe/test_summary.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# __author__="Danqing Wang"
+
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import pytest
+import os
+
+import pytest
+
+from fastNLP.io import DataBundle
+from fastNLP.io.pipe.summarization import ExtCNNDMPipe
+
+
+class TestRunExtCNNDMPipe:
+    # Runs ExtCNNDMPipe on the cnndm fixtures with and without vocab_path / domain.
+    def test_load(self):
+        root = 'tests/data_for_tests/io/cnndm'
+        max_vocab = 100000
+        vocab_file = 'tests/data_for_tests/io/cnndm/vocab'
+        max_sent_len = 100
+        max_doc_steps = 50
+        plain_pipe = ExtCNNDMPipe(vocab_size=max_vocab,
+                                  vocab_path=vocab_file,
+                                  sent_max_len=max_sent_len,
+                                  doc_max_timesteps=max_doc_steps)
+        domain_pipe = ExtCNNDMPipe(vocab_size=max_vocab,
+                                   vocab_path=vocab_file,
+                                   sent_max_len=max_sent_len,
+                                   doc_max_timesteps=max_doc_steps,
+                                   domain=True)
+        plain_bundle = plain_pipe.process_from_file(root)
+        domain_bundle = domain_pipe.process_from_file(root)
+
+        assert isinstance(plain_bundle, DataBundle)
+        assert isinstance(domain_bundle, DataBundle)
+        # a directory can be processed without an explicit vocab_path
+        no_vocab_pipe = ExtCNNDMPipe(vocab_size=max_vocab,
+                                     sent_max_len=max_sent_len,
+                                     doc_max_timesteps=max_doc_steps,
+                                     domain=True)
+        no_vocab_bundle = no_vocab_pipe.process_from_file(root)
+        assert isinstance(no_vocab_bundle, DataBundle)
+        # a single file without vocab_path is expected to raise RuntimeError
+        with pytest.raises(RuntimeError):
+            failing_pipe = ExtCNNDMPipe(vocab_size=max_vocab,
+                                        sent_max_len=max_sent_len,
+                                        doc_max_timesteps=max_doc_steps)
+            failing_bundle = failing_pipe.process_from_file(os.path.join(root, 'train.cnndm.jsonl'))
+        # the same single file succeeds once vocab_path is supplied
+        file_pipe = ExtCNNDMPipe(vocab_size=max_vocab,
+                                 vocab_path=vocab_file,
+                                 sent_max_len=max_sent_len,
+                                 doc_max_timesteps=max_doc_steps,)
+        file_bundle = file_pipe.process_from_file(os.path.join(root, 'train.cnndm.jsonl'))
+        assert isinstance(file_bundle, DataBundle)
+
diff --git a/tests/io/test_embed_loader.py b/tests/io/test_embed_loader.py
new file mode 100755
index 00000000..9be0d3b7
--- /dev/null
+++ b/tests/io/test_embed_loader.py
@@ -0,0 +1,38 @@
+import numpy as np
+
+from fastNLP import Vocabulary
+from fastNLP.io import EmbedLoader
+
+
+class TestEmbedLoader:
+    def test_load_with_vocab(self):
+        vocabulary = Vocabulary()
+        glove_path = "tests/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt"
+        w2v_path = "tests/data_for_tests/embedding/small_static_embedding/word2vec_test.txt"
+        vocabulary.add_word('the')
+        vocabulary.add_word('none')
+        glove_matrix = EmbedLoader.load_with_vocab(glove_path, vocabulary)
+        assert glove_matrix.shape == (4, 50)
+        w2v_matrix = EmbedLoader.load_with_vocab(w2v_path, vocabulary, normalize=True)
+        assert w2v_matrix.shape == (4, 50)
+        assert np.allclose(np.linalg.norm(w2v_matrix, axis=1).sum(), 4)  # row norms sum to the row count
+
+    def test_load_without_vocab(self):
+        expected_words = ['the', 'of', 'in', 'a', 'to', 'and']
+        glove_path = "tests/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt"
+        w2v_path = "tests/data_for_tests/embedding/small_static_embedding/word2vec_test.txt"
+        glove_matrix, loaded_vocab = EmbedLoader.load_without_vocab(glove_path)
+        assert glove_matrix.shape == (8, 50)
+        for token in expected_words:
+            assert token in loaded_vocab
+        w2v_matrix, loaded_vocab = EmbedLoader.load_without_vocab(w2v_path, normalize=True)
+        assert w2v_matrix.shape == (8, 50)
+        assert np.allclose(np.linalg.norm(w2v_matrix, axis=1).sum(), 8)
+        for token in expected_words:
+            assert token in loaded_vocab
+        # with unknown=None the matrix has one row fewer
+        w2v_matrix, loaded_vocab = EmbedLoader.load_without_vocab(w2v_path, normalize=True, unknown=None)
+        assert w2v_matrix.shape == (7, 50)
+        assert np.allclose(np.linalg.norm(w2v_matrix, axis=1).sum(), 7)
+        for token in expected_words:
+            assert token in loaded_vocab