1.增加sequence labeling任务的数据说明; 2.增加对CWSPipe的引用

6 years ago · 584a92c64c
--- a/fastNLP/io/__init__.py
+++ b/fastNLP/io/__init__.py
@@ -38,6 +38,7 @@ __all__ = [
    'JsonLoader',

    'CWSLoader',
    "CWSPipe",

    'MNLILoader',
    "QuoraLoader",
--- a/fastNLP/io/pipe/__init__.py
+++ b/fastNLP/io/pipe/__init__.py
@@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据，所有的 Pipe 都包含 ``proce
 __all__ = [
    "Pipe",

    "CWSPipe",

    "YelpFullPipe",
    "YelpPolarityPipe",
    "SSTPipe",
@@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe
    MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
 from .pipe import Pipe
 from .conll import Conll2003Pipe
 from .cws import CWSPipe
--- a/reproduction/seqence_labelling/chinese_ner/readme.md
+++ b/reproduction/seqence_labelling/chinese_ner/readme.md
@@ -0,0 +1,30 @@
 以下是使用各中文NER Pipe自动下载的数据集的统计信息。

 | MsraNERPipe | # of sents | # of tokens |
 | ----------- | ---------- | ----------- |
 | train       | 41747      | 1954374     |
 | dev         | 4617       | 215505      |
 | test        | 4365       | 172601      |
 | total       | 50729      | 2342480     |
 这里报道的统计数据，与 [https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf) 报道的一致。



 | WeiboNERPipe | # of sents | # of tokens |
 | ------------ | ---------- | ----------- |
 | train        | 1350       | 73778       |
 | dev          | 270        | 14509       |
 | test         | 270        | 14842       |
 | total        | 1890       | 103129      |
 这里报道的统计数据与 [https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf) 一致。




 | PeopleDailyPipe | # of sents | # of tokens |
 | --------------- | ---------- | ----------- |
 | train           | 50658      | 2169879     |
 | dev             | 4631       | 172601      |
 | test            | 68         | 2270        |
 | total           | 55357      | 2344750     |
 这里使用的数据与 [https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf) 的数据是一致的。
--- a/reproduction/seqence_labelling/cws/readme.md
+++ b/reproduction/seqence_labelling/cws/readme.md
@@ -0,0 +1,32 @@
 四个数据集的统计信息，最原始的数据可以从 [http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/) 下载。

 | pku   | # of sents | # of tokens |
 | ----- | ---------- | ----------- |
 | train | 17173      | 1650222     |
 | dev   | 1881       | 176226      |
 | test  | 1944       | 172733      |
 | total | 20998      | 1999181     |


 | cityu | # of sents | # of tokens |
 | ----- | ---------- | ----------- |
 | train | 47696      | 2164907     |
 | dev   | 5323       | 238447      |
 | test  | 1492       | 67690       |
 | total | 54511      | 2471044     |


 | msra  | # of sents | # of tokens |
 | ----- | ---------- | ----------- |
 | train | 78242      | 3644550     |
 | dev   | 8676       | 405919      |
 | test  | 3985       | 184355      |
 | total | 90903      | 4234824     |


 | as    | # of sents | # of tokens |
 | ----- | ---------- | ----------- |
 | train | 638273     | 7536586     |
 | dev   | 70680      | 831464      |
 | test  | 14429      | 197681      |
 | total | 723382     | 8565731     |
--- a/reproduction/seqence_labelling/cws/test/__init__.py
+++ b/reproduction/seqence_labelling/cws/test/__init__.py
--- a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py
+++ b/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py
@@ -1,17 +0,0 @@


 import unittest
 from ..data.CWSDataLoader import SigHanLoader
 from fastNLP.core.vocabulary import VocabularyOption


 # Smoke tests for SigHanLoader: each test builds a loader, runs process() on
 # 'pku_demo.txt' and prints the resulting datasets. There are no assertions,
 # so a test only fails if loading or processing raises.
 class TestCWSDataLoader(unittest.TestCase):
    def test_case1(self):
        # Process the demo file with target_type='bmes' and no extra options.
        cws_loader = SigHanLoader(target_type='bmes')
        data = cws_loader.process('pku_demo.txt')
        print(data.datasets)

    # NOTE(review): the name 'test_calse2' looks like a typo for 'test_case2';
    # it still starts with 'test', so unittest discovery picks it up.
    def test_calse2(self):
        # Same as test_case1, but additionally passes a default-constructed
        # VocabularyOption as bigram_vocab_opt.
        cws_loader = SigHanLoader(target_type='bmes')
        data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption())
        print(data.datasets)