Browse Source

1.增加sequence labeling任务的数据说明; 2.增加对CWSPipe的引用

tags/v0.4.10
yh_cc 5 years ago
parent
commit
584a92c64c
6 changed files with 66 additions and 17 deletions
  1. +1
    -0
      fastNLP/io/__init__.py
  2. +3
    -0
      fastNLP/io/pipe/__init__.py
  3. +30
    -0
      reproduction/seqence_labelling/chinese_ner/readme.md
  4. +32
    -0
      reproduction/seqence_labelling/cws/readme.md
  5. +0
    -0
      reproduction/seqence_labelling/cws/test/__init__.py
  6. +0
    -17
      reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py

+ 1
- 0
fastNLP/io/__init__.py View File

@@ -38,6 +38,7 @@ __all__ = [
'JsonLoader', 'JsonLoader',


'CWSLoader', 'CWSLoader',
"CWSPipe",


'MNLILoader', 'MNLILoader',
"QuoraLoader", "QuoraLoader",


+ 3
- 0
fastNLP/io/pipe/__init__.py View File

@@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce
__all__ = [ __all__ = [
"Pipe", "Pipe",


"CWSPipe",

"YelpFullPipe", "YelpFullPipe",
"YelpPolarityPipe", "YelpPolarityPipe",
"SSTPipe", "SSTPipe",
@@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
from .pipe import Pipe from .pipe import Pipe
from .conll import Conll2003Pipe from .conll import Conll2003Pipe
from .cws import CWSPipe

+ 30
- 0
reproduction/seqence_labelling/chinese_ner/readme.md View File

@@ -0,0 +1,30 @@
以下是使用各中文 NER Pipe 自动下载的数据集的统计信息

| MsraNERPipe | # of sents | # of tokens |
| ----------- | ---------- | ----------- |
| train | 41747 | 1954374 |
| dev | 4617 | 215505 |
| test | 4365 | 172601 |
| total | 50729 | 2342480 |
这里报告的统计数据与 [https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf) 报告的一致



| WeiboNERPipe | # of sents | # of tokens |
| ------------ | ---------- | ----------- |
| train | 1350 | 73778 |
| dev | 270 | 14509 |
| test | 270 | 14842 |
| total | 1890 | 103129 |
这里报告的统计数据与 [https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf) 一致




| PeopleDailyPipe | # of sents | # of tokens |
| --------------- | ---------- | ----------- |
| train | 50658 | 2169879 |
| dev | 4631 | 172601 |
| test | 68 | 2270 |
| total | 55357 | 2344750 |
这里使用的数据与 [https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf) 的数据是一致的

+ 32
- 0
reproduction/seqence_labelling/cws/readme.md View File

@@ -0,0 +1,32 @@
四个数据集的统计信息,最原始的数据可以从 [http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/) 下载。

| pku | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 17173 | 1650222 |
| dev | 1881 | 176226 |
| test | 1944 | 172733 |
| total | 20998 | 1999181 |


| cityu | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 47696 | 2164907 |
| dev | 5323 | 238447 |
| test | 1492 | 67690 |
| total | 54511 | 2471044 |


| msra | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 78242 | 3644550 |
| dev | 8676 | 405919 |
| test | 3985 | 184355 |
| total | 90903 | 4234824 |


| as | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 638273 | 7536586 |
| dev | 70680 | 831464 |
| test | 14429 | 197681 |
| total | 723382 | 8565731 |

+ 0
- 0
reproduction/seqence_labelling/cws/test/__init__.py View File


+ 0
- 17
reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py View File

@@ -1,17 +0,0 @@


import unittest
from ..data.CWSDataLoader import SigHanLoader
from fastNLP.core.vocabulary import VocabularyOption


class TestCWSDataLoader(unittest.TestCase):
    """Smoke tests that run ``SigHanLoader.process`` on ``pku_demo.txt``.

    Neither test asserts anything; each only prints the ``datasets``
    attribute of the object returned by ``process``.
    """

    def test_case1(self):
        """Process the demo file with ``target_type='bmes'`` and no extras."""
        loader = SigHanLoader(target_type='bmes')
        bundle = loader.process('pku_demo.txt')
        print(bundle.datasets)

    def test_calse2(self):
        """Process the demo file, passing a default ``VocabularyOption``
        as ``bigram_vocab_opt``."""
        loader = SigHanLoader(target_type='bmes')
        # Built after the loader, matching the original evaluation order.
        bigram_option = VocabularyOption()
        bundle = loader.process('pku_demo.txt', bigram_vocab_opt=bigram_option)
        print(bundle.datasets)

Loading…
Cancel
Save