Browse Source

1.增加sequence labeling任务的数据说明; 2.增加对CWSPipe的引用

tags/v0.4.10
yh_cc 5 years ago
parent
commit
584a92c64c
6 changed files with 66 additions and 17 deletions
  1. +1
    -0
      fastNLP/io/__init__.py
  2. +3
    -0
      fastNLP/io/pipe/__init__.py
  3. +30
    -0
      reproduction/seqence_labelling/chinese_ner/readme.md
  4. +32
    -0
      reproduction/seqence_labelling/cws/readme.md
  5. +0
    -0
      reproduction/seqence_labelling/cws/test/__init__.py
  6. +0
    -17
      reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py

+ 1
- 0
fastNLP/io/__init__.py View File

@@ -38,6 +38,7 @@ __all__ = [
'JsonLoader',

'CWSLoader',
"CWSPipe",

'MNLILoader',
"QuoraLoader",


+ 3
- 0
fastNLP/io/pipe/__init__.py View File

@@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce
__all__ = [
"Pipe",

"CWSPipe",

"YelpFullPipe",
"YelpPolarityPipe",
"SSTPipe",
@@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
from .pipe import Pipe
from .conll import Conll2003Pipe
from .cws import CWSPipe

+ 30
- 0
reproduction/seqence_labelling/chinese_ner/readme.md View File

@@ -0,0 +1,30 @@
以下是使用各中文NER Pipe自动下载的数据集的统计信息

| MsraNERPipe | # of sents | # of tokens |
| ----------- | ---------- | ----------- |
| train | 41747 | 1954374 |
| dev | 4617 | 215505 |
| test | 4365 | 172601 |
| total | 50729 | 2342480 |

这里报道的统计数据,与[https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf)报道的一致



| WeiboNERPipe | # of sents | # of tokens |
| ------------ | ---------- | ----------- |
| train | 1350 | 73778 |
| dev | 270 | 14509 |
| test | 270 | 14842 |
| total | 1890 | 103129 |

这里报道的统计数据与[https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf)一致




| PeopleDailyPipe | # of sents | # of tokens |
| --------------- | ---------- | ----------- |
| train | 50658 | 2169879 |
| dev | 4631 | 172601 |
| test | 68 | 2270 |
| total | 55357 | 2344750 |

这里使用的数据与[https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf)的数据是一致的

+ 32
- 0
reproduction/seqence_labelling/cws/readme.md View File

@@ -0,0 +1,32 @@
四个数据集的统计信息,最原始的数据可以从[http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/)下载。

| pku | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 17173 | 1650222 |
| dev | 1881 | 176226 |
| test | 1944 | 172733 |
| total | 20998 | 1999181 |


| cityu | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 47696 | 2164907 |
| dev | 5323 | 238447 |
| test | 1492 | 67690 |
| total | 54511 | 2471044 |


| msra | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 78242 | 3644550 |
| dev | 8676 | 405919 |
| test | 3985 | 184355 |
| total | 90903 | 4234824 |


| as | # of sents | # of tokens |
| ----- | ---------- | ----------- |
| train | 638273 | 7536586 |
| dev | 70680 | 831464 |
| test | 14429 | 197681 |
| total | 723382 | 8565731 |

+ 0
- 0
reproduction/seqence_labelling/cws/test/__init__.py View File


+ 0
- 17
reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py View File

@@ -1,17 +0,0 @@


import unittest
from ..data.CWSDataLoader import SigHanLoader
from fastNLP.core.vocabulary import VocabularyOption


class TestCWSDataLoader(unittest.TestCase):
    """Run SigHanLoader over ``pku_demo.txt`` and print the result.

    Neither test asserts anything; each one only fails if loading or
    processing raises.
    """

    def test_case1(self):
        """Process the demo file with ``target_type='bmes'``."""
        loader = SigHanLoader(target_type='bmes')
        bundle = loader.process('pku_demo.txt')
        print(bundle.datasets)

    def test_calse2(self):
        """Process the demo file, also passing a default ``VocabularyOption``."""
        loader = SigHanLoader(target_type='bmes')
        # The option is built after the loader, as in the original call order.
        bigram_option = VocabularyOption()
        bundle = loader.process('pku_demo.txt', bigram_vocab_opt=bigram_option)
        print(bundle.datasets)

Loading…
Cancel
Save