修复CWSPipe中无法读取包含<, >的数据的问题 https://github.com/fastnlp/fastNLP/issues/258#issue-533768454

5 years ago · 03e9ca2147
--- a/fastNLP/io/pipe/cws.py
+++ b/fastNLP/io/pipe/cws.py
@@ -122,7 +122,7 @@ def _find_and_replace_digit_spans(line):
    otherwise unkdgt
    """
    new_line = ''
    pattern = '\d[\d\\.﹒·]*(?=[\u4e00-\u9fff  ，％,。！<－“])'
    pattern = '\d[\d\\.﹒·]*(?=[\u4e00-\u9fff  ，％%,。！<－“])'
    prev_end = 0
    for match in re.finditer(pattern, line):
        start, end = match.span()
@@ -193,9 +193,13 @@ class CWSPipe(Pipe):
                subchar = []
                for c in word:
                    if c == '<':
                        if subchar:
                            char.extend(subchar)
                            subchar = []
                        subchar.append(c)
                        continue
                    if c == '>' and subchar[0] == '<':
                    if c == '>' and len(subchar)>0 and subchar[0] == '<':
                        subchar.append(c)
                        char.append(''.join(subchar))
                        subchar = []
                    if subchar:
--- a/test/data_for_tests/io/cws_pku/train.txt
+++ b/test/data_for_tests/io/cws_pku/train.txt
@@ -4,3 +4,6 @@
 １２月  ３１日  ，  中共中央  总书记  、  国家  主席  江  泽民  发表  １９９８年  新年  讲话  《  迈向  充满  希望  的  新  世纪  》  。  （  新华社  记者  兰  红光  摄  ）  
 同胞  们  、  朋友  们  、  女士  们  、  先生  们  ：  
 在  １９９８年  来临  之际  ，  我  十分  高兴  地  通过  中央  人民  广播  电台  、  中国  国际  广播  电台  和  中央  电视台  ，  向  全国  各族  人民  ，  向  香港  特别  行政区  同胞  、  澳门  和  台湾  同胞  、  海外  侨胞  ，  向  世界  各国  的  朋友  们  ，  致以  诚挚  的  问候  和  良好  的  祝愿  ！  
 占 比 57.8% > 40%
 占 比 57.8% < 40%
 占 比 57.8% < < 40% >
--- a/test/io/pipe/test_cws.py
+++ b/test/io/pipe/test_cws.py
@@ -22,3 +22,9 @@ class TestRunCWSPipe(unittest.TestCase):
                data_bundle = CWSPipe(bigrams=True, trigrams=True).\
                    process_from_file(f'test/data_for_tests/io/cws_{dataset_name}')
                print(data_bundle)
    def test_replace_number(self):
        """Check that '<', '>' and '<NUM>' end up in the 'chars' vocabulary.

        Runs CWSPipe with bigrams and ``replace_num_alpha`` enabled over the
        cws_pku test data and asserts that none of the three tokens maps to
        index 1.
        """
        # NOTE(review): index 1 is presumably the unknown-token index of the
        # vocabulary -- confirm against the Vocabulary implementation.
        pipe = CWSPipe(bigrams=True, replace_num_alpha=True)
        data_bundle = pipe.process_from_file(f'test/data_for_tests/io/cws_pku')
        for token in ('<', '>', '<NUM>'):
            index = data_bundle.get_vocab('chars').to_index(token)
            self.assertNotEqual(index, 1)