hummingbird
/
fastNLP

 
			
							import os
import re
import argparse
from opencc import OpenCC

# Module-wide OpenCC converter built with the "t2s" configuration.
# convert_file() runs each output line through it when need_cc is true; the
# dataset converters pass is_traditional(dataset) for that flag.
cc = OpenCC("t2s")

from utils import make_sure_path_exists, append_tags

# Input roots of the raw corpora and the output directory. They start empty
# and are overwritten by the command-line entry point at the bottom of the file.
sighan05_root = ""
sighan08_root = ""
data_path = ""

# Paired punctuation: the i-th character of C_pun is replaced by the i-th
# character of E_pun.
E_pun = u",.!?[]()<>\"\"'',"
C_pun = u"，。！？【】（）《》“”‘’、"
# Code-point translation table for str.translate(). The ideographic
# (full-width) space U+3000 is additionally mapped to an ASCII space.
Table = str.maketrans(C_pun + chr(12288), E_pun + chr(32))


def C_trans_to_E(string):
    """Return ``string`` with every character listed in ``Table`` replaced.

    ``Table`` maps the Chinese punctuation in ``C_pun`` to the ASCII
    punctuation in ``E_pun`` and the ideographic space to a plain space;
    all other characters are left unchanged.
    """
    translated = string.translate(Table)
    return translated


def normalize(ustring):
    """Convert full-width characters in ``ustring`` to their half-width forms.

    The ideographic space (U+3000) becomes an ASCII space, and every code
    point in the full-width block U+FF01..U+FF5E is shifted down by 0xFEE0
    onto the corresponding ASCII character. Any other character is kept.

    Args:
        ustring: the text to convert.

    Returns:
        A new string of the same length with the conversions applied.
    """
    converted = []
    for uchar in ustring:
        code = ord(uchar)
        if code == 12288:  # full-width space maps directly to ASCII space
            code = 32
        elif 65281 <= code <= 65374:  # other full-width chars: fixed offset
            code -= 65248
        converted.append(chr(code))
    # Join once instead of growing a string with += on every character.
    return "".join(converted)


def preprocess(text):
    """Normalize one raw line and return its whitespace-separated tokens.

    The line is stripped, passed through ``C_trans_to_E`` and ``normalize``
    and split on whitespace. Within each token, every number-like span
    (optional sign, digits, optional decimal part, optional percent sign)
    is replaced by ``"0"``; then everything from the first ASCII letter or
    underscore to the end of the token is replaced by ``"X"``.

    Args:
        text: one line of segmented text.

    Returns:
        A list of normalized token strings, in the original order.
    """
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown string escapes through, which is deprecated and emits
    # a warning on current interpreters. The pattern values are unchanged.
    space_re = re.compile(r"\s+", flags=re.U)
    num_re = re.compile(r"(-|\+)?\d+((\.|·)\d+)?%?", flags=re.U)
    eng_re = re.compile(r"[A-Za-z_]+.*")

    tokens = normalize(C_trans_to_E(text.strip())).split()
    new_sent = []
    for word in tokens:
        word = space_re.sub("", word)
        word = num_re.sub("0", word)
        word = eng_re.sub("X", word)
        new_sent.append(word)
    return new_sent


def to_sentence_list(text, split_long_sentence=False):
    """Split one line into sentences, each a list of preprocessed tokens.

    The line is tokenized with ``preprocess``. A sentence is closed right
    after a token that equals one of the delimiter strings, or, when
    ``split_long_sentence`` is true, as soon as the characters accumulated
    in the current sentence reach 50. Leftover tokens form a last sentence.

    Args:
        text: one raw line of segmented text.
        split_long_sentence: also cut sentences at the 50-character limit.

    Returns:
        A list of token lists; empty when the line yields no tokens.
    """
    # NOTE(review): preprocess() rewrites several full-width marks to ASCII
    # before this check, so some full-width entries below may never match,
    # and "." is not a delimiter here -- confirm that this is intended.
    boundary_tokens = set("。！？：；…、，（）,;!?、,\"'") | {"……"}

    sentences = []
    current = []
    char_count = 0
    for token in preprocess(text):
        current.append(token)
        char_count += len(token)
        too_long = split_long_sentence and char_count >= 50
        if token in boundary_tokens or too_long:
            sentences.append(current)
            current = []
            char_count = 0

    if current:
        sentences.append(current)
    return sentences


def is_traditional(dataset):
    """Return True when ``dataset`` is one of "as", "cityu" or "ckip".

    The dataset converters use the result as the ``need_cc`` flag of
    ``convert_file``.
    """
    traditional_corpora = ("as", "cityu", "ckip")
    return dataset in traditional_corpora


def convert_file(
    src, des, need_cc=False, split_long_sentence=False, encode="utf-8-sig"
):
    """Rewrite the corpus file ``src`` as one preprocessed sentence per line.

    Every input line is split with ``to_sentence_list``; each resulting
    sentence is written to ``des`` as its tokens joined by single spaces.

    Args:
        src: path of the input file, read with encoding ``encode``.
        des: path of the output file, written as UTF-8 (overwritten).
        need_cc: when true, each output line is passed through the
            module-level OpenCC converter ``cc`` before being written.
        split_long_sentence: forwarded to ``to_sentence_list``.
        encode: encoding used to read ``src``.
    """
    with open(src, encoding=encode) as reader, open(
        des, "w", encoding="utf-8"
    ) as writer:
        for raw_line in reader:
            for tokens in to_sentence_list(raw_line, split_long_sentence):
                rendered = " ".join(tokens) + "\n"
                writer.write(cc.convert(rendered) if need_cc else rendered)

def split_train_dev(dataset):
    """Split ``train-all.txt`` of ``dataset`` into ``train.txt`` and ``dev.txt``.

    The files live in ``<data_path>/<dataset>/raw/``. The first 90% of the
    lines (rounded down) go to ``train.txt`` and the rest to ``dev.txt``;
    line order is preserved and no shuffling is done.
    """
    raw_dir = data_path + "/" + dataset + "/raw/"
    with open(raw_dir + "train-all.txt", encoding="UTF-8") as full_set, open(
        raw_dir + "train.txt", "w", encoding="UTF-8"
    ) as train_out, open(raw_dir + "dev.txt", "w", encoding="UTF-8") as dev_out:
        all_lines = full_set.readlines()
        cut = int(len(all_lines) * 0.9)
        train_out.writelines(all_lines[:cut])
        dev_out.writelines(all_lines[cut:])


def combine_files(one, two, out):
    """Concatenate the text files ``one`` and ``two`` into ``out``.

    An existing ``out`` is deleted first, so the result holds exactly the
    lines of ``one`` followed by the lines of ``two``. All three files are
    handled as UTF-8.
    """
    if os.path.exists(out):
        os.remove(out)
    with open(one, encoding="utf-8") as first, open(
        two, encoding="utf-8"
    ) as second, open(out, "a", encoding="utf-8") as merged:
        for source in (first, second):
            merged.writelines(source)


def bmes_tag(input_file, output_file):
    """Convert a space-segmented text file into per-character B/M/E/S tags.

    For every whitespace-separated word of every input line, one
    ``<char>\\t<tag>`` row is written: a single-character word gets ``S``;
    a word longer than two characters that starts with ``<`` and ends with
    ``>`` is written whole with ``S``; any other word gets ``B`` on its
    first character, ``M`` on the inner ones and ``E`` on the last. An
    empty line follows the rows of each input line.

    Args:
        input_file: path of the UTF-8 segmented text to read.
        output_file: path of the UTF-8 tag file to write (overwritten).
    """
    with open(input_file, encoding="utf-8") as reader, open(
        output_file, "w", encoding="utf-8"
    ) as writer:
        for raw_line in reader:
            rows = []
            for token in raw_line.strip().split():
                is_markup = len(token) > 2 and token[0] == "<" and token[-1] == ">"
                if len(token) == 1 or is_markup:
                    rows.append(token + "\tS\n")
                    continue
                rows.append(token[0] + "\tB\n")
                rows.extend(char + "\tM\n" for char in token[1:-1])
                rows.append(token[-1] + "\tE\n")
            # Blank row marks the end of this input line.
            rows.append("\n")
            writer.writelines(rows)


def make_bmes(dataset="pku"):
    """Create the BMES-tagged files for every split of ``dataset``.

    Ensures ``<data_path>/<dataset>/bmes`` exists via
    ``make_sure_path_exists`` and runs ``bmes_tag`` on the ``train``,
    ``train-all``, ``dev`` and ``test`` files found under ``raw/``.
    """
    base = data_path + "/" + dataset + "/"
    make_sure_path_exists(base + "bmes")
    for part in ("train", "train-all", "dev", "test"):
        bmes_tag(base + "raw/" + part + ".txt", base + "bmes/" + part + ".txt")


def convert_sighan2005_dataset(dataset):
    """Convert one SIGHAN 2005 corpus into ``<data_path>/<dataset>/raw``.

    The training file ``<dataset>_training.utf8`` under ``sighan05_root``
    becomes ``train-all.txt`` (with long-sentence splitting). The gold test
    file becomes ``test.txt`` (without it); it is named
    ``<dataset>_testing_gold.utf8`` for "as" and
    ``<dataset>_test_gold.utf8`` otherwise. Datasets flagged by
    ``is_traditional`` are converted with OpenCC. Finally ``train-all.txt``
    is split into train and dev files.
    """
    out_dir = os.path.join(data_path, dataset)
    make_sure_path_exists(out_dir)
    make_sure_path_exists(out_dir + "/raw")
    traditional = is_traditional(dataset)

    train_src = "{}/{}_training.utf8".format(sighan05_root, dataset)
    convert_file(train_src, "{}/raw/train-all.txt".format(out_dir), traditional, True)

    # Only the "as" corpus names its gold file "testing" instead of "test".
    if dataset == "as":
        test_template = "{}/{}_testing_gold.utf8"
    else:
        test_template = "{}/{}_test_gold.utf8"
    test_src = test_template.format(sighan05_root, dataset)
    convert_file(test_src, "{}/raw/test.txt".format(out_dir), traditional, False)

    split_train_dev(dataset)


def convert_sighan2008_dataset(dataset, utf=16):
    """Convert one SIGHAN 2008 corpus into ``<data_path>/<dataset>/raw``.

    Reads ``<dataset>_train_utf<utf>.seg`` and the matching truth file from
    ``sighan08_root`` using the ``utf-<utf>`` encoding. Writes
    ``train-all.txt`` (with long-sentence splitting) and ``test.txt``
    (without it), converting with OpenCC for datasets flagged by
    ``is_traditional``. Finally ``train-all.txt`` is split into train and
    dev files.

    Args:
        dataset: corpus name used in the input file names and output path.
        utf: UTF width used in the input file names and as read encoding.
    """
    out_dir = os.path.join(data_path, dataset)
    make_sure_path_exists(out_dir)
    make_sure_path_exists(out_dir + "/raw")

    traditional = is_traditional(dataset)
    encoding = "utf-{}".format(utf)

    train_src = "{}/{}_train_utf{}.seg".format(sighan08_root, dataset, utf)
    convert_file(
        train_src, "{}/raw/train-all.txt".format(out_dir), traditional, True, encoding
    )

    test_src = "{}/{}_seg_truth&resource/{}_truth_utf{}.seg".format(
        sighan08_root, dataset, dataset, utf
    )
    convert_file(
        test_src, "{}/raw/test.txt".format(out_dir), traditional, False, encoding
    )

    split_train_dev(dataset)


def extract_conll(src, out):
    """Extract the word column of a CoNLL-style file, one sentence per line.

    Each non-blank line of ``src`` is split on whitespace and its second
    field is taken as the word. A blank line ends the current sentence,
    which is written to ``out`` as the collected words joined by spaces.
    A final sentence that is not followed by a blank line is written too.

    Args:
        src: path of the UTF-8 input file.
        out: path of the UTF-8 output file (overwritten).

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    words = []
    with open(src, encoding="utf-8") as reader, open(
        out, "w", encoding="utf-8"
    ) as writer:
        for line in reader:
            line = line.strip()
            if not line:
                writer.write(" ".join(words) + "\n")
                words = []
                continue
            words.append(line.split()[1])
        # Flush the last sentence when the file lacks a terminating blank
        # line; previously those words were silently dropped.
        if words:
            writer.write(" ".join(words) + "\n")


def make_joint_corpus(datasets, joint):
    """Build the joint corpus ``joint`` from the raw files of ``datasets``.

    For each split ("dev", "test", "train", "train-all") a stale
    ``<data_path>/<joint>/raw/<split>.txt`` is removed, or its directory is
    created when missing. ``append_tags`` is then called once per dataset
    with that dataset's raw directory, the joint raw directory, the dataset
    name and the split.

    Args:
        datasets: iterable of dataset names to merge, in order.
        joint: name of the joint corpus directory under ``data_path``.
    """
    # NOTE(review): append_tags comes from utils and is not visible here;
    # presumably it appends the tagged split to the joint file -- confirm.
    for part in ("dev", "test", "train", "train-all"):
        target = "{}/{}/raw/{}.txt".format(data_path, joint, part)
        target_dir = os.path.dirname(target)
        if os.path.exists(target):
            os.remove(target)
        elif not os.path.exists(target_dir):
            os.makedirs(target_dir)
        for name in datasets:
            source_dir = os.path.join(data_path, name, "raw")
            append_tags(source_dir, target_dir, name, part, encode="utf-8")


def convert_all_sighan2005(datasets):
    """Convert and BMES-tag every SIGHAN 2005 corpus named in ``datasets``.

    A progress message is printed for each corpus before it is processed.
    """
    for name in datasets:
        message = "Converting sighan bakeoff 2005 corpus: {}".format(name)
        print(message)
        convert_sighan2005_dataset(name)
        make_bmes(name)


def convert_all_sighan2008(datasets):
    """Convert and BMES-tag every SIGHAN 2008 corpus named in ``datasets``.

    Each corpus is read as UTF-16; a progress message is printed for each
    corpus before it is processed.
    """
    for name in datasets:
        message = "Converting sighan bakeoff 2008 corpus: {}".format(name)
        print(message)
        convert_sighan2008_dataset(name, 16)
        make_bmes(name)


if __name__ == "__main__":
    # Command-line entry point: read the three required paths, publish them
    # through the module-level globals the helpers read, then convert the
    # SIGHAN 2005 and 2008 corpora and build the two joint corpora.
    cli = argparse.ArgumentParser()
    # fmt: off
    cli.add_argument("--sighan05", required=True, type=str, help="path to sighan2005 dataset")
    cli.add_argument("--sighan08", required=True, type=str, help="path to sighan2008 dataset")
    cli.add_argument("--data_path", required=True, type=str, help="path to save dataset")
    # fmt: on

    # parse_known_args: unrecognized extra arguments are ignored, not fatal.
    options, _ = cli.parse_known_args()
    sighan05_root = options.sighan05
    sighan08_root = options.sighan08
    data_path = options.data_path

    sighan2005_sets = ("pku", "msr", "as", "cityu")
    print("Converting sighan2005 Simplified Chinese corpus")
    convert_all_sighan2005(sighan2005_sets)

    print("Combining sighan2005 corpus to one joint Simplified Chinese corpus")
    make_joint_corpus(sighan2005_sets, "joint-sighan2005")
    make_bmes("joint-sighan2005")

    # For researchers who have access to sighan2008 corpus, use official corpora please.
    # "cityu" is in both lists, so this pass rewrites the files written for
    # it by the 2005 pass above.
    sighan2008_sets = ("ctb", "ckip", "cityu", "ncc", "sxu")
    print("Converting sighan2008 Simplified Chinese corpus")
    convert_all_sighan2008(sighan2008_sets)

    print("Combining those 8 sighan corpora to one joint corpus")
    all_sets = ("pku", "msr", "as", "ctb", "ckip", "cityu", "ncc", "sxu")
    make_joint_corpus(all_sets, "joint-sighan2008")
    make_bmes("joint-sighan2008")