From d5bea4a75523ecb1340d165b4cd6c2cadcc94f72 Mon Sep 17 00:00:00 2001 From: yunfan Date: Thu, 10 Oct 2019 21:03:12 +0800 Subject: [PATCH] [add] reproduction of multi-criteria cws --- reproduction/multi-criteria-cws/README.md | 61 +++ .../multi-criteria-cws/data-prepare.py | 262 +++++++++ .../multi-criteria-cws/data-process.py | 166 ++++++ reproduction/multi-criteria-cws/main.py | 506 ++++++++++++++++++ reproduction/multi-criteria-cws/make_data.sh | 14 + reproduction/multi-criteria-cws/model.py | 0 reproduction/multi-criteria-cws/models.py | 200 +++++++ reproduction/multi-criteria-cws/optm.py | 49 ++ reproduction/multi-criteria-cws/train.py | 138 +++++ reproduction/multi-criteria-cws/train.sh | 26 + .../multi-criteria-cws/transformer.py | 152 ++++++ reproduction/multi-criteria-cws/utils.py | 308 +++++++++++ 12 files changed, 1882 insertions(+) create mode 100644 reproduction/multi-criteria-cws/README.md create mode 100644 reproduction/multi-criteria-cws/data-prepare.py create mode 100644 reproduction/multi-criteria-cws/data-process.py create mode 100644 reproduction/multi-criteria-cws/main.py create mode 100644 reproduction/multi-criteria-cws/make_data.sh create mode 100644 reproduction/multi-criteria-cws/model.py create mode 100644 reproduction/multi-criteria-cws/models.py create mode 100644 reproduction/multi-criteria-cws/optm.py create mode 100644 reproduction/multi-criteria-cws/train.py create mode 100644 reproduction/multi-criteria-cws/train.sh create mode 100644 reproduction/multi-criteria-cws/transformer.py create mode 100644 reproduction/multi-criteria-cws/utils.py diff --git a/reproduction/multi-criteria-cws/README.md b/reproduction/multi-criteria-cws/README.md new file mode 100644 index 00000000..0f4ab8d8 --- /dev/null +++ b/reproduction/multi-criteria-cws/README.md @@ -0,0 +1,61 @@ + + +# Multi-Criteria-CWS + +An implementation of [Multi-Criteria Chinese Word Segmentation with Transformer](http://arxiv.org/abs/1906.12035) with fastNLP. + +## Dataset +### Overview +We use the same datasets listed in paper. +- sighan2005 + - pku + - msr + - as + - cityu +- sighan2008 + - ctb + - ckip + - cityu (combined with data in sighan2005) + - ncc + - sxu + +### Preprocess +First, download OpenCC to convert between Traditional Chinese and Simplified Chinese. +``` shell +pip install opencc-python-reimplemented +``` +Then, set a path to save processed data, and run the shell script to process the data. +```shell +export DATA_DIR=path/to/processed-data +bash make_data.sh path/to/sighan2005 path/to/sighan2008 +``` +It would take a few minutes to finish the process. + +## Model +We use transformer to build the model, as described in paper. + +## Train +Finally, to train the model, run the shell script. +The `train.sh` takes one argument, the GPU-IDs to use, for example: +``` shell +bash train.sh 0,1 +``` +This command use GPUs with ID 0 and 1. + +Note: Please refer to the paper for details of hyper-parameters. And modify the settings in `train.sh` to match your experiment environment. + +Type +``` shell +python main.py --help +``` +to learn all arguments to be specified in training. + +## Performance + +Results on the test sets of eight CWS datasets with multi-criteria learning. + +| Dataset | MSRA | AS | PKU | CTB | CKIP | CITYU | NCC | SXU | Avg. 
| +| -------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| Original paper | 98.05 | 96.44 | 96.41 | 96.99 | 96.51 | 96.91 | 96.04 | 97.61 | 96.87 | +| Ours | 96.92 | 95.71 | 95.65 | 95.96 | 96.00 | 96.09 | 94.61 | 96.64 | 95.95 | + diff --git a/reproduction/multi-criteria-cws/data-prepare.py b/reproduction/multi-criteria-cws/data-prepare.py new file mode 100644 index 00000000..1d6e89b5 --- /dev/null +++ b/reproduction/multi-criteria-cws/data-prepare.py @@ -0,0 +1,262 @@ +import os +import re +import argparse +from opencc import OpenCC + +cc = OpenCC("t2s") + +from utils import make_sure_path_exists, append_tags + +sighan05_root = "" +sighan08_root = "" +data_path = "" + +E_pun = u",.!?[]()<>\"\"''," +C_pun = u",。!?【】()《》“”‘’、" +Table = {ord(f): ord(t) for f, t in zip(C_pun, E_pun)} +Table[12288] = 32 # 全半角空格 + + +def C_trans_to_E(string): + return string.translate(Table) + + +def normalize(ustring): + """全角转半角""" + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65281 <= inside_code <= 65374: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + + rstring += chr(inside_code) + return rstring + + +def preprocess(text): + rNUM = u"(-|\+)?\d+((\.|·)\d+)?%?" + rENG = u"[A-Za-z_]+.*" + sent = normalize(C_trans_to_E(text.strip())).split() + new_sent = [] + for word in sent: + word = re.sub(u"\s+", "", word, flags=re.U) + word = re.sub(rNUM, u"0", word, flags=re.U) + word = re.sub(rENG, u"X", word) + new_sent.append(word) + return new_sent + + +def to_sentence_list(text, split_long_sentence=False): + text = preprocess(text) + delimiter = set() + delimiter.update("。!?:;…、,(),;!?、,\"'") + delimiter.add("……") + sent_list = [] + sent = [] + sent_len = 0 + for word in text: + sent.append(word) + sent_len += len(word) + if word in delimiter or (split_long_sentence and sent_len >= 50): + sent_list.append(sent) + sent = [] + sent_len = 0 + + if len(sent) > 0: + sent_list.append(sent) + + return sent_list + + +def is_traditional(dataset): + return dataset in ["as", "cityu", "ckip"] + + +def convert_file( + src, des, need_cc=False, split_long_sentence=False, encode="utf-8-sig" +): + with open(src, encoding=encode) as src, open(des, "w", encoding="utf-8") as des: + for line in src: + for sent in to_sentence_list(line, split_long_sentence): + line = " ".join(sent) + "\n" + if need_cc: + line = cc.convert(line) + des.write(line) + # if len(''.join(sent)) > 200: + # print(' '.join(sent)) + + +def split_train_dev(dataset): + root = data_path + "/" + dataset + "/raw/" + with open(root + "train-all.txt", encoding="UTF-8") as src, open( + root + "train.txt", "w", encoding="UTF-8" + ) as train, open(root + "dev.txt", "w", encoding="UTF-8") as dev: + lines = src.readlines() + idx = int(len(lines) * 0.9) + for line in lines[:idx]: + train.write(line) + for line in lines[idx:]: + dev.write(line) + + +def combine_files(one, two, out): + if os.path.exists(out): + os.remove(out) + with open(one, encoding="utf-8") as one, open(two, encoding="utf-8") as two, open( + out, "a", encoding="utf-8" + ) as out: + for line in one: + out.write(line) + for line in two: + out.write(line) + + +def bmes_tag(input_file, output_file): + with open(input_file, encoding="utf-8") as input_data, open( + output_file, "w", encoding="utf-8" + ) as output_data: + for line in input_data: + word_list = line.strip().split() + for word in word_list: + if len(word) == 1 or ( + len(word) > 2 and word[0] == "<" and word[-1] == ">" + ): + 
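+                    # Single-character words and dataset tags such as "<pku>"
+                    # are labelled S; longer words are tagged B (first char),
+                    # M (middle chars) and E (last char) below.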
output_data.write(word + "\tS\n") + else: + output_data.write(word[0] + "\tB\n") + for w in word[1 : len(word) - 1]: + output_data.write(w + "\tM\n") + output_data.write(word[len(word) - 1] + "\tE\n") + output_data.write("\n") + + +def make_bmes(dataset="pku"): + path = data_path + "/" + dataset + "/" + make_sure_path_exists(path + "bmes") + bmes_tag(path + "raw/train.txt", path + "bmes/train.txt") + bmes_tag(path + "raw/train-all.txt", path + "bmes/train-all.txt") + bmes_tag(path + "raw/dev.txt", path + "bmes/dev.txt") + bmes_tag(path + "raw/test.txt", path + "bmes/test.txt") + + +def convert_sighan2005_dataset(dataset): + global sighan05_root + root = os.path.join(data_path, dataset) + make_sure_path_exists(root) + make_sure_path_exists(root + "/raw") + file_path = "{}/{}_training.utf8".format(sighan05_root, dataset) + convert_file( + file_path, "{}/raw/train-all.txt".format(root), is_traditional(dataset), True + ) + if dataset == "as": + file_path = "{}/{}_testing_gold.utf8".format(sighan05_root, dataset) + else: + file_path = "{}/{}_test_gold.utf8".format(sighan05_root, dataset) + convert_file( + file_path, "{}/raw/test.txt".format(root), is_traditional(dataset), False + ) + split_train_dev(dataset) + + +def convert_sighan2008_dataset(dataset, utf=16): + global sighan08_root + root = os.path.join(data_path, dataset) + make_sure_path_exists(root) + make_sure_path_exists(root + "/raw") + convert_file( + "{}/{}_train_utf{}.seg".format(sighan08_root, dataset, utf), + "{}/raw/train-all.txt".format(root), + is_traditional(dataset), + True, + "utf-{}".format(utf), + ) + convert_file( + "{}/{}_seg_truth&resource/{}_truth_utf{}.seg".format( + sighan08_root, dataset, dataset, utf + ), + "{}/raw/test.txt".format(root), + is_traditional(dataset), + False, + "utf-{}".format(utf), + ) + split_train_dev(dataset) + + +def extract_conll(src, out): + words = [] + with open(src, encoding="utf-8") as src, open(out, "w", encoding="utf-8") as out: + for line in src: + line = line.strip() + if len(line) == 0: + out.write(" ".join(words) + "\n") + words = [] + continue + cells = line.split() + words.append(cells[1]) + + +def make_joint_corpus(datasets, joint): + parts = ["dev", "test", "train", "train-all"] + for part in parts: + old_file = "{}/{}/raw/{}.txt".format(data_path, joint, part) + if os.path.exists(old_file): + os.remove(old_file) + elif not os.path.exists(os.path.dirname(old_file)): + os.makedirs(os.path.dirname(old_file)) + for name in datasets: + append_tags( + os.path.join(data_path, name, "raw"), + os.path.dirname(old_file), + name, + part, + encode="utf-8", + ) + + +def convert_all_sighan2005(datasets): + for dataset in datasets: + print(("Converting sighan bakeoff 2005 corpus: {}".format(dataset))) + convert_sighan2005_dataset(dataset) + make_bmes(dataset) + + +def convert_all_sighan2008(datasets): + for dataset in datasets: + print(("Converting sighan bakeoff 2008 corpus: {}".format(dataset))) + convert_sighan2008_dataset(dataset, 16) + make_bmes(dataset) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument("--sighan05", required=True, type=str, help="path to sighan2005 dataset") + parser.add_argument("--sighan08", required=True, type=str, help="path to sighan2008 dataset") + parser.add_argument("--data_path", required=True, type=str, help="path to save dataset") + # fmt: on + + args, _ = parser.parse_known_args() + sighan05_root = args.sighan05 + sighan08_root = args.sighan08 + data_path = args.data_path + + print("Converting sighan2005 
Simplified Chinese corpus") + datasets = "pku", "msr", "as", "cityu" + convert_all_sighan2005(datasets) + + print("Combining sighan2005 corpus to one joint Simplified Chinese corpus") + datasets = "pku", "msr", "as", "cityu" + make_joint_corpus(datasets, "joint-sighan2005") + make_bmes("joint-sighan2005") + + # For researchers who have access to sighan2008 corpus, use official corpora please. + print("Converting sighan2008 Simplified Chinese corpus") + datasets = "ctb", "ckip", "cityu", "ncc", "sxu" + convert_all_sighan2008(datasets) + print("Combining those 8 sighan corpora to one joint corpus") + datasets = "pku", "msr", "as", "ctb", "ckip", "cityu", "ncc", "sxu" + make_joint_corpus(datasets, "joint-sighan2008") + make_bmes("joint-sighan2008") + diff --git a/reproduction/multi-criteria-cws/data-process.py b/reproduction/multi-criteria-cws/data-process.py new file mode 100644 index 00000000..829580ef --- /dev/null +++ b/reproduction/multi-criteria-cws/data-process.py @@ -0,0 +1,166 @@ +import os +import sys + +import codecs +import argparse +from _pickle import load, dump +import collections +from utils import get_processing_word, is_dataset_tag, make_sure_path_exists, get_bmes +from fastNLP import Instance, DataSet, Vocabulary, Const + +max_len = 0 + + +def expand(x): + sent = [""] + x[1:] + [""] + return [x + y for x, y in zip(sent[:-1], sent[1:])] + + +def read_file(filename, processing_word=get_processing_word(lowercase=False)): + dataset = DataSet() + niter = 0 + with codecs.open(filename, "r", "utf-8-sig") as f: + words, tags = [], [] + for line in f: + line = line.strip() + if len(line) == 0 or line.startswith("-DOCSTART-"): + if len(words) != 0: + assert len(words) > 2 + if niter == 1: + print(words, tags) + niter += 1 + dataset.append(Instance(ori_words=words[:-1], ori_tags=tags[:-1])) + words, tags = [], [] + else: + word, tag = line.split() + word = processing_word(word) + words.append(word) + tags.append(tag.lower()) + + dataset.apply_field(lambda x: [x[0]], field_name="ori_words", new_field_name="task") + dataset.apply_field( + lambda x: len(x), field_name="ori_tags", new_field_name="seq_len" + ) + dataset.apply_field( + lambda x: expand(x), field_name="ori_words", new_field_name="bi1" + ) + return dataset + + +def main(): + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument("--data_path", required=True, type=str, help="all of datasets pkl paths") + # fmt: on + + options, _ = parser.parse_known_args() + + train_set, test_set = DataSet(), DataSet() + + input_dir = os.path.join(options.data_path, "joint-sighan2008/bmes") + options.output = os.path.join(options.data_path, "total_dataset.pkl") + print(input_dir, options.output) + + for fn in os.listdir(input_dir): + if fn not in ["test.txt", "train-all.txt"]: + continue + print(fn) + abs_fn = os.path.join(input_dir, fn) + ds = read_file(abs_fn) + if "test.txt" == fn: + test_set = ds + else: + train_set = ds + + print( + "num samples of total train, test: {}, {}".format(len(train_set), len(test_set)) + ) + + uni_vocab = Vocabulary(min_freq=None).from_dataset( + train_set, test_set, field_name="ori_words" + ) + # bi_vocab = Vocabulary(min_freq=3, max_size=50000).from_dataset(train_set,test_set, field_name="bi1") + bi_vocab = Vocabulary(min_freq=3, max_size=None).from_dataset( + train_set, field_name="bi1", no_create_entry_dataset=[test_set] + ) + tag_vocab = Vocabulary(min_freq=None, padding="s", unknown=None).from_dataset( + train_set, field_name="ori_tags" + ) + task_vocab = Vocabulary(min_freq=None, 
padding=None, unknown=None).from_dataset( + train_set, field_name="task" + ) + + def to_index(dataset): + uni_vocab.index_dataset(dataset, field_name="ori_words", new_field_name="uni") + tag_vocab.index_dataset(dataset, field_name="ori_tags", new_field_name="tags") + task_vocab.index_dataset(dataset, field_name="task", new_field_name="task") + + dataset.apply_field(lambda x: x[1:], field_name="bi1", new_field_name="bi2") + dataset.apply_field(lambda x: x[:-1], field_name="bi1", new_field_name="bi1") + bi_vocab.index_dataset(dataset, field_name="bi1", new_field_name="bi1") + bi_vocab.index_dataset(dataset, field_name="bi2", new_field_name="bi2") + + dataset.set_input("task", "uni", "bi1", "bi2", "seq_len") + dataset.set_target("tags") + return dataset + + train_set = to_index(train_set) + test_set = to_index(test_set) + + output = {} + output["train_set"] = train_set + output["test_set"] = test_set + output["uni_vocab"] = uni_vocab + output["bi_vocab"] = bi_vocab + output["tag_vocab"] = tag_vocab + output["task_vocab"] = task_vocab + + print(tag_vocab.word2idx) + print(task_vocab.word2idx) + + make_sure_path_exists(os.path.dirname(options.output)) + + print("Saving dataset to {}".format(os.path.abspath(options.output))) + with open(options.output, "wb") as outfile: + dump(output, outfile) + + print(len(task_vocab), len(tag_vocab), len(uni_vocab), len(bi_vocab)) + dic = {} + tokens = {} + + def process(words): + name = words[0][1:-1] + if name not in dic: + dic[name] = set() + tokens[name] = 0 + tokens[name] += len(words[1:]) + dic[name].update(words[1:]) + + train_set.apply_field(process, "ori_words", None) + for name in dic.keys(): + print(name, len(dic[name]), tokens[name]) + + with open(os.path.join(os.path.dirname(options.output), "oovdict.pkl"), "wb") as f: + dump(dic, f) + + def get_max_len(ds): + global max_len + max_len = 0 + + def find_max_len(words): + global max_len + if max_len < len(words): + max_len = len(words) + + ds.apply_field(find_max_len, "ori_words", None) + return max_len + + print( + "train max len: {}, test max len: {}".format( + get_max_len(train_set), get_max_len(test_set) + ) + ) + + +if __name__ == "__main__": + main() diff --git a/reproduction/multi-criteria-cws/main.py b/reproduction/multi-criteria-cws/main.py new file mode 100644 index 00000000..049a1974 --- /dev/null +++ b/reproduction/multi-criteria-cws/main.py @@ -0,0 +1,506 @@ +import _pickle as pickle +import argparse +import collections +import logging +import math +import os +import pickle +import random +import sys +import time +from sys import maxsize + +import fastNLP +import fastNLP.embeddings +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +from fastNLP import BucketSampler, DataSetIter, SequentialSampler, logger +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data.distributed import DistributedSampler + +import models +import optm +import utils + +NONE_TAG = "" +START_TAG = "" +END_TAG = "" + +DEFAULT_WORD_EMBEDDING_SIZE = 100 +DEBUG_SCALE = 200 + +# ===-----------------------------------------------------------------------=== +# Argument parsing +# ===-----------------------------------------------------------------------=== +# fmt: off +parser = argparse.ArgumentParser() +parser.add_argument("--dataset", required=True, dest="dataset", help="processed data dir") +parser.add_argument("--word-embeddings", dest="word_embeddings", help="File from which to read in pretrained embeds") +parser.add_argument("--bigram-embeddings", 
dest="bigram_embeddings", help="File from which to read in pretrained embeds") +parser.add_argument("--crf", dest="crf", action="store_true", help="crf") +# parser.add_argument("--devi", default="0", dest="devi", help="gpu") +parser.add_argument("--step", default=0, dest="step", type=int,help="step") +parser.add_argument("--num-epochs", default=100, dest="num_epochs", type=int, + help="Number of full passes through training set") +parser.add_argument("--batch-size", default=128, dest="batch_size", type=int, + help="Minibatch size of training set") +parser.add_argument("--d_model", default=256, dest="d_model", type=int, help="d_model") +parser.add_argument("--d_ff", default=1024, dest="d_ff", type=int, help="d_ff") +parser.add_argument("--N", default=6, dest="N", type=int, help="N") +parser.add_argument("--h", default=4, dest="h", type=int, help="h") +parser.add_argument("--factor", default=2, dest="factor", type=float, help="Initial learning rate") +parser.add_argument("--dropout", default=0.2, dest="dropout", type=float, + help="Amount of dropout(not keep rate, but drop rate) to apply to embeddings part of graph") +parser.add_argument("--log-dir", default="result", dest="log_dir", + help="Directory where to write logs / serialized models") +parser.add_argument("--task-name", default=time.strftime("%Y-%m-%d-%H-%M-%S"), dest="task_name", + help="Name for this task, use a comprehensive one") +parser.add_argument("--no-model", dest="no_model", action="store_true", help="Don't serialize model") +parser.add_argument("--always-model", dest="always_model", action="store_true", + help="Always serialize model after every epoch") +parser.add_argument("--old-model", dest="old_model", help="Path to old model for incremental training") +parser.add_argument("--skip-dev", dest="skip_dev", action="store_true", help="Skip dev set, would save some time") +parser.add_argument("--freeze", dest="freeze", action="store_true", help="freeze pretrained embedding") +parser.add_argument("--only-task", dest="only_task", action="store_true", help="only train task embedding") +parser.add_argument("--subset", dest="subset", help="Only train and test on a subset of the whole dataset") +parser.add_argument("--seclude", dest="seclude", help="train and test except a subset") +parser.add_argument("--instances", default=None, dest="instances", type=int,help="num of instances of subset") + +parser.add_argument("--seed", dest="python_seed", type=int, default=random.randrange(maxsize), + help="Random seed of Python and NumPy") +parser.add_argument("--debug", dest="debug", default=False, action="store_true", help="Debug mode") +parser.add_argument("--test", dest="test", action="store_true", help="Test mode") +parser.add_argument('--local_rank', type=int, default=None) +parser.add_argument('--init_method', type=str, default='env://') +# fmt: on + +options, _ = parser.parse_known_args() +print("unknown args", _) +task_name = options.task_name +root_dir = "{}/{}".format(options.log_dir, task_name) +utils.make_sure_path_exists(root_dir) + +if options.local_rank is not None: + torch.cuda.set_device(options.local_rank) + dist.init_process_group("nccl", init_method=options.init_method) + + +def init_logger(): + if not os.path.exists(root_dir): + os.mkdir(root_dir) + log_formatter = logging.Formatter("%(asctime)s - %(message)s") + logger = logging.getLogger() + file_handler = logging.FileHandler("{0}/info.log".format(root_dir), mode="w") + file_handler.setFormatter(log_formatter) + logger.addHandler(file_handler) + console_handler = 
logging.StreamHandler() + console_handler.setFormatter(log_formatter) + logger.addHandler(console_handler) + if options.local_rank is None or options.local_rank == 0: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.WARNING) + return logger + + +# ===-----------------------------------------------------------------------=== +# Set up logging +# ===-----------------------------------------------------------------------=== +# logger = init_logger() +logger.add_file("{}/info.log".format(root_dir), "INFO") +logger.setLevel(logging.INFO if dist.get_rank() == 0 else logging.WARNING) + +# ===-----------------------------------------------------------------------=== +# Log some stuff about this run +# ===-----------------------------------------------------------------------=== +logger.info(" ".join(sys.argv)) +logger.info("") +logger.info(options) + +if options.debug: + logger.info("DEBUG MODE") + options.num_epochs = 2 + options.batch_size = 20 + +random.seed(options.python_seed) +np.random.seed(options.python_seed % (2 ** 32 - 1)) +torch.cuda.manual_seed_all(options.python_seed) +logger.info("Python random seed: {}".format(options.python_seed)) + +# ===-----------------------------------------------------------------------=== +# Read in dataset +# ===-----------------------------------------------------------------------=== +dataset = pickle.load(open(options.dataset + "/total_dataset.pkl", "rb")) +train_set = dataset["train_set"] +test_set = dataset["test_set"] +uni_vocab = dataset["uni_vocab"] +bi_vocab = dataset["bi_vocab"] +task_vocab = dataset["task_vocab"] +tag_vocab = dataset["tag_vocab"] +for v in (bi_vocab, uni_vocab, tag_vocab, task_vocab): + if hasattr(v, "_word2idx"): + v.word2idx = v._word2idx +for ds in (train_set, test_set): + ds.rename_field("ori_words", "words") + +logger.info("{} {}".format(bi_vocab.to_word(0), tag_vocab.word2idx)) +logger.info(task_vocab.word2idx) +if options.skip_dev: + dev_set = test_set +else: + train_set, dev_set = train_set.split(0.1) + +logger.info("{} {} {}".format(len(train_set), len(dev_set), len(test_set))) + +if options.debug: + train_set = train_set[0:DEBUG_SCALE] + dev_set = dev_set[0:DEBUG_SCALE] + test_set = test_set[0:DEBUG_SCALE] + +# ===-----------------------------------------------------------------------=== +# Build model and trainer +# ===-----------------------------------------------------------------------=== + +# =============================== +if dist.get_rank() != 0: + dist.barrier() + +if options.word_embeddings is None: + init_embedding = None +else: + # logger.info("Load: {}".format(options.word_embeddings)) + # init_embedding = utils.embedding_load_with_cache(options.word_embeddings, options.cache_dir, uni_vocab, normalize=False) + init_embedding = fastNLP.embeddings.StaticEmbedding( + uni_vocab, options.word_embeddings, word_drop=0.01 + ) + +bigram_embedding = None +if options.bigram_embeddings: + # logger.info("Load: {}".format(options.bigram_embeddings)) + # bigram_embedding = utils.embedding_load_with_cache(options.bigram_embeddings, options.cache_dir, bi_vocab, normalize=False) + bigram_embedding = fastNLP.embeddings.StaticEmbedding( + bi_vocab, options.bigram_embeddings + ) + +if dist.get_rank() == 0: + dist.barrier() +# =============================== + +# select subset training +if options.seclude is not None: + setname = "<{}>".format(options.seclude) + logger.info("seclude {}".format(setname)) + train_set.drop(lambda x: x["words"][0] == setname, inplace=True) + test_set.drop(lambda x: x["words"][0] 
== setname, inplace=True) + dev_set.drop(lambda x: x["words"][0] == setname, inplace=True) + +if options.subset is not None: + setname = "<{}>".format(options.subset) + logger.info("select {}".format(setname)) + train_set.drop(lambda x: x["words"][0] != setname, inplace=True) + test_set.drop(lambda x: x["words"][0] != setname, inplace=True) + dev_set.drop(lambda x: x["words"][0] != setname, inplace=True) + +# build model and optimizer +i2t = None +if options.crf: + # i2t=utils.to_id_list(tag_vocab.word2idx) + i2t = {} + for x, y in tag_vocab.word2idx.items(): + i2t[y] = x + logger.info(i2t) + +freeze = True if options.freeze else False +model = models.make_CWS( + d_model=options.d_model, + N=options.N, + h=options.h, + d_ff=options.d_ff, + dropout=options.dropout, + word_embedding=init_embedding, + bigram_embedding=bigram_embedding, + tag_size=len(tag_vocab), + task_size=len(task_vocab), + crf=i2t, + freeze=freeze, +) + +device = "cpu" + +if torch.cuda.device_count() > 0: + if options.local_rank is not None: + device = "cuda:{}".format(options.local_rank) + # model=nn.DataParallel(model) + model = model.to(device) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[options.local_rank], output_device=options.local_rank + ) + else: + device = "cuda:0" + model.to(device) + + +if options.only_task and options.old_model is not None: + logger.info("fix para except task embedding") + for name, para in model.named_parameters(): + if name.find("task_embed") == -1: + para.requires_grad = False + else: + para.requires_grad = True + logger.info(name) + +optimizer = optm.NoamOpt( + options.d_model, + options.factor, + 4000, + torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9), +) + +optimizer._step = options.step + +best_model_file_name = "{}/model.bin".format(root_dir) + +if options.local_rank is None: + train_sampler = BucketSampler( + batch_size=options.batch_size, seq_len_field_name="seq_len" + ) +else: + train_sampler = DistributedSampler( + train_set, dist.get_world_size(), dist.get_rank() + ) +dev_sampler = SequentialSampler() + +i2t = utils.to_id_list(tag_vocab.word2idx) +i2task = utils.to_id_list(task_vocab.word2idx) +dev_set.set_input("words") +test_set.set_input("words") +test_batch = DataSetIter(test_set, options.batch_size, num_workers=2) + +word_dic = pickle.load(open(options.dataset + "/oovdict.pkl", "rb")) + + +def batch_to_device(batch, device): + for k, v in batch.items(): + if torch.is_tensor(v): + batch[k] = v.to(device) + return batch + + +def tester(model, test_batch, write_out=False): + res = [] + prf = utils.CWSEvaluator(i2t) + prf_dataset = {} + oov_dataset = {} + + logger.info("start evaluation") + # import ipdb; ipdb.set_trace() + with torch.no_grad(): + for batch_x, batch_y in test_batch: + batch_to_device(batch_x, device) + # batch_to_device(batch_y, device) + if bigram_embedding is not None: + out = model( + batch_x["task"], + batch_x["uni"], + batch_x["seq_len"], + batch_x["bi1"], + batch_x["bi2"], + ) + else: + out = model(batch_x["task"], batch_x["uni"], batch_x["seq_len"]) + out = out["pred"] + # print(out) + num = out.size(0) + out = out.detach().cpu().numpy() + for i in range(num): + length = int(batch_x["seq_len"][i]) + + out_tags = out[i, 1:length].tolist() + sentence = batch_x["words"][i] + gold_tags = batch_y["tags"][i][1:length].numpy().tolist() + dataset_name = sentence[0] + sentence = sentence[1:] + # print(out_tags,gold_tags) + assert utils.is_dataset_tag(dataset_name), dataset_name + assert len(gold_tags) == 
len(out_tags) and len(gold_tags) == len( + sentence + ) + + if dataset_name not in prf_dataset: + prf_dataset[dataset_name] = utils.CWSEvaluator(i2t) + oov_dataset[dataset_name] = utils.CWS_OOV( + word_dic[dataset_name[1:-1]] + ) + + prf_dataset[dataset_name].add_instance(gold_tags, out_tags) + prf.add_instance(gold_tags, out_tags) + + if write_out: + gold_strings = utils.to_tag_strings(i2t, gold_tags) + obs_strings = utils.to_tag_strings(i2t, out_tags) + + word_list = utils.bmes_to_words(sentence, obs_strings) + oov_dataset[dataset_name].update( + utils.bmes_to_words(sentence, gold_strings), word_list + ) + + raw_string = " ".join(word_list) + res.append(dataset_name + " " + raw_string + " " + dataset_name) + + Ap = 0.0 + Ar = 0.0 + Af = 0.0 + Aoov = 0.0 + tot = 0 + nw = 0.0 + for dataset_name, performance in sorted(prf_dataset.items()): + p = performance.result() + if write_out: + nw = oov_dataset[dataset_name].oov() + # nw = 0 + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + dataset_name, p[0], p[1], p[2], nw + ) + ) + else: + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + dataset_name, p[0], p[1], p[2] + ) + ) + Ap += p[0] + Ar += p[1] + Af += p[2] + Aoov += nw + tot += 1 + + prf = prf.result() + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format("TOT", prf[0], prf[1], prf[2]) + ) + if not write_out: + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + "AVG", Ap / tot, Ar / tot, Af / tot + ) + ) + else: + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + "AVG", Ap / tot, Ar / tot, Af / tot, Aoov / tot + ) + ) + return prf[-1], res + + +# start training +if not options.test: + if options.old_model: + # incremental training + logger.info("Incremental training from old model: {}".format(options.old_model)) + model.load_state_dict(torch.load(options.old_model, map_location="cuda:0")) + + logger.info("Number training instances: {}".format(len(train_set))) + logger.info("Number dev instances: {}".format(len(dev_set))) + + train_batch = DataSetIter( + batch_size=options.batch_size, + dataset=train_set, + sampler=train_sampler, + num_workers=4, + ) + dev_batch = DataSetIter( + batch_size=options.batch_size, + dataset=dev_set, + sampler=dev_sampler, + num_workers=4, + ) + + best_f1 = 0.0 + for epoch in range(int(options.num_epochs)): + logger.info("Epoch {} out of {}".format(epoch + 1, options.num_epochs)) + train_loss = 0.0 + model.train() + tot = 0 + t1 = time.time() + for batch_x, batch_y in train_batch: + model.zero_grad() + if bigram_embedding is not None: + out = model( + batch_x["task"], + batch_x["uni"], + batch_x["seq_len"], + batch_x["bi1"], + batch_x["bi2"], + batch_y["tags"], + ) + else: + out = model( + batch_x["task"], batch_x["uni"], batch_x["seq_len"], batch_y["tags"] + ) + loss = out["loss"] + train_loss += loss.item() + tot += 1 + loss.backward() + # nn.utils.clip_grad_value_(model.parameters(), 1) + optimizer.step() + + t2 = time.time() + train_loss = train_loss / tot + logger.info( + "time: {} loss: {} step: {}".format(t2 - t1, train_loss, optimizer._step) + ) + # Evaluate dev data + if options.skip_dev and dist.get_rank() == 0: + logger.info("Saving model to {}".format(best_model_file_name)) + torch.save(model.module.state_dict(), best_model_file_name) + continue + + model.eval() + if dist.get_rank() == 0: + f1, _ = tester(model.module, dev_batch) + if f1 > best_f1: + best_f1 = f1 + logger.info("- new best score!") + if not options.no_model: + logger.info("Saving model to 
{}".format(best_model_file_name)) + torch.save(model.module.state_dict(), best_model_file_name) + + elif options.always_model: + logger.info("Saving model to {}".format(best_model_file_name)) + torch.save(model.module.state_dict(), best_model_file_name) + dist.barrier() + +# Evaluate test data (once) +logger.info("\nNumber test instances: {}".format(len(test_set))) + + +if not options.skip_dev: + if options.test: + model.module.load_state_dict( + torch.load(options.old_model, map_location="cuda:0") + ) + else: + model.module.load_state_dict( + torch.load(best_model_file_name, map_location="cuda:0") + ) + +if dist.get_rank() == 0: + for name, para in model.named_parameters(): + if name.find("task_embed") != -1: + tm = para.detach().cpu().numpy() + logger.info(tm.shape) + np.save("{}/task.npy".format(root_dir), tm) + break + +_, res = tester(model.module, test_batch, True) + +if dist.get_rank() == 0: + with open("{}/testout.txt".format(root_dir), "w", encoding="utf-8") as raw_writer: + for sent in res: + raw_writer.write(sent) + raw_writer.write("\n") + diff --git a/reproduction/multi-criteria-cws/make_data.sh b/reproduction/multi-criteria-cws/make_data.sh new file mode 100644 index 00000000..9c2b09d8 --- /dev/null +++ b/reproduction/multi-criteria-cws/make_data.sh @@ -0,0 +1,14 @@ +if [ -z "$DATA_DIR" ] +then + DATA_DIR="./data" +fi + +mkdir -vp $DATA_DIR + +cmd="python -u ./data-prepare.py --sighan05 $1 --sighan08 $2 --data_path $DATA_DIR" +echo $cmd +eval $cmd + +cmd="python -u ./data-process.py --data_path $DATA_DIR" +echo $cmd +eval $cmd diff --git a/reproduction/multi-criteria-cws/model.py b/reproduction/multi-criteria-cws/model.py new file mode 100644 index 00000000..e69de29b diff --git a/reproduction/multi-criteria-cws/models.py b/reproduction/multi-criteria-cws/models.py new file mode 100644 index 00000000..965da651 --- /dev/null +++ b/reproduction/multi-criteria-cws/models.py @@ -0,0 +1,200 @@ +import fastNLP +import torch +import math +from fastNLP.modules.encoder.transformer import TransformerEncoder +from fastNLP.modules.decoder.crf import ConditionalRandomField +from fastNLP import Const +import copy +import numpy as np +from torch.autograd import Variable +import torch.autograd as autograd +import torch.nn as nn +import torch.nn.functional as F +import transformer + + +class PositionalEncoding(nn.Module): + "Implement the PE function." + + def __init__(self, d_model, dropout, max_len=512): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + # Compute the positional encodings once in log space. 
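+        # PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
+        # PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
+        # div_term holds the inverse frequencies, computed via exp/log below
+        # for numerical stability.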
+ pe = torch.zeros(max_len, d_model).float() + position = torch.arange(0, max_len).unsqueeze(1).float() + div_term = torch.exp( + torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer("pe", pe) + + def forward(self, x): + x = x + Variable(self.pe[:, : x.size(1)], requires_grad=False) + return self.dropout(x) + + +class Embedding(nn.Module): + def __init__( + self, + task_size, + d_model, + word_embedding=None, + bi_embedding=None, + word_size=None, + freeze=True, + ): + super(Embedding, self).__init__() + self.task_size = task_size + self.embed_dim = 0 + + self.task_embed = nn.Embedding(task_size, d_model) + if word_embedding is not None: + # self.uni_embed = nn.Embedding.from_pretrained(torch.FloatTensor(word_embedding), freeze=freeze) + # self.embed_dim+=word_embedding.shape[1] + self.uni_embed = word_embedding + self.embed_dim += word_embedding.embedding_dim + else: + if bi_embedding is not None: + self.embed_dim += bi_embedding.shape[1] + else: + self.embed_dim = d_model + assert word_size is not None + self.uni_embed = Embedding(word_size, self.embed_dim) + + if bi_embedding is not None: + # self.bi_embed = nn.Embedding.from_pretrained(torch.FloatTensor(bi_embedding), freeze=freeze) + # self.embed_dim += bi_embedding.shape[1]*2 + self.bi_embed = bi_embedding + self.embed_dim += bi_embedding.embedding_dim * 2 + + print("Trans Freeze", freeze, self.embed_dim) + + if d_model != self.embed_dim: + self.F = nn.Linear(self.embed_dim, d_model) + else: + self.F = None + + self.d_model = d_model + + def forward(self, task, uni, bi1=None, bi2=None): + y_task = self.task_embed(task[:, 0:1]) + y = self.uni_embed(uni[:, 1:]) + if bi1 is not None: + assert self.bi_embed is not None + + y = torch.cat([y, self.bi_embed(bi1), self.bi_embed(bi2)], dim=-1) + # y2=self.bi_embed(bi) + # y=torch.cat([y,y2[:,:-1,:],y2[:,1:,:]],dim=-1) + + # y=torch.cat([y_task,y],dim=1) + if self.F is not None: + y = self.F(y) + y = torch.cat([y_task, y], dim=1) + return y * math.sqrt(self.d_model) + + +def seq_len_to_mask(seq_len, max_len=None): + if isinstance(seq_len, np.ndarray): + assert ( + len(np.shape(seq_len)) == 1 + ), f"seq_len can only have one dimension, got {len(np.shape(seq_len))}." + if max_len is None: + max_len = int(seq_len.max()) + broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1)) + mask = broad_cast_seq_len < seq_len.reshape(-1, 1) + + elif isinstance(seq_len, torch.Tensor): + assert ( + seq_len.dim() == 1 + ), f"seq_len can only have one dimension, got {seq_len.dim() == 1}." 
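+        # Broadcast arange(max_len) against seq_len to get a boolean mask of
+        # shape (batch_size, max_len): entry (i, j) is True iff j < seq_len[i].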
+ batch_size = seq_len.size(0) + if max_len is None: + max_len = seq_len.max().long() + broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len) + mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1)) + else: + raise TypeError("Only support 1-d numpy.ndarray or 1-d torch.Tensor.") + + return mask + + +class CWSModel(nn.Module): + def __init__(self, encoder, src_embed, position, d_model, tag_size, crf=None): + super(CWSModel, self).__init__() + self.encoder = encoder + self.src_embed = src_embed + self.pos = copy.deepcopy(position) + self.proj = nn.Linear(d_model, tag_size) + self.tag_size = tag_size + if crf is None: + self.crf = None + self.loss_f = nn.CrossEntropyLoss(reduction="mean", ignore_index=-100) + else: + print("crf") + trans = fastNLP.modules.decoder.crf.allowed_transitions( + crf, encoding_type="bmes" + ) + self.crf = ConditionalRandomField(tag_size, allowed_transitions=trans) + # self.norm=nn.LayerNorm(d_model) + + def forward(self, task, uni, seq_len, bi1=None, bi2=None, tags=None): + # mask=fastNLP.core.utils.seq_len_to_mask(seq_len,uni.size(1)) # for dev 0.5.1 + mask = seq_len_to_mask(seq_len, uni.size(1)) + out = self.src_embed(task, uni, bi1, bi2) + out = self.pos(out) + # out=self.norm(out) + out = self.proj(self.encoder(out, mask.float())) + + if self.crf is not None: + if tags is not None: + out = self.crf(out, tags, mask) + return {"loss": out} + else: + out, _ = self.crf.viterbi_decode(out, mask) + return {"pred": out} + else: + if tags is not None: + out = out.contiguous().view(-1, self.tag_size) + tags = tags.data.masked_fill_(mask == 0, -100).view(-1) + loss = self.loss_f(out, tags) + return {"loss": loss} + else: + out = torch.argmax(out, dim=-1) + return {"pred": out} + + +def make_CWS( + N=6, + d_model=256, + d_ff=1024, + h=4, + dropout=0.2, + tag_size=4, + task_size=8, + bigram_embedding=None, + word_embedding=None, + word_size=None, + crf=None, + freeze=True, +): + c = copy.deepcopy + # encoder=TransformerEncoder(num_layers=N,model_size=d_model,inner_size=d_ff,key_size=d_model//h,value_size=d_model//h,num_head=h,dropout=dropout) + encoder = transformer.make_encoder( + N=N, d_model=d_model, h=h, dropout=dropout, d_ff=d_ff + ) + + position = PositionalEncoding(d_model, dropout) + + embed = Embedding( + task_size, d_model, word_embedding, bigram_embedding, word_size, freeze + ) + model = CWSModel(encoder, embed, position, d_model, tag_size, crf=crf) + + for p in model.parameters(): + if p.dim() > 1 and p.requires_grad: + nn.init.xavier_uniform_(p) + + return model diff --git a/reproduction/multi-criteria-cws/optm.py b/reproduction/multi-criteria-cws/optm.py new file mode 100644 index 00000000..a2b68de5 --- /dev/null +++ b/reproduction/multi-criteria-cws/optm.py @@ -0,0 +1,49 @@ +import torch +import torch.optim as optim + + +class NoamOpt: + "Optim wrapper that implements rate." 
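+    # Noam schedule from "Attention Is All You Need":
+    #   lr = factor * d_model**(-0.5) * min(step**(-0.5), step * warmup**(-1.5))
+    # i.e. a linear warm-up for `warmup` steps followed by inverse-square-root
+    # decay; see rate() below.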
+ + def __init__(self, model_size, factor, warmup, optimizer): + self.optimizer = optimizer + self._step = 0 + self.warmup = warmup + self.factor = factor + self.model_size = model_size + self._rate = 0 + + def step(self): + "Update parameters and rate" + self._step += 1 + rate = self.rate() + for p in self.optimizer.param_groups: + p["lr"] = rate + self._rate = rate + self.optimizer.step() + + def rate(self, step=None): + "Implement `lrate` above" + if step is None: + step = self._step + lr = self.factor * ( + self.model_size ** (-0.5) + * min(step ** (-0.5), step * self.warmup ** (-1.5)) + ) + # if step>self.warmup: lr = max(1e-4,lr) + return lr + + +def get_std_opt(model): + return NoamOpt( + model.src_embed[0].d_model, + 2, + 4000, + torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=0, + betas=(0.9, 0.98), + eps=1e-9, + ), + ) + diff --git a/reproduction/multi-criteria-cws/train.py b/reproduction/multi-criteria-cws/train.py new file mode 100644 index 00000000..fce914a1 --- /dev/null +++ b/reproduction/multi-criteria-cws/train.py @@ -0,0 +1,138 @@ +from fastNLP import (Trainer, Tester, Callback, GradientClipCallback, LRScheduler, SpanFPreRecMetric) +import torch +import torch.cuda +from torch.optim import Adam, SGD +from argparse import ArgumentParser +import logging +from .utils import set_seed + + +class LoggingCallback(Callback): + def __init__(self, filepath=None): + super().__init__() + # create file handler and set level to debug + if filepath is not None: + file_handler = logging.FileHandler(filepath, "a") + else: + file_handler = logging.StreamHandler() + + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter( + logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S')) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + logger.addHandler(file_handler) + self.log_writer = logger + + def on_backward_begin(self, loss): + if self.step % self.trainer.print_every == 0: + self.log_writer.info( + 'Step/Epoch {}/{}: Loss {}'.format(self.step, self.epoch, loss.item())) + + def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): + self.log_writer.info( + 'Step/Epoch {}/{}: Eval result {}'.format(self.step, self.epoch, eval_result)) + + def on_backward_end(self): + pass + + +def main(): + parser = ArgumentParser() + register_args(parser) + args = parser.parse_known_args()[0] + + set_seed(args.seed) + if args.train: + train(args) + if args.eval: + evaluate(args) + +def get_optim(args): + name = args.optim.strip().split(' ')[0].lower() + p = args.optim.strip() + l = p.find('(') + r = p.find(')') + optim_args = eval('dict({})'.format(p[[l+1,r]])) + if name == 'sgd': + return SGD(**optim_args) + elif name == 'adam': + return Adam(**optim_args) + else: + raise ValueError(args.optim) + +def load_model_from_path(args): + pass + +def train(args): + data = get_data(args) + train_data = data['train'] + dev_data = data['dev'] + model = get_model(args) + optimizer = get_optim(args) + device = 'cuda' if torch.cuda.is_available() else 'cpu' + callbacks = [] + trainer = Trainer( + train_data=train_data, + model=model, + optimizer=optimizer, + loss=None, + batch_size=args.batch_size, + n_epochs=args.epochs, + num_workers=4, + metrics=SpanFPreRecMetric( + tag_vocab=data['tag_vocab'], encoding_type=data['encoding_type'], + ignore_labels=data['ignore_labels']), + metric_key='f1', + 
dev_data=dev_data, + save_path=args.save_path, + device=device, + callbacks=callbacks, + check_code_level=-1, + ) + + print(trainer.train()) + + + +def evaluate(args): + data = get_data(args) + test_data = data['test'] + model = load_model_from_path(args) + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + tester = Tester( + data=test_data, model=model, batch_size=args.batch_size, + num_workers=2, device=device, + metrics=SpanFPreRecMetric( + tag_vocab=data['tag_vocab'], encoding_type=data['encoding_type'], + ignore_labels=data['ignore_labels']), + ) + print(tester.test()) + +def register_args(parser): + parser.add_argument('--optim', type=str, default='adam (lr=2e-3, weight_decay=0.0)') + parser.add_argument('--batch_size', type=int, default=128) + parser.add_argument('--epochs', type=int, default=10) + parser.add_argument('--save_path', type=str, default=None) + parser.add_argument('--data_path', type=str, required=True) + parser.add_argument('--log_path', type=str, default=None) + parser.add_argument('--model_config', type=str, required=True) + parser.add_argument('--load_path', type=str, default=None) + parser.add_argument('--train', action='store_true', default=False) + parser.add_argument('--eval', action='store_true', default=False) + parser.add_argument('--seed', type=int, default=42, help='rng seed') + +def get_model(args): + pass + +def get_data(args): + return torch.load(args.data_path) + +if __name__ == '__main__': + main() diff --git a/reproduction/multi-criteria-cws/train.sh b/reproduction/multi-criteria-cws/train.sh new file mode 100644 index 00000000..aa47b8af --- /dev/null +++ b/reproduction/multi-criteria-cws/train.sh @@ -0,0 +1,26 @@ +export EXP_NAME=release04 +export NGPU=2 +export PORT=9988 +export CUDA_DEVICE_ORDER=PCI_BUS_ID +export CUDA_VISIBLE_DEVICES=$1 + +if [ -z "$DATA_DIR" ] +then + DATA_DIR="./data" +fi + +echo $CUDA_VISIBLE_DEVICES +cmd=" +python -m torch.distributed.launch --nproc_per_node=$NGPU --master_port $PORT\ + main.py \ + --word-embeddings cn-char-fastnlp-100d \ + --bigram-embeddings cn-bi-fastnlp-100d \ + --num-epochs 100 \ + --batch-size 256 \ + --seed 1234 \ + --task-name $EXP_NAME \ + --dataset $DATA_DIR \ + --freeze \ +" +echo $cmd +eval $cmd diff --git a/reproduction/multi-criteria-cws/transformer.py b/reproduction/multi-criteria-cws/transformer.py new file mode 100644 index 00000000..fc352e44 --- /dev/null +++ b/reproduction/multi-criteria-cws/transformer.py @@ -0,0 +1,152 @@ +import numpy as np +import torch +import torch.autograd as autograd +import torch.nn as nn +import torch.nn.functional as F +import math, copy, time +from torch.autograd import Variable + +# import matplotlib.pyplot as plt + + +def clones(module, N): + "Produce N identical layers." + return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) + + +def subsequent_mask(size): + "Mask out subsequent positions." 
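+    # np.triu with k=1 marks strictly upper-triangular ("future") positions;
+    # comparing with 0 flips it so position i may attend to positions j <= i.
+    # The CWS encoder below only applies a padding mask, so this helper
+    # appears to be unused in this reproduction.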
+ attn_shape = (1, size, size) + subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype("uint8") + return torch.from_numpy(subsequent_mask) == 0 + + +def attention(query, key, value, mask=None, dropout=None): + "Compute 'Scaled Dot Product Attention'" + d_k = query.size(-1) + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) + if mask is not None: + # print(scores.size(),mask.size()) # [bsz,1,1,len] + scores = scores.masked_fill(mask == 0, -1e9) + p_attn = F.softmax(scores, dim=-1) + if dropout is not None: + p_attn = dropout(p_attn) + return torch.matmul(p_attn, value), p_attn + + +class MultiHeadedAttention(nn.Module): + def __init__(self, h, d_model, dropout=0.1): + "Take in model size and number of heads." + super(MultiHeadedAttention, self).__init__() + assert d_model % h == 0 + # We assume d_v always equals d_k + self.d_k = d_model // h + self.h = h + self.linears = clones(nn.Linear(d_model, d_model), 4) + self.attn = None + self.dropout = nn.Dropout(p=dropout) + + def forward(self, query, key, value, mask=None): + "Implements Figure 2" + if mask is not None: + # Same mask applied to all h heads. + mask = mask.unsqueeze(1) + + nbatches = query.size(0) + + # 1) Do all the linear projections in batch from d_model => h x d_k + query, key, value = [ + l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + for l, x in zip(self.linears, (query, key, value)) + ] + + # 2) Apply attention on all the projected vectors in batch. + x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout) + + # 3) "Concat" using a view and apply a final linear. + x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) + return self.linears[-1](x) + + +class LayerNorm(nn.Module): + "Construct a layernorm module (See citation for details)." + + def __init__(self, features, eps=1e-6): + super(LayerNorm, self).__init__() + self.a_2 = nn.Parameter(torch.ones(features)) + self.b_2 = nn.Parameter(torch.zeros(features)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 + + +class PositionwiseFeedForward(nn.Module): + "Implements FFN equation." + + def __init__(self, d_model, d_ff, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + + +class SublayerConnection(nn.Module): + """ + A residual connection followed by a layer norm. + Note for code simplicity the norm is first as opposed to last. + """ + + def __init__(self, size, dropout): + super(SublayerConnection, self).__init__() + self.norm = LayerNorm(size) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, sublayer): + "Apply residual connection to any sublayer with the same size." + return x + self.dropout(sublayer(self.norm(x))) + + +class EncoderLayer(nn.Module): + "Encoder is made up of self-attn and feed forward (defined below)" + + def __init__(self, size, self_attn, feed_forward, dropout): + super(EncoderLayer, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.sublayer = clones(SublayerConnection(size, dropout), 2) + self.size = size + + def forward(self, x, mask): + "Follow Figure 1 (left) for connections." 
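+        # Two pre-norm residual blocks (see SublayerConnection above):
+        # multi-head self-attention first, then the position-wise feed-forward.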
+ x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) + return self.sublayer[1](x, self.feed_forward) + + +class Encoder(nn.Module): + "Core encoder is a stack of N layers" + + def __init__(self, layer, N): + super(Encoder, self).__init__() + self.layers = clones(layer, N) + self.norm = LayerNorm(layer.size) + + def forward(self, x, mask): + # print(x.size(),mask.size()) + "Pass the input (and mask) through each layer in turn." + mask = mask.byte().unsqueeze(-2) + for layer in self.layers: + x = layer(x, mask) + return self.norm(x) + + +def make_encoder(N=6, d_model=512, d_ff=2048, h=8, dropout=0.1): + c = copy.deepcopy + attn = MultiHeadedAttention(h, d_model) + ff = PositionwiseFeedForward(d_model, d_ff, dropout) + return Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N) diff --git a/reproduction/multi-criteria-cws/utils.py b/reproduction/multi-criteria-cws/utils.py new file mode 100644 index 00000000..aeb7e43c --- /dev/null +++ b/reproduction/multi-criteria-cws/utils.py @@ -0,0 +1,308 @@ +import numpy as np +import torch +import torch.cuda +import random +import os +import sys +import errno +import time +import codecs +import hashlib +import _pickle as pickle +import warnings +from fastNLP.io import EmbedLoader + +UNK_TAG = "" + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def bmes_to_words(chars, tags): + result = [] + if len(chars) == 0: + return result + word = chars[0] + + for c, t in zip(chars[1:], tags[1:]): + if t.upper() == "B" or t.upper() == "S": + result.append(word) + word = "" + word += c + if len(word) != 0: + result.append(word) + + return result + + +def bmes_to_index(tags): + result = [] + if len(tags) == 0: + return result + word = (0, 0) + + for i, t in enumerate(tags): + if i == 0: + word = (0, 0) + elif t.upper() == "B" or t.upper() == "S": + result.append(word) + word = (i, 0) + word = (word[0], word[1] + 1) + if word[1] != 0: + result.append(word) + return result + + +def get_bmes(sent): + x = [] + y = [] + for word in sent: + length = len(word) + tag = ["m"] * length if length > 1 else ["s"] * length + if length > 1: + tag[0] = "b" + tag[-1] = "e" + x += list(word) + y += tag + return x, y + + +class CWSEvaluator: + def __init__(self, i2t): + self.correct_preds = 0.0 + self.total_preds = 0.0 + self.total_correct = 0.0 + self.i2t = i2t + + def add_instance(self, pred_tags, gold_tags): + pred_tags = [self.i2t[i] for i in pred_tags] + gold_tags = [self.i2t[i] for i in gold_tags] + # Evaluate PRF + lab_gold_chunks = set(bmes_to_index(gold_tags)) + lab_pred_chunks = set(bmes_to_index(pred_tags)) + self.correct_preds += len(lab_gold_chunks & lab_pred_chunks) + self.total_preds += len(lab_pred_chunks) + self.total_correct += len(lab_gold_chunks) + + def result(self, percentage=True): + p = self.correct_preds / self.total_preds if self.correct_preds > 0 else 0 + r = self.correct_preds / self.total_correct if self.correct_preds > 0 else 0 + f1 = 2 * p * r / (p + r) if p + r > 0 else 0 + if percentage: + p *= 100 + r *= 100 + f1 *= 100 + return p, r, f1 + + +class CWS_OOV: + def __init__(self, dic): + self.dic = dic + self.recall = 0 + self.tot = 0 + + def update(self, gold_sent, pred_sent): + i = 0 + j = 0 + id = 0 + for w in gold_sent: + if w not in self.dic: + self.tot += 1 + while i + len(pred_sent[id]) <= j: + i += len(pred_sent[id]) + id += 1 + if ( + i == j + and len(pred_sent[id]) == len(w) + and w.find(pred_sent[id]) != -1 + ): + self.recall += 1 + j += 
len(w) + # print(gold_sent,pred_sent,self.tot) + + def oov(self, percentage=True): + ins = 1.0 * self.recall / self.tot + if percentage: + ins *= 100 + return ins + + +def get_processing_word( + vocab_words=None, vocab_chars=None, lowercase=False, chars=False +): + def f(word): + # 0. get chars of words + if vocab_chars is not None and chars: + char_ids = [] + for char in word: + # ignore chars out of vocabulary + if char in vocab_chars: + char_ids += [vocab_chars[char]] + + # 1. preprocess word + if lowercase: + word = word.lower() + if word.isdigit(): + word = "0" + + # 2. get id of word + if vocab_words is not None: + if word in vocab_words: + word = vocab_words[word] + else: + word = vocab_words[UNK_TAG] + + # 3. return tuple char ids, word id + if vocab_chars is not None and chars: + return char_ids, word + else: + return word + + return f + + +def append_tags(src, des, name, part, encode="utf-16"): + with open("{}/{}.txt".format(src, part), encoding=encode) as input, open( + "{}/{}.txt".format(des, part), "a", encoding=encode + ) as output: + for line in input: + line = line.strip() + if len(line) > 0: + output.write("<{}> {} ".format(name, line, name)) + output.write("\n") + + +def is_dataset_tag(word): + return len(word) > 2 and word[0] == "<" and word[-1] == ">" + + +def to_tag_strings(i2ts, tag_mapping, pos_separate_col=True): + senlen = len(tag_mapping) + key_value_strs = [] + + for j in range(senlen): + val = i2ts[tag_mapping[j]] + pos_str = val + key_value_strs.append(pos_str) + return key_value_strs + + +def to_id_list(w2i): + i2w = [None] * len(w2i) + for w, i in w2i.items(): + i2w[i] = w + return i2w + + +def make_sure_path_exists(path): + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def md5_for_file(fn): + md5 = hashlib.md5() + with open(fn, "rb") as f: + for chunk in iter(lambda: f.read(128 * md5.block_size), b""): + md5.update(chunk) + return md5.hexdigest() + + +def embedding_match_vocab( + vocab, + emb, + ori_vocab, + dtype=np.float32, + padding="", + unknown="", + normalize=True, + error="ignore", + init_method=None, +): + dim = emb.shape[-1] + matrix = np.random.randn(len(vocab), dim).astype(dtype) + hit_flags = np.zeros(len(vocab), dtype=bool) + + if init_method: + matrix = init_method(matrix) + for word, idx in ori_vocab.word2idx.items(): + try: + if word == padding and vocab.padding is not None: + word = vocab.padding + elif word == unknown and vocab.unknown is not None: + word = vocab.unknown + if word in vocab: + index = vocab.to_index(word) + matrix[index] = emb[idx] + hit_flags[index] = True + except Exception as e: + if error == "ignore": + warnings.warn("Error occurred at the {} line.".format(idx)) + else: + print("Error occurred at the {} line.".format(idx)) + raise e + + total_hits = np.sum(hit_flags) + print( + "Found {} out of {} words in the pre-training embedding.".format( + total_hits, len(vocab) + ) + ) + if init_method is None: + found_vectors = matrix[hit_flags] + if len(found_vectors) != 0: + mean = np.mean(found_vectors, axis=0, keepdims=True) + std = np.std(found_vectors, axis=0, keepdims=True) + unfound_vec_num = len(vocab) - total_hits + r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean + matrix[hit_flags == False] = r_vecs + + if normalize: + matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) + + return matrix + + +def embedding_load_with_cache(emb_file, cache_dir, vocab, **kwargs): + def match_cache(file, cache_dir): + md5 = md5_for_file(file) + 
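+        # Reuse any cached pickle whose name carries this md5 after the last
+        # "-"; otherwise return a fresh "<embedding-file>-<md5>.pkl" path and
+        # False so the caller regenerates the cache.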
cache_files = os.listdir(cache_dir) + for fn in cache_files: + if md5 in fn.split("-")[-1]: + return os.path.join(cache_dir, fn), True + return ( + "{}-{}.pkl".format(os.path.join(cache_dir, os.path.basename(file)), md5), + False, + ) + + def get_cache(file): + if not os.path.exists(file): + return None + with open(file, "rb") as f: + emb = pickle.load(f) + return emb + + os.makedirs(cache_dir, exist_ok=True) + cache_fn, match = match_cache(emb_file, cache_dir) + if not match: + print("cache missed, re-generating cache at {}".format(cache_fn)) + emb, ori_vocab = EmbedLoader.load_without_vocab( + emb_file, padding=None, unknown=None, normalize=False + ) + with open(cache_fn, "wb") as f: + pickle.dump((emb, ori_vocab), f) + + else: + print("cache matched at {}".format(cache_fn)) + + # use cache + print("loading embeddings ...") + emb = get_cache(cache_fn) + assert emb is not None + return embedding_match_vocab(vocab, emb[0], emb[1], **kwargs)
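+
+# Example usage (hypothetical paths and vocab): the first call parses the raw
+# embedding file and caches (matrix, vocab) keyed by the file's md5; later
+# calls with the same file reload the cache and only redo the vocab matching.
+#
+#   matrix = embedding_load_with_cache("embeds/uni.txt", "caches", uni_vocab,
+#                                      normalize=False)
+#   # matrix: np.ndarray of shape (len(uni_vocab), dim); rows for words not
+#   # found in the file are sampled from the mean/std of the found vectors.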