@classmethod
def read_csv(cls, csv_path, headers=None, sep='\t'):
    """Build an instance of ``cls`` from a delimiter-separated text file.

    Every column of the file becomes one entry of a dict that maps the
    column header to the list of (string) values found in that column; the
    dict is then handed to ``cls``.

    :param csv_path: path of the file to read.
    :param headers: list of column names. When ``None`` the first line of
        the file is consumed and split on ``sep`` to obtain the names.
    :param sep: field separator, a tab by default.
    :return: ``cls(column_dict)``.
    :raises AssertionError: if ``headers`` is given but is not a list, or if
        a line does not contain exactly ``len(headers)`` fields.
    """
    with open(csv_path, 'r') as f:
        start_idx = 0
        if headers is None:
            # Drop the line terminator first, otherwise the last header
            # would be stored as e.g. "name\n".
            headers = f.readline().rstrip('\r\n').split(sep)
            start_idx += 1
        else:
            assert isinstance(headers, list), "headers should be list, not {}.".format(type(headers))
        columns = {col: [] for col in headers}
        for line_idx, line in enumerate(f, start_idx):
            # Same reasoning as for the header: strip only the newline so the
            # last field does not keep it, while empty trailing fields
            # (e.g. "a\t\n") are still preserved by the split.
            contents = line.rstrip('\r\n').split(sep)
            assert len(contents) == len(headers), "Line {} has {} parts, while header has {}."\
                .format(line_idx, len(contents), len(headers))
            for header, content in zip(headers, contents):
                columns[header].append(content)
    return cls(columns)