|
- r"""undocumented
- 此模块用于给其它模块提供读取文件的函数,没有为用户提供 API
- """
-
- __all__ = []
-
- import json
- import csv
-
- # from ..core import log
-
-
- def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
- r"""
- Construct a generator to read csv items.
-
- :param path: file path
- :param encoding: file's encoding, default: utf-8
- :param headers: file's headers, if None, make file's first line as headers. default: None
- :param sep: separator for each column. default: ','
- :param dropna: weather to ignore and drop invalid data,
- :if False, raise ValueError when reading invalid data. default: True
- :return: generator, every time yield (line number, csv item)
- """
- with open(path, 'r', encoding=encoding) as csv_file:
- f = csv.reader(csv_file, delimiter=sep)
- start_idx = 0
- if headers is None:
- headers = next(f)
- start_idx += 1
- elif not isinstance(headers, (list, tuple)):
- raise TypeError("headers should be list or tuple, not {}." \
- .format(type(headers)))
- for line_idx, line in enumerate(f, start_idx):
- contents = line
- if len(contents) != len(headers):
- if dropna:
- continue
- else:
- if "" in headers:
- raise ValueError(("Line {} has {} parts, while header has {} parts.\n" +
- "Please check the empty parts or unnecessary '{}'s in header.")
- .format(line_idx, len(contents), len(headers), sep))
- else:
- raise ValueError("Line {} has {} parts, while header has {} parts." \
- .format(line_idx, len(contents), len(headers)))
- _dict = {}
- for header, content in zip(headers, contents):
- _dict[header] = content
- yield line_idx, _dict
-
-
- def _read_json(path, encoding='utf-8', fields=None, dropna=True):
- r"""
- Construct a generator to read json items.
-
- :param path: file path
- :param encoding: file's encoding, default: utf-8
- :param fields: json object's fields that needed, if None, all fields are needed. default: None
- :param dropna: weather to ignore and drop invalid data,
- :if False, raise ValueError when reading invalid data. default: True
- :return: generator, every time yield (line number, json item)
- """
- if fields:
- fields = set(fields)
- with open(path, 'r', encoding=encoding) as f:
- for line_idx, line in enumerate(f):
- data = json.loads(line)
- if fields is None:
- yield line_idx, data
- continue
- _res = {}
- for k, v in data.items():
- if k in fields:
- _res[k] = v
- if len(_res) < len(fields):
- if dropna:
- continue
- else:
- raise ValueError('invalid instance at line: {}'.format(line_idx))
- yield line_idx, _res
-
-
- def _read_conll(path, encoding='utf-8',sep=None, indexes=None, dropna=True):
- r"""
- Construct a generator to read conll items.
-
- :param path: file path
- :param encoding: file's encoding, default: utf-8
- :param sep: seperator
- :param indexes: conll object's column indexes that needed, if None, all columns are needed. default: None
- :param dropna: weather to ignore and drop invalid data,
- :if False, raise ValueError when reading invalid data. default: True
- :return: generator, every time yield (line number, conll item)
- """
-
- def parse_conll(sample):
- sample = list(map(list, zip(*sample)))
- sample = [sample[i] for i in indexes]
- for f in sample:
- if len(f) <= 0:
- raise ValueError('empty field')
- return sample
-
- with open(path, 'r', encoding=encoding) as f:
- sample = []
- start = next(f).strip()
- if start != '':
- sample.append(start.split(sep)) if sep else sample.append(start.split())
- for line_idx, line in enumerate(f, 1):
- line = line.strip()
- if line == '':
- if len(sample):
- try:
- res = parse_conll(sample)
- sample = []
- yield line_idx, res
- except Exception as e:
- if dropna:
- logger.error('Invalid instance which ends at line: {} has been dropped.'.format(line_idx))
- sample = []
- continue
- raise ValueError('Invalid instance which ends at line: {}'.format(line_idx))
- elif line.startswith('#'):
- continue
- else:
- sample.append(line.split(sep)) if sep else sample.append(line.split())
- if len(sample) > 0:
- try:
- res = parse_conll(sample)
- yield line_idx, res
- except Exception as e:
- if dropna:
- return
- logger.error('invalid instance ends at line: {}'.format(line_idx))
- raise e
|