|
class ConllxDataLoader(object):
    """Read tab-separated, blank-line-delimited token files into parsing samples.

    Each non-comment line is one token whose fields are separated by tabs;
    the columns at index 1, 3, 6 and 7 are used as word, POS tag, head index
    and head tag respectively.
    """

    def load(self, path):
        """Load every well-formed sample from the file at ``path``.

        :param path: path of a UTF-8 encoded text file
        :return: list of ``(word_seq, pos_seq, head_seq, head_tag_seq)``
            tuples; samples rejected by :meth:`get_one` are left out
        """
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for raw_line in f:
                # Remove the line terminator before splitting; otherwise the
                # last column of every row keeps a trailing newline, which
                # corrupts the head tag when the file has exactly 8 columns.
                line = raw_line.rstrip('\r\n')
                if not line.strip():
                    # Blank (or whitespace-only) line: sentence boundary.
                    if sample:
                        datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    # Comment line, carries no token.
                    continue
                else:
                    sample.append(line.split('\t'))
            # The file may end without a final blank line.
            if sample:
                datalist.append(sample)

        converted = [self.get_one(rows) for rows in datalist]
        return [item for item in converted if item is not None]

    def get_one(self, sample):
        """Convert one sample from row-major token rows to column sequences.

        :param sample: list of token rows, each row a list of column strings
        :return: ``(word_seq, pos_seq, head_seq, head_tag_seq)`` where
            ``head_seq`` holds ints, or ``None`` when the sample is empty or
            any head tag is the placeholder ``'_'``
        :raises IndexError: if the rows have fewer than 8 columns
        :raises ValueError: if a head index is not an integer literal
        """
        # Transpose rows -> columns (zip stops at the shortest row).
        columns = list(map(list, zip(*sample)))
        if not columns:
            return None
        # A '_' head tag marks an unannotated token; report and skip the sample.
        if '_' in columns[7]:
            print('Error Sample {}'.format(columns))
            return None
        # word_seq, pos_seq, head_seq, head_tag_seq
        return columns[1], columns[3], list(map(int, columns[6])), columns[7]
-
-
class MyDataloader:
    """Read whitespace-separated, blank-line-delimited token files."""

    def load(self, data_path):
        """Read the UTF-8 file at ``data_path`` and parse its lines.

        :param data_path: path of the text file to read
        :return: the result of :meth:`parse` on the file's lines
        """
        with open(data_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        return self.parse(lines)

    def parse(self, lines):
        """Group token lines into samples and transpose each into columns.

        Every non-blank line is split on whitespace into one token row; a
        blank line ends the current sample. Each sample is returned
        column-wise, e.g. ``[[word], [pos], [head_index], [head_tag]]`` for
        four-column input. All values stay strings.

        :param lines: iterable of text lines
        :return: list of samples, each a list of column lists
        """
        data = []
        sample = []
        for line in lines:
            line = line.strip()
            if line:
                # Token line: always keep it, including the very last line
                # of an input that lacks a terminating blank line.
                sample.append(line.split())
            elif sample:
                # Blank line closes the sample; repeated blank lines would
                # otherwise produce empty samples, so skip those.
                data.append(list(map(list, zip(*sample))))
                sample = []
        # Flush the final sample when the input does not end with a blank line.
        if sample:
            data.append(list(map(list, zip(*sample))))
        return data
-
-
def add_seg_tag(data):
    """Expand word-level POS samples into character-level tagged samples.

    Each word is split into its characters. A single-character word gets the
    tag ``'S-' + pos``; a longer word gets ``'B-' + pos`` on its first
    character, ``'M-' + pos`` on every inner character and ``'E-' + pos`` on
    its last character.

    :param data: list of ([word], [pos], [heads], [head_tags]); only the
        first two fields of each sample are used
    :return: list of [[char], [tag]], one entry per input sample
    """
    processed = []
    for word_list, pos_list, *_ in data:
        chars = []
        tags = []
        for word, pos in zip(word_list, pos_list):
            if not word:
                # An empty word has no characters to tag.
                continue
            if len(word) == 1:
                chars.append(word)
                tags.append('S-' + pos)
                continue
            chars.append(word[0])
            tags.append('B-' + pos)
            for c in word[1:-1]:
                chars.append(c)
                tags.append('M-' + pos)
            chars.append(word[-1])
            tags.append('E-' + pos)
        # Build the pair directly so a sample without words still yields
        # two (empty) lists rather than collapsing to a single empty list.
        processed.append([chars, tags])
    return processed
|