
Merge branch 'dataset' of github.com:yhcc/fastNLP into dataset

tags/v0.2.0
yh_cc 5 years ago
commit 10bb2810ab
8 changed files with 85 additions and 4 deletions

  1. +5   -0  fastNLP/api/api.py
  2. +36  -0  fastNLP/api/parser.py
  3. +4   -1  fastNLP/api/pipeline.py
  4. +2   -1  fastNLP/api/pos_tagger.py
  5. +34  -0  fastNLP/api/processor.py
  6. +2   -0  fastNLP/core/dataset.py
  7. +1   -1  fastNLP/modules/decoder/CRF.py
  8. +1   -1  reproduction/Biaffine_parser/run.py

fastNLP/api/api.py  +5 -0

@@ -14,3 +14,8 @@ class API:
         _dict = torch.load(name)
         self.pipeline = _dict['pipeline']
         self.model = _dict['model']
+
+    def save(self, path):
+        _dict = {'pipeline': self.pipeline,
+                 'model': self.model}
+        torch.save(_dict, path)
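
Note: a minimal round-trip sketch of the new save() together with the existing load() above. The path and the bare API() instance are placeholders for illustration (assuming API can be instantiated directly, as the parser subclass below does via super().__init__()); in practice a subclass would hold a real Pipeline and model.

    import torch
    from fastNLP.api.api import API

    api = API()
    api.pipeline = None          # would normally be a fastNLP.api.pipeline.Pipeline
    api.model = None             # would normally be a trained model

    api.save('/tmp/api.pkl')     # torch.save({'pipeline': ..., 'model': ...}, path)
    api.load('/tmp/api.pkl')     # torch.load(...) restores self.pipeline and self.model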

fastNLP/api/parser.py  +36 -0

@@ -0,0 +1,36 @@
+from fastNLP.api.api import API
+from fastNLP.core.dataset import DataSet
+from fastNLP.core.predictor import Predictor
+from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import *
+from fastNLP.models.biaffine_parser import BiaffineParser
+
+
+class DependencyParser(API):
+    def __init__(self):
+        super(DependencyParser, self).__init__()
+
+    def predict(self, data):
+        self.load('xxx')
+
+        dataset = DataSet()
+        dataset = self.pipeline.process(dataset)
+
+        pred = Predictor()
+        res = pred.predict(self.model, dataset)
+
+        return res
+
+    def build(self):
+        pipe = Pipeline()
+
+        # build pipeline
+        word_seq = 'word_seq'
+        pos_seq = 'pos_seq'
+        pipe.add_processor(Num2TagProcessor('<NUM>', 'raw_sentence', word_seq))
+        pipe.add_processor(IndexerProcessor(word_vocab, word_seq, word_seq+'_idx'))
+        pipe.add_processor(IndexerProcessor(pos_vocab, pos_seq, pos_seq+'_idx'))
+
+        # load model parameters
+        self.model = BiaffineParser()
+        self.pipeline = pipe
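
Note: a sketch of how DependencyParser is apparently meant to be driven. It does not run as committed: word_vocab and pos_vocab are undefined inside build(), self.load('xxx') points at a placeholder path, and predict() currently ignores its data argument and processes an empty DataSet.

    from fastNLP.api.parser import DependencyParser

    parser = DependencyParser()
    parser.build()               # assembles the Pipeline and a fresh BiaffineParser
    res = parser.predict(None)   # load weights, run the pipeline, then Predictor.predict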

fastNLP/api/pipeline.py  +4 -1

@@ -19,4 +19,7 @@ class Pipeline:
         return dataset
 
     def __call__(self, *args, **kwargs):
-        return self.process(*args, **kwargs)
+        return self.process(*args, **kwargs)
+
+    def __getitem__(self, item):
+        return self.pipeline[item]
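
Note: a quick sketch of the Pipeline additions. Processor names and constructor calls are taken from parser.py in this commit; the DataSet is left empty only to keep the snippet self-contained.

    from fastNLP.core.dataset import DataSet
    from fastNLP.api.pipeline import Pipeline
    from fastNLP.api.processor import Num2TagProcessor

    pipe = Pipeline()
    pipe.add_processor(Num2TagProcessor('<NUM>', 'raw_sentence', 'word_seq'))

    dataset = pipe(DataSet())    # __call__ still delegates to process()
    num2tag = pipe[0]            # __getitem__ now exposes the stored processors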

fastNLP/api/pos_tagger.py  +2 -1

@@ -5,9 +5,10 @@ import numpy as np
 from fastNLP.core.dataset import DataSet
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.core.predictor import Predictor
+from fastNLP.api.api import API
 
 
-class POS_tagger:
+class POS_tagger(API):
     def __init__(self):
         pass



fastNLP/api/processor.py  +34 -0

@@ -2,6 +2,8 @@
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.vocabulary import Vocabulary
 
+import re
+
 class Processor:
     def __init__(self, field_name, new_added_field_name):
         self.field_name = field_name
@@ -64,6 +66,7 @@ class FullSpaceToHalfSpaceProcessor(Processor):
         if self.change_space:
             FHs += FH_SPACE
         self.convert_map = {k: v for k, v in FHs}
+
     def process(self, dataset):
         assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
         for ins in dataset:
@@ -77,6 +80,37 @@ class FullSpaceToHalfSpaceProcessor(Processor):
         return dataset
 
 
+class MapFieldProcessor(Processor):
+    def __init__(self, func, field_name, new_added_field_name=None):
+        super(MapFieldProcessor, self).__init__(field_name, new_added_field_name)
+        self.func = func
+
+    def process(self, dataset):
+        for ins in dataset:
+            s = ins[self.field_name]
+            new_s = self.func(s)
+            ins[self.new_added_field_name] = new_s
+        return dataset
+
+
+class Num2TagProcessor(Processor):
+    def __init__(self, tag, field_name, new_added_field_name=None):
+        super(Num2TagProcessor, self).__init__(field_name, new_added_field_name)
+        self.tag = tag
+        self.pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)'
+
+    def process(self, dataset):
+        for ins in dataset:
+            s = ins[self.field_name]
+            new_s = [None] * len(s)
+            for i, w in enumerate(s):
+                if re.search(self.pattern, w) is not None:
+                    w = self.tag
+                new_s[i] = w
+            ins[self.new_added_field_name] = new_s
+        return dataset
+
+
 class IndexerProcessor(Processor):
     def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False):
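
Note: a self-contained check of the number pattern used by Num2TagProcessor above. The token list is made up for illustration; the processor applies the same re.search per token and substitutes the tag.

    import re

    pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)'
    tokens = ['price', 'rose', '3.5', 'percent', 'in', '2018']
    tagged = ['<NUM>' if re.search(pattern, w) else w for w in tokens]
    # tagged == ['price', 'rose', '<NUM>', 'percent', 'in', '<NUM>']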



fastNLP/core/dataset.py  +2 -0

@@ -86,6 +86,8 @@ class DataSet(object):
         return self.field_arrays[name]
 
     def __len__(self):
+        if len(self.field_arrays) == 0:
+            return 0
         field = iter(self.field_arrays.values()).__next__()
         return len(field)
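
Note: the guard makes len() of a freshly constructed DataSet return 0 instead of raising StopIteration from iter(...).__next__() on the empty field dict. A minimal check (assuming DataSet() can be built empty, as parser.py above does):

    from fastNLP.core.dataset import DataSet

    assert len(DataSet()) == 0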



fastNLP/modules/decoder/CRF.py  +1 -1

@@ -89,7 +89,7 @@ class ConditionalRandomField(nn.Module):
         score = score.sum(0) + emit_score[-1]
         if self.include_start_end_trans:
             st_scores = self.start_scores.view(1, -1).repeat(batch_size, 1)[batch_idx, tags[0]]
-            last_idx = masks.long().sum(0)
+            last_idx = mask.long().sum(0)
             ed_scores = self.end_scores.view(1, -1).repeat(batch_size, 1)[batch_idx, tags[last_idx, batch_idx]]
             score += st_scores + ed_scores
         # return [B,]


reproduction/Biaffine_parser/run.py  +1 -1

@@ -352,7 +352,7 @@ if __name__ == "__main__":
     elif args.mode == 'test':
         test(args.path)
     elif args.mode == 'infer':
-        infer()
+        pass
     else:
         print('no mode specified for model!')
         parser.print_help()
