import numpy as np


class BaseModel(object):
    """The base class of all models.

    This class and its subclasses are "wrappers" around PyTorch models: they
    act as an interface between the Trainer and the underlying deep learning
    networks. The interface provides the following methods, which are called
    by the Trainer:
        - prepare_input
        - mode
        - define_optimizer
        - data_forward
        - grad_backward
        - get_loss
    """

    def __init__(self):
        pass

    def prepare_input(self, data):
        """Transform raw inputs into vector/matrix inputs.

        :param data: raw inputs
        :return (X, Y): tuple of input features and labels
        """
        raise NotImplementedError

    def mode(self, test=False):
        """Switch the network between training and evaluation mode, as required by PyTorch.

        :param test: bool, True for evaluation/test mode, False for training mode
        """
        raise NotImplementedError

    def define_optimizer(self):
        """Define the PyTorch optimizer used by the model."""
        raise NotImplementedError

    def data_forward(self, *x):
        """Run the forward pass over the data.

        :param x: input feature matrix (and, for some models, the label vector)
        :return: the model output
        """
        # required by PyTorch nn
        raise NotImplementedError

    def grad_backward(self):
        """Perform gradient descent to update the model parameters."""
        raise NotImplementedError

    def get_loss(self, pred, truth):
        """Compute the loss given the model prediction and the ground truth.
        The loss function is specified by the model.

        :param pred: predicted label vector
        :param truth: ground-truth label vector
        :return: a scalar loss value
        """
        raise NotImplementedError
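

# Illustrative sketch only: the Trainer is defined elsewhere, so the helper
# below is a hypothetical stand-in that shows the calling order implied by the
# interface above (define_optimizer -> mode -> prepare_input -> data_forward ->
# get_loss -> grad_backward). Its name and arguments are not part of the real
# API of this module.
def _drive_model(model, data, n_epochs=1):
    model.define_optimizer()            # set up the optimizer once
    model.mode(test=False)              # put the network into training mode
    x, y = model.prepare_input(data)    # raw data -> (features, labels)
    loss = None
    for _ in range(n_epochs):
        pred = model.data_forward(x)    # forward pass
        loss = model.get_loss(pred, y)  # scalar loss
        model.grad_backward()           # back-propagate / update parameters
    return loss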


class ToyModel(BaseModel):
    """A toy numpy-based model, used only for code testing."""

    def __init__(self):
        super(ToyModel, self).__init__()
        self.test_mode = False
        self.weight = np.random.rand(5, 1)
        self.bias = np.random.rand()
        self._loss = 0

    def prepare_input(self, data):
        # The last column holds the labels; the remaining columns are features.
        return data[:, :-1], data[:, -1]

    def mode(self, test=False):
        self.test_mode = test

    def data_forward(self, x):
        return np.matmul(x, self.weight) + self.bias

    def grad_backward(self):
        print("loss gradient backward")

    def get_loss(self, pred, truth):
        # Flatten both arrays so the (N, 1) predictions are compared
        # element-wise with the (N,) labels instead of broadcasting to (N, N).
        self._loss = np.mean(np.square(np.ravel(pred) - np.ravel(truth)))
        return self._loss

    def define_optimizer(self):
        pass
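

# A minimal, self-contained check of ToyModel on random data. The 5-feature
# width matches the (5, 1) weight initialised above; the row count and the
# single training step are arbitrary choices for illustration.
def _toy_model_example():
    model = ToyModel()
    data = np.random.rand(8, 6)         # 8 rows of 5 features + 1 label column
    x, y = model.prepare_input(data)    # x: (8, 5), y: (8,)
    model.mode(test=False)
    pred = model.data_forward(x)        # (8, 1) linear outputs
    loss = model.get_loss(pred, y)      # mean squared error
    model.grad_backward()               # toy stand-in for a real update
    return loss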


class Vocabulary(object):
    """A look-up table that allows you to access `Lexeme` objects. The
    `Vocabulary` instance also provides access to the `StringStore`, and owns
    underlying data that is shared between `Doc` objects.
    """

    def __init__(self):
        """Create the vocabulary.

        RETURNS (Vocabulary): The newly constructed object.
        """
        self.data_frame = None


class Document(object):
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, and losslessly serialize to compressed binary
    strings. The `Doc` object holds an array of `Token` objects. The
    Python-level `Token` and `Span` objects are views of this array, i.e.
    they don't own the data themselves. -- spacy
    """

    def __init__(self, vocab, words=None, spaces=None):
        """Create a Document object.

        vocab (Vocabulary): A vocabulary object, which must match any models you
            want to use (e.g. tokenizer, parser, entity recognizer).
        words (list or None): A list of unicode strings to add to the document
            as words. If `None`, defaults to an empty list.
        spaces (list or None): A list of boolean values of the same length as
            words. True means the word is followed by a space, False means
            it is not. If `None`, defaults to `[True] * len(words)`.
        RETURNS (Document): The newly constructed object.
        """
        self.vocab = vocab
        self.words = words if words is not None else []
        self.spaces = spaces
        if spaces is None:
            self.spaces = [True] * len(self.words)
        elif len(spaces) != len(self.words):
            raise ValueError("mismatch between spaces and words")

    def get_chunker(self, vocab):
        return None

    def push_back(self, vocab):
        pass
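

# Hypothetical usage sketch for Document: `spaces` defaults to all True when
# omitted, and a length mismatch between `spaces` and `words` raises
# ValueError. The example words are arbitrary.
def _document_example():
    vocab = Vocabulary()
    doc = Document(vocab, words=["Hello", "world", "!"])
    assert doc.spaces == [True, True, True]    # default filled in by __init__
    try:
        Document(vocab, words=["Hello", "world"], spaces=[True])
    except ValueError:
        pass                                   # lengths must match
    return doc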


class Token(object):
    """An individual token, i.e. a word, punctuation symbol, whitespace, etc."""

    def __init__(self, vocab, doc, offset):
        """Construct a `Token` object.

        vocab (Vocabulary): A storage container for lexical types.
        doc (Document): The parent document.
        offset (int): The index of the token within the document.
        """
        self.vocab = vocab
        self.doc = doc
        # Document does not define __getitem__, so read its word list directly.
        self.token = doc.words[offset]
        self.i = offset
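

# Hypothetical usage sketch: Token objects are views into a Document, each
# recording the parent document, the word at `offset`, and the offset itself.
# The example words are arbitrary.
def _token_example():
    vocab = Vocabulary()
    doc = Document(vocab, words=["A", "toy", "sentence"])
    tokens = [Token(vocab, doc, i) for i in range(len(doc.words))]
    assert tokens[1].token == "toy"
    assert tokens[2].i == 2
    return tokens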