- """
- This is borrowed from FudanParser. Not stable. Do not use !!!
-
- """
- import numpy as np
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import torch.utils.data
- from torch import optim
- from torch.autograd import Function, Variable
- from torch.nn import Parameter
-
- from .utils import orthogonal
-
-
- class GroupNorm(nn.Module):
- def __init__(self, num_features, num_groups=20, eps=1e-5):
- super(GroupNorm, self).__init__()
- self.weight = nn.Parameter(torch.ones(1, num_features, 1))
- self.bias = nn.Parameter(torch.zeros(1, num_features, 1))
- self.num_groups = num_groups
- self.eps = eps
-
- def forward(self, x):
- N, C, H = x.size()
- G = self.num_groups
- assert C % G == 0
-
- x = x.view(N, G, -1)
- mean = x.mean(-1, keepdim=True)
- var = x.var(-1, keepdim=True)
-
- x = (x - mean) / (var + self.eps).sqrt()
- x = x.view(N, C, H)
- return x * self.weight + self.bias
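-
- # Usage sketch (shapes chosen for illustration only, assuming (N, C, L) inputs with C divisible by num_groups):
- #   gn = GroupNorm(num_features=40, num_groups=20)
- #   y = gn(torch.randn(8, 40, 50))  # -> (8, 40, 50)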
-
-
- class LayerNormalization(nn.Module):
- """ Layer normalization module """
-
- def __init__(self, d_hid, eps=1e-3):
- super(LayerNormalization, self).__init__()
-
- self.eps = eps
- self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
- self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)
-
- def forward(self, z):
- if z.size(1) == 1:
- return z
-
- mu = torch.mean(z, keepdim=True, dim=-1)
- sigma = torch.std(z, keepdim=True, dim=-1)
- ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps)
- ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out)
-
- return ln_out
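-
- # Usage sketch (hidden size chosen for illustration only):
- #   ln = LayerNormalization(d_hid=256)
- #   y = ln(torch.randn(8, 10, 256))  # normalizes over the last dimension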
-
-
- class OrthEmbedding(nn.Embedding):
- def __init__(self, *args, **kwargs):
- super(OrthEmbedding, self).__init__(*args, **kwargs)
-
- def reset_parameters(self):
- # nn.Embedding has no bias term, so only the weight is (re)initialized here.
- self.weight = orthogonal(self.weight)
-
-
- class BiLinear(nn.Module):
- def __init__(self, n_left, n_right, n_out, bias=True):
- """
- Args:
- n_left: size of left input
- n_right: size of right input
- n_out: size of output
- bias: If set to False, the layer will not learn an additive bias.
- Default: True
- """
- super(BiLinear, self).__init__()
- self.n_left = n_left
- self.n_right = n_right
- self.n_out = n_out
-
- self.U = Parameter(torch.Tensor(self.n_out, self.n_left, self.n_right))
- self.W_l = Parameter(torch.Tensor(self.n_out, self.n_left))
- self.W_r = Parameter(torch.Tensor(self.n_out, self.n_right))
-
- if bias:
- self.bias = Parameter(torch.Tensor(n_out))
- else:
- self.register_parameter('bias', None)
-
- self.reset_parameters()
-
- def reset_parameters(self):
- nn.init.xavier_uniform_(self.W_l)
- nn.init.xavier_uniform_(self.W_r)
- if self.bias is not None: nn.init.constant_(self.bias, 0.)
- nn.init.xavier_uniform_(self.U)
-
- def forward(self, input_left, input_right):
- """
- Args:
- input_left: Tensor
- the left input tensor with shape = [batch1, batch2, ..., left_features]
- input_right: Tensor
- the right input tensor with shape = [batch1, batch2, ..., right_features]
- Returns: Tensor
- the output tensor with shape = [batch1, batch2, ..., out_features]
- """
- left_size = input_left.size()
- right_size = input_right.size()
- assert left_size[:-1] == right_size[:-1], \
- "batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1])
- batch = int(np.prod(left_size[:-1]))
-
- # convert left and right input to matrices [batch, left_features], [batch, right_features]
- input_left = input_left.view(batch, self.n_left)
- input_right = input_right.view(batch, self.n_right)
-
- # output [batch, out_features]
- output = F.bilinear(input_left, input_right, self.U, self.bias)
- output = output + \
- F.linear(input_left, self.W_l, None) + \
- F.linear(input_right, self.W_r, None)
- # convert back to [batch1, batch2, ..., out_features]
- return output.view(left_size[:-1] + (self.n_out,))
-
- def __repr__(self):
- return self.__class__.__name__ + ' (' \
- + 'in1_features=' + str(self.n_left) \
- + ', in2_features=' + str(self.n_right) \
- + ', out_features=' + str(self.n_out) + ')'
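-
- # Usage sketch (feature sizes chosen for illustration only):
- #   bl = BiLinear(n_left=100, n_right=80, n_out=50)
- #   out = bl(torch.randn(4, 7, 100), torch.randn(4, 7, 80))  # -> (4, 7, 50)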
-
-
- class BiAffine(nn.Module):
- def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs):
- """
- Args:
- n_enc: int
- the dimension of the encoder input.
- n_dec: int
- the dimension of the decoder input.
- n_labels: int
- the number of labels of the crf layer
- biaffine: bool
- if apply bi-affine parameter.
- **kwargs:
- """
- super(BiAffine, self).__init__()
- self.n_enc = n_enc
- self.n_dec = n_dec
- self.num_labels = n_labels
- self.biaffine = biaffine
-
- self.W_d = Parameter(torch.Tensor(self.num_labels, self.n_dec))
- self.W_e = Parameter(torch.Tensor(self.num_labels, self.n_enc))
- self.b = Parameter(torch.Tensor(self.num_labels, 1, 1))
- if self.biaffine:
- self.U = Parameter(torch.Tensor(self.num_labels, self.n_dec, self.n_enc))
- else:
- self.register_parameter('U', None)
-
- self.reset_parameters()
-
- def reset_parameters(self):
- nn.init.xavier_uniform_(self.W_d)
- nn.init.xavier_uniform_(self.W_e)
- nn.init.constant_(self.b, 0.)
- if self.biaffine:
- nn.init.xavier_uniform_(self.U)
-
- def forward(self, input_d, input_e, mask_d=None, mask_e=None):
- """
- Args:
- input_d: Tensor
- the decoder input tensor with shape = [batch, length_decoder, input_size]
- input_e: Tensor
- the encoder input tensor with shape = [batch, length_encoder, input_size]
- mask_d: Tensor or None
- the mask tensor for decoder with shape = [batch, length_decoder]
- mask_e: Tensor or None
- the mask tensor for encoder with shape = [batch, length_encoder]
- Returns: Tensor
- the energy tensor with shape = [batch, num_label, length_decoder, length_encoder]
- """
- assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are required to be equal.'
- batch, length_decoder, _ = input_d.size()
- _, length_encoder, _ = input_e.size()
-
- # compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder]
- # the output shape is [batch, num_label, length_decoder]
- out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3)
- # compute encoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
- # the output shape is [batch, num_label, length_encoder]
- out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2)
-
- # output shape [batch, num_label, length_decoder, length_encoder]
- if self.biaffine:
- # compute bi-affine part
- # [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder]
- # output shape [batch, num_label, length_decoder, input_size_encoder]
- output = torch.matmul(input_d.unsqueeze(1), self.U)
- # [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder]
- # output shape [batch, num_label, length_decoder, length_encoder]
- output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3))
-
- output = output + out_d + out_e + self.b
- else:
- output = out_d + out_e + self.b
-
- if mask_d is not None: output = output * mask_d.unsqueeze(1).unsqueeze(3)
- if mask_e is not None: output = output * mask_e.unsqueeze(1).unsqueeze(2)
-
- return output
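-
- # Usage sketch (dimensions chosen for illustration only):
- #   scorer = BiAffine(n_enc=128, n_dec=128, n_labels=3)
- #   scores = scorer(torch.randn(2, 5, 128), torch.randn(2, 7, 128))  # -> (2, 3, 5, 7)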
-
-
- class Transpose(nn.Module):
- def __init__(self, x, y):
- super(Transpose, self).__init__()
- self.x = x
- self.y = y
-
- def forward(self, x):
- return x.transpose(self.x, self.y)
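-
- # Usage sketch: Transpose(1, 2) swaps dims 1 and 2, e.g. (N, C, L) <-> (N, L, C), which is handy inside nn.Sequential.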
-
-
- class WordDropout(nn.Module):
- def __init__(self, dropout_rate, drop_to_token):
- super(WordDropout, self).__init__()
- self.dropout_rate = dropout_rate
- self.drop_to_token = drop_to_token
-
- def forward(self, word_idx):
- if not self.training:
- return word_idx
- # draw the dropout mask on the same device as the input indices
- drop_mask = (torch.rand(word_idx.shape, device=word_idx.device) < self.dropout_rate).long()
- output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx
- return output
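-
- # Usage sketch (the replacement token id, e.g. for <unk>, is an assumed value):
- #   wd = WordDropout(dropout_rate=0.1, drop_to_token=1)
- #   noisy = wd(word_idx)  # in training mode, roughly 10% of the ids become 1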
-
-
- class WlossLayer(torch.nn.Module):
- def __init__(self, lam=100, sinkhorn_iter=50):
- super(WlossLayer, self).__init__()
-
- # cost = matrix M = distance matrix
- # lam = lambda of type float > 0
- # sinkhorn_iter > 0
- # diagonal cost should be 0
- self.lam = lam
- self.sinkhorn_iter = sinkhorn_iter
- # self.register_buffer("K", torch.exp(-self.cost / self.lam).double())
- # self.register_buffer("KM", (self.cost * self.K).double())
-
- def forward(self, pred, target, cost):
- return WassersteinLossStab.apply(pred, target,
- cost, self.lam, self.sinkhorn_iter)
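-
- # Usage sketch: the cost matrix is supplied at call time, and the layer returns the
- # pair (loss, grad_input) produced by WassersteinLossStab below:
- #   loss, _ = WlossLayer(lam=100, sinkhorn_iter=50)(pred, target, cost)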
-
-
- class WassersteinLossStab(Function):
- @staticmethod
- def forward(ctx, pred, target, cost, lam=1e-3, sinkhorn_iter=4):
- """pred: Batch * K: K = # mass points
- target: Batch * L: L = # mass points"""
- # import pdb
- # pdb.set_trace()
- eps = 1e-8
-
- # pred = pred.gather(dim=1, index=)
- na = pred.size(1)
- nb = target.size(1)
-
- cost = cost.double()
- pred = pred.double()
- target = target.double()
-
- cost = cost[:na, :nb].double()
- K = torch.exp(-cost / lam).double()
- KM = (cost * K).double()
-
- batch_size = pred.size(0)
-
- # pdb.set_trace()
- log_a, log_b = torch.log(pred + eps), torch.log(target + eps)
- log_u = cost.new(batch_size, na).fill_(-np.log(na))
- log_v = cost.new(batch_size, nb).fill_(-np.log(nb))
- # import pdb
- # pdb.set_trace()
- for i in range(int(sinkhorn_iter)):
- log_u_max = torch.max(log_u, dim=1)[0]
- u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps)
- log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1)
- log_v_max = torch.max(log_v, dim=1)[0]
- v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
- tmp = log_u
- log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1)
- # print(log_u.sum())
- if torch.norm(tmp - log_u) / torch.norm(log_u) < eps:
- break
-
- log_v_max = torch.max(log_v, dim=1)[0]
- v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
- logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1)
- wnorm = torch.exp(log_u + logcostpart1).mean(0).sum() # sum(1) for per item pair loss...
- grad_input = log_u * lam
- # print("log_u", log_u)
- grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
- grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
- grad_input = grad_input / batch_size
-
- ctx.save_for_backward(grad_input)
- # print("grad type", type(grad_input))
-
- return pred.new((wnorm,)), grad_input
-
- @staticmethod
- def backward(ctx, grad_output, _):
- grad_input, = ctx.saved_tensors
- res = grad_output.clone()
- res.data.resize_(grad_input.size()).copy_(grad_input.data)
- res = res.mul_(grad_output[0]).float()
- # one gradient per forward input: pred, target, cost, lam, sinkhorn_iter
- return res, None, None, None, None
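-
- # Note: forward() runs the Sinkhorn iterations in the log domain (log_u / log_v with
- # max-subtraction) to avoid under- and overflow for small lam; the centered log_u also
- # gives the gradient w.r.t. pred, which is cached via save_for_backward().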
-
-
- class Sinkhorn(Function):
- @staticmethod
- def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stop):
- # the stabilized Sinkhorn iterations below work on numpy arrays
- a = a.double().cpu().numpy()
- b = b.double().cpu().numpy()
- M = M.double().cpu().numpy()
-
- # number of histograms if b is 2-D, otherwise 0
- nbb = b.shape[1] if b.ndim > 1 else 0
-
- # init data
- na = len(a)
- nb = len(b)
-
- cpt = 0
-
- # we assume that no distances are null except those of the diagonal of
- # distances
- if warmstart is None:
- alpha, beta = np.zeros(na), np.zeros(nb)
- else:
- alpha, beta = warmstart
-
- if nbb:
- u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
- else:
- u, v = np.ones(na) / na, np.ones(nb) / nb
-
- def get_K(alpha, beta):
- """log space computation"""
- return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg)
-
- def get_Gamma(alpha, beta, u, v):
- """log space gamma computation"""
- return np.exp(
- -(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log(
- v.reshape((1, nb))))
-
- # print(np.min(K))
-
- K = get_K(alpha, beta)
- transp = K
- cpt = 0
- err = 1
- loop = True
- # assumed defaults for print_period / verbose / log (not defined elsewhere in this module)
- print_period = 10
- verbose = False
- log = False
- while loop:
-
- uprev = u
- vprev = v
-
- # sinkhorn update
- v = b / (np.dot(K.T, u) + 1e-16)
- u = a / (np.dot(K, v) + 1e-16)
-
- # remove numerical problems and store them in K
- if np.abs(u).max() > tau or np.abs(v).max() > tau:
- if nbb:
- alpha, beta = alpha + reg * \
- np.max(np.log(u), 1), beta + reg * np.max(np.log(v))
- else:
- alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v)
- if nbb:
- u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
- else:
- u, v = np.ones(na) / na, np.ones(nb) / nb
- K = get_K(alpha, beta)
-
- if cpt % print_period == 0:
- # checking the error only every few iterations speeds things up
- if nbb:
- err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \
- np.sum((v - vprev) ** 2) / np.sum((v) ** 2)
- else:
- transp = get_Gamma(alpha, beta, u, v)
- err = np.linalg.norm((np.sum(transp, axis=0) - b)) ** 2
- if log:
- log['err'].append(err)
-
- if verbose:
- if cpt % (print_period * 20) == 0:
- print(
- '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
- print('{:5d}|{:8e}|'.format(cpt, err))
-
- if err <= stop:
- loop = False
-
- if cpt >= numItermax:
- loop = False
-
- if np.any(np.isnan(u)) or np.any(np.isnan(v)):
- # we have reached the machine precision
- # come back to previous solution and quit loop
- print('Warning: numerical errors at iteration', cpt)
- u = uprev
- v = vprev
- break
-
- cpt = cpt + 1
-
- # print('err=',err,' cpt=',cpt)
- if log:
- log['logu'] = alpha / reg + np.log(u)
- log['logv'] = beta / reg + np.log(v)
- log['alpha'] = alpha + reg * np.log(u)
- log['beta'] = beta + reg * np.log(v)
- log['warmstart'] = (log['alpha'], log['beta'])
- if nbb:
- res = np.zeros((nbb))
- for i in range(nbb):
- res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
- return res, log
-
- else:
- return get_Gamma(alpha, beta, u, v), log
- else:
- if nbb:
- res = np.zeros((nbb))
- for i in range(nbb):
- res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
- return res
- else:
- return get_Gamma(alpha, beta, u, v)
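-
- # Usage sketch (values chosen for illustration only; no backward() is defined, so this
- # Function is not differentiable and is called directly here):
- #   a, b = torch.tensor([0.5, 0.5]), torch.tensor([0.5, 0.5])
- #   M = torch.tensor([[0., 1.], [1., 0.]])
- #   gamma = Sinkhorn.forward(None, a, b, M, reg=1e-1, tau=1e3, warmstart=None, numItermax=100, stop=1e-9)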
-
-
- if __name__ == "__main__":
- cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))) # .cuda()
- mylayer = WlossLayer()  # .cuda()
- inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True) # .cuda()
- ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])) # .cuda()
-
- res, _ = mylayer(inp, ground_true, cost)
- # print(inp.requires_grad, res.requires_grad)
- # print(res, inp)
- mylayer.zero_grad()
- res.backward()
- print("inp's gradient is good:")
- print(inp.grad)
-
- print("convert to gpu:\n", inp.cuda().grad)
- print("=============================================="
- "\n However, this does not work on pytorch when GPU is enabled")
-
- cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda()
- mylayer = WlossLayer().cuda()
- inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda()
- ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda()
-
- # WlossLayer has no learnable parameters, so building an optimizer over
- # mylayer.parameters() would raise an empty-parameter-list error; it is omitted here.
-
- res, _ = mylayer(inp, ground_true, cost)
- # print(inp.requires_grad, res.requires_grad)
- # print(res, inp)
- mylayer.zero_grad()
- res.backward()
- print("input's gradient is None!!!!!!!!!!!!!!!!")
- print(inp.grad)