- import numpy as np
- import time
- import ctypes
- from hetu import cpu_links as cpu_op
- from hetu import ndarray
- from hetu.ndarray import numpyasdlarrayhandle
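-
- # These tests hand numpy buffers to the hetu kernels through
- # numpyasdlarrayhandle; the assertions below rely on the handle sharing
- # memory with the source array, so kernel writes land in the numpy array.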
-
-
- def save_to_file(data, file):
-     # Append a line of benchmark output to `file`.
-     with open(file, 'a+') as f:
-         f.write(data)
-
-
- # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
- ll = [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900,
- 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]
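-
- # `ll` holds the square-matrix side lengths swept by the benchmarks below;
- # the ruler above gives each entry's index.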
-
-
- def test_broadcast_to():
- for i in range(len(ll)):
- # ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- to_shape = (1000, ll[i], ll[i])
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- y = np.empty(to_shape, dtype=np.float32)
- arr_x = numpyasdlarrayhandle(x)
- arr_y = numpyasdlarrayhandle(y)
- # print(arr_x.asnumpy())
- start = time.time()
- for _ in range(10):
- cpu_op.broadcast_to(arr_x, arr_y)
- end = time.time()
- for _ in range(10):
- kkk = np.broadcast_to(x, to_shape)
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- np.testing.assert_allclose(kkk, y, rtol=1e-5)
-
-
- # test_broadcast_to()
-
- def test_reduce_sum_axis_zero():
-     shape = (2, 2, 2)
-     to_shape = (2, 2)
-     x = np.random.uniform(-1, 1, shape).astype(np.float32)
-     y = np.empty(to_shape, dtype=np.float32)
-     arr_x = numpyasdlarrayhandle(x)
-     arr_y = numpyasdlarrayhandle(y)
-     cpu_op.reduce_sum_axis_zero(arr_x, arr_y)
-     np_y = np.sum(x, axis=0)
-     np.testing.assert_allclose(np_y, y, rtol=1e-5)
-
- # test_reduce_sum_axis_zero()
-
-
- def test_average_pooling():
- ctx = ndarray.cpu(0)
-
-     def np_average_pooling(input, kernel_H, kernel_W, padding=0, stride=1):
-         # Naive reference average pooling; divides by the full kernel area
-         # even for windows clipped at the border.
-         N, C, H, W = input.shape
-         assert ((H + 2 * padding - kernel_H) % stride == 0)
-         assert ((W + 2 * padding - kernel_W) % stride == 0)
-         pooled_H = (H + 2 * padding - kernel_H) // stride + 1
-         pooled_W = (W + 2 * padding - kernel_W) // stride + 1
-         pooled_layer = np.zeros(
-             shape=(N, C, pooled_H, pooled_W), dtype=np.float32)
-         pooling_size = kernel_H * kernel_W
-         for n in range(N):
-             for c in range(C):
-                 for h in range(pooled_H):
-                     for w in range(pooled_W):
-                         hs = h * stride - padding
-                         ws = w * stride - padding
-                         hend = min(hs + kernel_H, H)
-                         wend = min(ws + kernel_W, W)
-                         hs = max(hs, 0)
-                         ws = max(ws, 0)
-                         for i in range(hs, hend):
-                             for j in range(ws, wend):
-                                 pooled_layer[n][c][h][w] += input[n][c][i][j]
-                         pooled_layer[n][c][h][w] /= pooling_size
-         return pooled_layer
-
-     def np_average_pooling_gradient(gradient_y, kernel_H, kernel_W, padding=0, stride=1):
-         # Scatter the (uniform) average-pooling gradient back to the input.
-         N, C, pooled_H, pooled_W = gradient_y.shape
-         H = (pooled_H - 1) * stride + kernel_H - 2 * padding
-         W = (pooled_W - 1) * stride + kernel_W - 2 * padding
-
-         gradient_x = np.zeros(shape=(N, C, H, W), dtype=np.float32)
-         pooling_size = kernel_H * kernel_W
-         for n in range(N):
-             for c in range(C):
-                 for h in range(pooled_H):
-                     for w in range(pooled_W):
-                         hs = h * stride - padding
-                         ws = w * stride - padding
-                         hend = min(hs + kernel_H, H)
-                         wend = min(ws + kernel_W, W)
-                         hs = max(hs, 0)
-                         ws = max(ws, 0)
-                         for i in range(hs, hend):
-                             for j in range(ws, wend):
-                                 gradient_x[n][c][i][j] += gradient_y[n][c][h][w] / \
-                                     pooling_size
-
-         return gradient_x
-
- shapeX = (100, 3, 28, 28)
- # (1,1,5,5)
- shapeY = (100, 3, 24, 24)
- # input : x , filter : f , output: y
- x = np.random.uniform(0, 10, size=shapeX).astype(np.float32)
- gradient_y = np.random.uniform(0, 10, size=shapeY).astype(np.float32)
-
- arr_x = numpyasdlarrayhandle(x)
- arr_gradient_y = numpyasdlarrayhandle(gradient_y)
- pool_layer = np.empty(shapeY, dtype=np.float32)
- gradient_x = np.empty(shapeX, dtype=np.float32)
- arr_pool_layer = numpyasdlarrayhandle(pool_layer)
- arr_gradient_x = numpyasdlarrayhandle(gradient_x)
-
- cpu_op.avg_pool(arr_x, 5, 5, arr_pool_layer)
- cpu_op.avg_pool_gradient(arr_gradient_y, 5, 5, arr_gradient_x)
-
- np_pool_layer = np_average_pooling(x, 5, 5)
- np_gradient_x = np_average_pooling_gradient(gradient_y, 5, 5)
-
- np.testing.assert_allclose(np_pool_layer, pool_layer, rtol=1e-5)
-
- np.testing.assert_allclose(np_gradient_x, gradient_x, rtol=1e-5)
- # print(arr_gradient_x.asnumpy())
- # print("asdasdf:",np_gradient_x)
-
-
- # test_average_pooling()
-
-
- def test_max_pooling():
- ctx = ndarray.cpu(0)
-
- def np_max_pooling(input, kernel_H, kernel_W, padding=0, stride=1):
- N, C, H, W = input.shape
- assert ((H + 2 * padding - kernel_H) % stride == 0)
- assert ((W + 2 * padding - kernel_W) % stride == 0)
-         pooled_H = (H + 2 * padding - kernel_H) // stride + 1
-         pooled_W = (W + 2 * padding - kernel_W) // stride + 1
-
- pooled_layer = np.zeros(
- shape=(N, C, pooled_H, pooled_W), dtype=np.float32)
- pooling_size = kernel_H * kernel_W
-
- for n in range(N):
- for c in range(C):
- for h in range(pooled_H):
- for w in range(pooled_W):
- hs = h * stride - padding
- ws = w * stride - padding
- hend = min(hs + kernel_H, H)
- wend = min(ws + kernel_W, W)
- hs = max(hs, 0)
- ws = max(ws, 0)
-
- hargmax = hs
- wargmax = ws
- for i in range(hs, hend):
- for j in range(ws, wend):
- if input[n][c][i][j] > input[n][c][hargmax][wargmax]:
- hargmax = i
- wargmax = j
- pooled_layer[n][c][h][w] = input[n][c][hargmax][wargmax]
-
- return pooled_layer
-
- def np_max_pooling_gradient(input, gradient_y, kernel_H, kernel_W, padding=0, stride=1):
- N, C, pooled_H, pooled_W = gradient_y.shape
- H = (pooled_H - 1) * stride + kernel_H - 2 * padding
- W = (pooled_W - 1) * stride + kernel_W - 2 * padding
- # print(N,C,H,W)
- gradient_x = np.zeros(shape=(N, C, H, W), dtype=np.float32)
- pooling_size = kernel_H * kernel_W
-
-         for n in range(N):
-             for c in range(C):
-                 for h in range(pooled_H):
-                     for w in range(pooled_W):
- hs = h * stride - padding
- ws = w * stride - padding
- hend = min(hs + kernel_H, H)
- wend = min(ws + kernel_W, W)
- hs = max(hs, 0)
- ws = max(ws, 0)
-
- hargmax = hs
- wargmax = ws
-                         for i in range(hs, hend):
-                             for j in range(ws, wend):
- # print(n,c,i,j)
- if input[n][c][i][j] > input[n][c][hargmax][wargmax]:
- hargmax = i
- wargmax = j
- gradient_x[n][c][hargmax][wargmax] += gradient_y[n][c][h][w]
-
- return gradient_x
-
- shapeX = (100, 3, 28, 28)
- shapeY = (100, 3, 14, 14)
- # shapeX=(1,1,2,2)
- # shapeY=(1,1,1,1)
- x = np.random.uniform(0, 10, size=shapeX).astype(np.float32)
- # x = np.arange(1,37).reshape(shapeX)
- # print(x)
- # x = np.ones(shapeX).astype(np.float32)
-     arr_x = numpyasdlarrayhandle(x)
-     pool_layer = np.empty(shapeY, dtype=np.float32)
-     arr_pool_layer = numpyasdlarrayhandle(pool_layer)
-
-     gradient_x1 = np.empty(shapeX, dtype=np.float32)
-     arr_gradient_x1 = numpyasdlarrayhandle(gradient_x1)
-
- np_pool_layer = np_max_pooling(x, 2, 2, 0, 2)
- cpu_op.max_pool(arr_x, 2, 2, arr_pool_layer, 0, 2)
- # print('poollayer:',np_pool_layer)
- # print(arr_pool_layer.asnumpy())
-
-     # The pooled output itself serves as the upstream gradient on both the
-     # numpy and the cpu_op path, so the two results stay directly comparable.
-     np_gradient_x = np_max_pooling_gradient(x, np_pool_layer, 2, 2, 0, 2)
-     cpu_op.max_pool_gradient(arr_x, arr_pool_layer, 2,
-                              2, arr_gradient_x1, 0, 2)
-
- # print(arr_pool_layer.asnumpy())
- # print(np_pool_layer)
-
- np.testing.assert_allclose(np_pool_layer, pool_layer, rtol=1e-5)
-
- np.testing.assert_allclose(np_gradient_x, gradient_x1, rtol=1e-5)
-
- '''
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (3, 3, ll[i], ll[i])
-         to_shape = (3, 3, ll[i] // 2, ll[i] // 2)
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- arr_x = ndarray.array(x, ctx=ctx)
- arr_y = ndarray.empty(to_shape, ctx=ctx)
- # print(arr_x.asnumpy())
- start = time.time()
- for _ in range(10):
- cpu_op.max_pooling(arr_x, 2, 2, arr_y, 0, 2)
- end = time.time()
- for _ in range(10):
- out=np_max_pooling(x,2,2,0,2)
- end1=time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:",end1-end )
- y = arr_y.asnumpy()
- np.testing.assert_allclose (out, y, rtol=1e-5)
- '''
-
-
- # test_max_pooling()
-
-
- def test_matrix_multiply():
- for i in range(len(ll)):
- # ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- y = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- z = np.zeros(shape, dtype=np.float32)
-
- start = time.time()
- # numpy
- for _ in range(10):
- c_np = np.dot(x, y)
- time1 = time.time()
- arr_x = numpyasdlarrayhandle(x)
- arr_y = numpyasdlarrayhandle(y)
- arr_z = numpyasdlarrayhandle(z)
- time2 = time.time()
-         for _ in range(10):
- cpu_op.matrix_multiply(
- arr_x, False,
- arr_y, False,
- arr_z)
- time4 = time.time()
- print(ll[i], " cpu:", time4 - time2, " ", "numpy:", time1 - start)
- np.testing.assert_allclose(c_np, z, rtol=1e-5)
-
-
- # test_matrix_multiply()
-
-
- def test_matrix_elementwise_multiply_by_const():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- # y = np.random.uniform(-1, 1, shape).astype(np.float32)
- arr_x = numpyasdlarrayhandle(x)
- # arr_y = ndarray.array(y, ctx=ctx)
- val = 4.754545
- z = np.empty(shape, dtype=np.float32)
- arr_z = numpyasdlarrayhandle(z)
-
- start = time.time()
- for _ in range(10):
- cpu_op.matrix_elementwise_multiply_by_const(arr_x, val, arr_z)
- end = time.time()
- for _ in range(10):
- nu = x * val
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- # output=z.asnumpy()
- np.testing.assert_allclose(nu, z, rtol=1e-5)
-
-
- # test_matrix_elementwise_multiply_by_const()
-
-
- def test_matrix_elementwise_add_by_const():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
-
- # y = np.random.uniform(-1, 1, shape).astype(np.float32)
- arr_x = numpyasdlarrayhandle(x)
- # arr_y = ndarray.array(y, ctx=ctx)
- val = 4.754545
- z = np.empty(shape, dtype=np.float32)
- arr_z = numpyasdlarrayhandle(z)
-
- start = time.time()
- for _ in range(10):
- cpu_op.matrix_elementwise_add_by_const(arr_x, val, arr_z)
- end = time.time()
- for _ in range(10):
- nu = x + val
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- np.testing.assert_allclose(nu, z, rtol=1e-5)
-
-
- # test_matrix_elementwise_add_by_const()
-
-
- def test_matrix_elementwise_multiply():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- y = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- start = time.time()
- # numpy
- for _ in range(10):
- c_np = x * y
- time1 = time.time()
- arr_x = numpyasdlarrayhandle(x)
- arr_y = numpyasdlarrayhandle(y)
- z = np.empty(shape, dtype=np.float32)
- arr_z = numpyasdlarrayhandle(z)
- time2 = time.time()
- for _ in range(10):
- cpu_op.matrix_elementwise_multiply(
- arr_x,
- arr_y,
- arr_z)
- time4 = time.time()
- print(ll[i], " cpu:", time4 - time2, " ", "numpy:", time1 - start)
- np.testing.assert_allclose(c_np, z, rtol=1e-5)
-
-
- # test_matrix_elementwise_multiply()
-
- def test_matrix_elementwise_add():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- y = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- start = time.time()
- # numpy
- for _ in range(10):
- c_np = x + y
- time1 = time.time()
- arr_x = numpyasdlarrayhandle(x)
- arr_y = numpyasdlarrayhandle(y)
- z = np.empty(shape, dtype=np.float32)
- arr_z = numpyasdlarrayhandle(z)
- time2 = time.time()
- for _ in range(10):
- cpu_op.matrix_elementwise_add(
- arr_x,
- arr_y,
- arr_z)
- time4 = time.time()
- print(ll[i], " cpu:", time4 - time2, " ", "numpy:", time1 - start)
- np.testing.assert_allclose(c_np, z, rtol=1e-5)
-
-
- # test_matrix_elementwise_add()
-
- def test_matrix_div_const():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- # y = np.random.uniform(-1, 1, shape).astype(np.float32)
- # arr_y = ndarray.array(y, ctx=ctx)
- val = 4.754545
- arr_x = numpyasdlarrayhandle(x)
- z = np.empty(shape, dtype=np.float32)
- arr_z = numpyasdlarrayhandle(z)
- start = time.time()
- for _ in range(10):
- cpu_op.matrix_elementwise_divide_by_const(arr_x, val, arr_z)
- end = time.time()
- for _ in range(10):
- nu = x / val
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- np.testing.assert_allclose(nu, z, rtol=1e-5)
-
- # test_matrix_div_const()
-
-
- def test_divide_elewise():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- y = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- start = time.time()
- # numpy
-
- for _ in range(10):
- c_np = x / y
- time1 = time.time()
- arr_x = numpyasdlarrayhandle(x)
- arr_y = numpyasdlarrayhandle(y)
- z = np.empty(shape, dtype=np.float32)
- arr_z = numpyasdlarrayhandle(z)
- time2 = time.time()
- for _ in range(10):
- cpu_op.matrix_elementwise_divide(
- arr_x,
- arr_y,
- arr_z)
- time4 = time.time()
- print(ll[i], " cpu:", time4 - time2, " ", "numpy:", time1 - start)
- np.testing.assert_allclose(c_np, z, rtol=1e-5)
-
-
- # test_divide_elewise()
-
-
- def test_arrayset_oneslike():
-     for i in range(len(ll)):
-         shape = (ll[i], ll[i])
-         val = 1
-         z = np.zeros(shape, dtype=np.float32)
-         arr_z = numpyasdlarrayhandle(z)
-         start = time.time()
-         for _ in range(10):
-             cpu_op.array_set(arr_z, val)
-         end = time.time()
-         for _ in range(10):
-             output_val = np.ones(shape)
-         end1 = time.time()
-         print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
-         np.testing.assert_allclose(output_val, z, rtol=1e-5)
-
- # test_arrayset_oneslike()
-
-
- def test_arrayset_zeroslike():
-     shape = (10, 100)
-     val = 0
-     z = np.empty(shape, dtype=np.float32)
-     arr_z = numpyasdlarrayhandle(z)
-     cpu_op.array_set(arr_z, val)
-     output_val = np.zeros(shape)
-     np.testing.assert_allclose(output_val, z, rtol=1e-5)
-
- # test_arrayset_zeroslike()
-
-
- def test_softmax():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- arr_x = ndarray.array(x, ctx=ctx)
- z = ndarray.empty(shape, ctx=ctx)
- start = time.time()
- for _ in range(10):
- cpu_op.softmax(arr_x, z)
- end = time.time()
- for _ in range(10):
- b = x - np.max(x, axis=1, keepdims=True)
- expb = np.exp(b)
- softmax = expb / np.sum(expb, axis=1, keepdims=True)
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- out = z.asnumpy()
- np.testing.assert_allclose(softmax, out, rtol=1e-5)
-
-
- # test_softmax()
-
- def test_softmax_crossentropy():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], 10)
- y = np.random.uniform(-5, 5, shape).astype(np.float32)
- y_ = np.random.uniform(-5, 5, shape).astype(np.float32)
- arr_y = ndarray.array(y, ctx=ctx)
- arr_y_ = ndarray.array(y_, ctx=ctx)
- arr_out = ndarray.empty((1,), ctx=ctx)
- start = time.time()
- for _ in range(10):
- cpu_op.softmax_crossentropy(arr_y, arr_y_, arr_out)
- end = time.time()
- for _ in range(10):
- b = y - np.max(y, axis=1, keepdims=True)
- expb = np.exp(b)
- softmax = expb / np.sum(expb, axis=1, keepdims=True)
- cross_entropy = np.mean(
- -np.sum(y_ * np.log(softmax), axis=1), keepdims=True)
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- out = arr_out.asnumpy()
- # print(out)
- # print(cross_entropy)
- np.testing.assert_allclose(cross_entropy, out, rtol=1e-3)
-
-
- # test_softmax_crossentropy()
-
- def test_sqrt():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(0, 10, shape).astype(np.float32)
- arr_x = ndarray.array(x, ctx=ctx)
- arr_y = ndarray.empty(shape, ctx=ctx)
- start = time.time()
- for _ in range(10):
- cpu_op.sqrt(arr_x, arr_y)
- end = time.time()
- for _ in range(10):
- out = np.sqrt(x)
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- y = arr_y.asnumpy()
- np.testing.assert_allclose(out, y, rtol=1e-3)
-
- # test_sqrt()
-
-
- def test_tanh():
- # TODO
- raise NotImplementedError
-
-
- # test_tanh()
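-
-
- # A minimal sketch of what test_tanh could look like, mirroring test_sqrt
- # above. NOTE: `cpu_op.tanh(in_arr, out_arr)` is assumed here, not confirmed;
- # adjust the name/signature to whatever cpu_links actually exports.
- def test_tanh_sketch():
-     ctx = ndarray.cpu(0)
-     shape = (500, 500)
-     x = np.random.uniform(-5, 5, shape).astype(np.float32)
-     arr_x = ndarray.array(x, ctx=ctx)
-     arr_y = ndarray.empty(shape, ctx=ctx)
-     cpu_op.tanh(arr_x, arr_y)  # assumed binding, see note above
-     np.testing.assert_allclose(np.tanh(x), arr_y.asnumpy(), rtol=1e-5)
-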
-
- def test_sigmoid():
- # TODO
- raise NotImplementedError
-
-
- # test_sigmoid()
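-
-
- # Same idea for sigmoid; `cpu_op.sigmoid(in_arr, out_arr)` is likewise an
- # assumed binding, not confirmed against cpu_links.
- def test_sigmoid_sketch():
-     ctx = ndarray.cpu(0)
-     shape = (500, 500)
-     x = np.random.uniform(-5, 5, shape).astype(np.float32)
-     arr_x = ndarray.array(x, ctx=ctx)
-     arr_y = ndarray.empty(shape, ctx=ctx)
-     cpu_op.sigmoid(arr_x, arr_y)  # assumed binding
-     np.testing.assert_allclose(
-         1.0 / (1.0 + np.exp(-x)), arr_y.asnumpy(), rtol=1e-5)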
-
-
- def test_opposite():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- arr_x = ndarray.array(x, ctx=ctx)
- arr_y = ndarray.empty(shape, ctx=ctx)
- start = time.time()
- for _ in range(10):
- cpu_op.opposite(arr_x, arr_y)
- end = time.time()
- for _ in range(10):
- out = -x
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- y = arr_y.asnumpy()
- np.testing.assert_allclose(out, y, rtol=1e-5)
-
-
- # test_opposite()
-
- def test_relu():
- for i in range(len(ll)):
- ctx = ndarray.cpu(0)
- shape = (ll[i], ll[i])
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- arr_x = ndarray.array(x, ctx=ctx)
- arr_y = ndarray.empty(shape, ctx=ctx)
- start = time.time()
- for _ in range(10):
- cpu_op.relu(arr_x, arr_y)
- end = time.time()
- for _ in range(10):
- out = np.maximum(x, 0).astype(np.float32)
- end1 = time.time()
- print(ll[i], " cpu:", end - start, " ", "numpy:", end1 - end)
- y = arr_y.asnumpy()
- np.testing.assert_allclose(out, y, rtol=1e-5)
- # test_relu()
-
-
- def test_relu_gradient():
- shape = (2, 2)
- ctx = ndarray.cpu(0)
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- print("x:", x)
- grad_x = np.random.uniform(-5, 5, shape).astype(np.float32)
- print("g:", grad_x)
- arr_x = ndarray.array(x, ctx=ctx)
- arr_grad_x = ndarray.array(grad_x, ctx=ctx)
- arr_y = ndarray.empty(shape, ctx=ctx)
- cpu_op.relu_gradient(arr_x, arr_grad_x, arr_y)
- y = arr_y.asnumpy()
- print(y)
- np.testing.assert_allclose(((x > 0) * grad_x).astype(np.float32), y)
-
- # test_relu_gradient()
-
-
- def test_conv2d():
- ctx = ndarray.cpu(0)
-
- # im2col and np_conv2d are helper functions
- def im2col(X, filter_H, filter_W, padding, stride):
- N, C, H, W = X.shape
- assert (H + 2 * padding - filter_H) % stride == 0
- assert (W + 2 * padding - filter_W) % stride == 0
- out_H = int((H + 2 * padding - filter_H) / stride + 1)
- out_W = int((W + 2 * padding - filter_W) / stride + 1)
-
- y_row_size = int(C * filter_H * filter_W)
- y_col_size = int(out_H * out_W)
- y_shape = (N, y_row_size, y_col_size)
- Y = np.empty(y_shape, dtype=X.dtype)
-
- for batch_index in range(N):
- for col_index in range(y_col_size):
-                 out_y = col_index // out_W
- out_x = col_index % out_W
- in_y = int(out_y * stride - padding)
- in_x = int(out_x * stride - padding)
- row_idx = 0
- for c in range(0, C):
- for y in range(in_y, in_y + filter_H):
- for x in range(in_x, in_x + filter_W):
- if (x < 0 or x >= W or y < 0 or y >= H):
- Y[batch_index, row_idx, col_index] = 0
- else:
- Y[batch_index, row_idx,
- col_index] = X[batch_index, c, y, x]
- row_idx += 1
- return Y
-
- def np_conv2d(X, Filter, padding=0, stride=1):
- """Implement a conv2d as a matrix multiply after im2col."""
- filter_outChannel, filter_inChannel, filter_H, filter_W = Filter.shape
- N, C, H, W = X.shape
- assert (H + 2 * padding - filter_H) % stride == 0
- assert (W + 2 * padding - filter_W) % stride == 0
- out_H = int((H + 2 * padding - filter_H) / stride + 1)
- out_W = int((W + 2 * padding - filter_W) / stride + 1)
-
- im2col_matrix = im2col(X, filter_H, filter_W, padding, stride)
- filter_matrix = Filter.reshape(filter_outChannel, -1)
- #print("shape", im2col_matrix.shape)
- #print("shape", filter_matrix.shape)
- #print("shape", np.matmul(filter_matrix, im2col_matrix).shape)
- return np.matmul(filter_matrix, im2col_matrix).reshape(N, filter_outChannel, out_H, out_W)
- # return im2col_matrix
-
- shapeX = (100, 3, 28, 28)
- shapeF = (10, 3, 5, 5)
- shapeY = (100, 10, 24, 24)
- # shapeX=(1,1,2,2)
- # shapeF=(1,1,1,1)
- # shapeY=(1,1,2,2)
- #shapeW = (100, 3 * 5 * 5, 24 * 24)
- x = np.random.uniform(0, 10, size=shapeX).astype(np.float32)
- f = np.random.uniform(0, 10, size=shapeF).astype(np.float32)
- # print("x:",x)
- # print("f:",f)
- # y = np.zeros(shapeY).astype(np.float32)
- arr_x = ndarray.array(x, ctx=ctx)
- arr_f = ndarray.array(f, ctx=ctx)
- arr_y = ndarray.empty(shapeY, ctx=ctx)
- #arr_workspace = ndarray.empty(shapeW, ctx=ctx)
-
- cpu_op.conv2d(arr_x, arr_f, arr_y, 0, 1)
- y = arr_y.asnumpy()
- # print("y:",y)
- z = np_conv2d(x, f)
- # print("z:",z)
- np.testing.assert_allclose(z, y, rtol=1e-5)
-
- # test_conv2d()
-
-
- def test_conv2d_Gradient():
- ctx = ndarray.cpu(0)
-
- def im2col(X, filter_H, filter_W, padding, stride):
- N, C, H, W = X.shape
- assert (H + 2 * padding - filter_H) % stride == 0
- assert (W + 2 * padding - filter_W) % stride == 0
-         out_H = (H + 2 * padding - filter_H) // stride + 1
-         out_W = (W + 2 * padding - filter_W) // stride + 1
-
- y_row_size = C * filter_H * filter_W
- y_col_size = out_H * out_W
- y_shape = (N, y_row_size, y_col_size)
- Y = np.empty(y_shape, dtype=X.dtype)
-
- for batch_index in range(N):
- for col_index in range(y_col_size):
-                 out_y = col_index // out_W
- out_x = col_index % out_W
- in_y = out_y * stride - padding
- in_x = out_x * stride - padding
- row_idx = 0
- for c in range(0, C):
- for y in range(in_y, in_y + filter_H):
- for x in range(in_x, in_x + filter_W):
- if (x < 0 or x >= W or y < 0 or y >= H):
- Y[batch_index, row_idx, col_index] = 0
- else:
- Y[batch_index, row_idx,
- col_index] = X[batch_index, c, y, x]
- row_idx += 1
- return Y
-
- def np_conv2d(X, Filter, padding=0, stride=1):
- """Implement a conv2d as a matrix multiply after im2col."""
- filter_outChannel, filter_inChannel, filter_H, filter_W = Filter.shape
- N, C, H, W = X.shape
- assert (H + 2 * padding - filter_H) % stride == 0
- assert (W + 2 * padding - filter_W) % stride == 0
-         out_H = (H + 2 * padding - filter_H) // stride + 1
-         out_W = (W + 2 * padding - filter_W) // stride + 1
-
- im2col_matrix = im2col(X, filter_H, filter_W, padding, stride)
- filter_matrix = Filter.reshape(filter_outChannel, -1)
- # print("shape", im2col_matrix.shape)
- # print("shape", filter_matrix.shape)
- # print("shape", np.matmul(filter_matrix, im2col_matrix).shape)
- return np.matmul(filter_matrix, im2col_matrix).reshape(N, filter_outChannel, out_H, out_W)
- # return im2col_matrix
-
- def im2col_transpose(X, filter_H, filter_W, Y, padding, stride):
- N, C, H, W = X.shape
- assert (H + 2 * padding - filter_H) % stride == 0
- assert (W + 2 * padding - filter_W) % stride == 0
-         out_H = (H + 2 * padding - filter_H) // stride + 1
-         out_W = (W + 2 * padding - filter_W) // stride + 1
- _, y_row_size, y_col_size = Y.shape
-
- der_X_shape = (N, C, H, W)
- der_X = np.zeros(der_X_shape, dtype=X.dtype)
-
- for batch_index in range(N):
- for col_index in range(y_col_size):
-                 out_y = col_index // out_W
- out_x = col_index % out_W
- in_y = out_y * stride - padding
- in_x = out_x * stride - padding
- row_idx = 0
- for c in range(0, C):
- for y in range(in_y, in_y + filter_H):
- for x in range(in_x, in_x + filter_W):
- if (x < 0 or x >= W or y < 0 or y >= H):
- Y[batch_index, row_idx, col_index] = 0
- else:
- der_X[batch_index, c, y,
- x] += Y[batch_index, row_idx, col_index]
- row_idx += 1
- return der_X
-
- def np_conv2d_transpose(X, Filter, Y, padding=0, stride=1):
- """Implement a conv2d_transpose as a matrix multiply after im2col."""
- filter_outChannel, filter_inChannel, filter_H, filter_W = Filter.shape
- X_N, X_C, X_H, X_W = X.shape
- Y_N, Y_C, Y_H, Y_W = Y.shape
- YY = Y.reshape((Y_N, Y_C, Y_H * Y_W)) # transformed to im2col Y
- # XX = X.reshape((X_N, X_C, X_W * X_H)) # transformed to im2col X
- F_filter = Filter.reshape((filter_outChannel, -1))
- gradient_im2col_XX = np.matmul(F_filter.T, YY)
-
- gradient_X = im2col_transpose(
- X, filter_H, filter_W, gradient_im2col_XX, padding, stride) # gradient of x
- im2col_XX = im2col(X, filter_H, filter_W, padding, stride)
- gradient_filter = np.zeros(shape=F_filter.shape, dtype=X.dtype)
-
- for i in range(X_N):
- gradient_filter += np.matmul(YY[i], im2col_XX[i].T)
- gradient_filter = gradient_filter.reshape(Filter.shape)
-
- return gradient_X, gradient_filter
-
- # shapeX = (100, 3, 28, 28)
- # shapeF = (10, 3, 5, 5)
- # shapeY = (100, 10, 24, 24)
- shapeX = (1, 1, 2, 2)
- shapeF = (1, 1, 1, 1)
- shapeY = (1, 1, 2, 2)
-     # shapeW = (100, 3 * 5 * 5, 24 * 24)  # leftover from the large config above
-     # shapeFF = (100, 10, 3, 5, 5)        # unused
- # input : x , filter : f , output: y
- x = np.random.uniform(0, 10, size=shapeX).astype(np.float32)
- f = np.random.uniform(0, 10, size=shapeF).astype(np.float32)
- # print('x:',x)
- # print('f:',f)
-
- der_y = np.ones(shape=shapeY)
- gradient_x, gradient_f = np_conv2d_transpose(x, f, der_y)
-
- # print('gredent_f:',gradient_f)
- print(gradient_f.shape)
- # print('der_y:',der_y)
-
- arr_x = ndarray.array(x, ctx=ctx)
- arr_f = ndarray.array(f, ctx=ctx)
- gradient_y = ndarray.array(der_y, ctx=ctx)
- gradient_xx = ndarray.array(x, ctx=ctx)
- gradient_ff = ndarray.array(f, ctx=ctx)
- cpu_op.conv2d_gradient_of_filter(arr_x, gradient_y, gradient_ff)
- cpu_op.conv2d_gradient_of_data(arr_f, gradient_y, gradient_xx)
-
- np.testing.assert_allclose(gradient_x, gradient_xx.asnumpy(), rtol=1e-5)
- np.testing.assert_allclose(gradient_f, gradient_ff.asnumpy(), rtol=1e-5)
-
-
- # test_conv2d_Gradient()
-
-
- def test_concat():
- ctx = ndarray.cpu(0)
- shape1 = (1, 2)
- shape2 = (1, 2)
- x1 = np.array([1, 2, 2, 2]).reshape((2, 2))
- x2 = np.array([3, 4, 4, 4, 4, 4]).reshape((2, 3))
- arr_x1 = ndarray.array(x1, ctx=ctx)
- arr_x2 = ndarray.array(x2, ctx=ctx)
- to_shape = (2, 5)
-
- arr_y = ndarray.empty(to_shape, ctx=ctx)
- cpu_op.concat(arr_x1, arr_x2, arr_y, axis=1)
-
-     print(arr_y.asnumpy())
-     np.testing.assert_allclose(
-         np.concatenate((x1, x2), axis=1), arr_y.asnumpy())
- #
- # grad_x1 = ndarray.empty(shape1, ctx=ctx)
- # grad_x2 = ndarray.empty(shape2, ctx=ctx)
- # grad_y = np.array([1, 2, 3, 4]).reshape((1, 4))
- # grad_y_arr = ndarray.array(grad_y, ctx=ctx)
- #
- # gpu_op.concat_gradient(grad_y_arr, grad_x1, axis=1, idx=0)
- # gpu_op.concat_gradient(grad_y_arr, grad_x2, axis=1, idx=1)
- # print(grad_x1.asnumpy())
- # print(grad_x2.asnumpy())
- # test_concat()
-
-
- def test_Batch_Normalization():
- ctx = ndarray.cpu(0)
- shape = (1, 1, 2, 2)
- shape2 = [2]
- np.random.seed(111)
- x = np.random.uniform(-5, 5, size=shape).astype(np.float32)
- scale = np.random.uniform(0, 1, size=shape2).astype(np.float32)
- bias = np.random.uniform(0, 1, size=shape2).astype(np.float32)
- arr_x = ndarray.array(x, ctx=ctx)
- arr_scale = ndarray.array(scale, ctx=ctx)
- arr_bias = ndarray.array(bias, ctx=ctx)
-
- arr_y = ndarray.empty(shape, ctx=ctx)
- print('x:', x)
- print('scale:', scale)
- print('bias:', bias)
-
- cpu_op.Batch_Normalization(arr_x, arr_scale, arr_bias, arr_y, 0.99, 0.01)
-
- print(arr_y.asnumpy())
-
- # gradient
- arr_gradient_x = ndarray.empty(shape, ctx=ctx)
- arr_gradient_scale = ndarray.empty(shape2, ctx=ctx)
- arr_gradient_bias = ndarray.empty(shape2, ctx=ctx)
-
- cpu_op.Batch_Normalization_gradient(arr_y, arr_x, arr_scale, arr_bias,
- arr_gradient_x, arr_gradient_scale,
- arr_gradient_bias, 0.01)
- print('arr_gradient_x:', arr_gradient_x.asnumpy())
- print('arr_gradient_scale:', arr_gradient_scale.asnumpy())
- print('arr_gradient_bias:', arr_gradient_bias.asnumpy())
-
- #
- # # tf
- # import tensorflow as tf
- #
- # tf_x=tf.placeholder(tf.float32,[1,1,2,2])
- # train_flag=tf.placeholder(tf.bool)
- # tf_scale=tf.Variable(scale,dtype=tf.float32)
- # tf_bias=tf.Variable(bias,dtype=tf.float32)
- #
- # def batch_norm(input, scale, shift):
- # axis = list(range(len(input.get_shape()) - 1))
- # a_mean, a_var = tf.nn.moments(input, axis)
- # return tf.nn.batch_normalization(input, mean=a_mean, variance=a_var,
- # offset=shift, scale=scale, variance_epsilon=1e-2, name=None)
- #
- # out=batch_norm(tf_x,tf_scale,tf_bias)
- #
- # with tf.Session as sess:
- # sess.run(tf.global_variables_initializer())
- # oo=sess.run([out],feed_dict={tf_x:x,train_flag:True})
- # print(oo)
-
-
- # test_Batch_Normalization()
-
-
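- # Two disabled scratch blocks follow: a numpy-vs-dnnl matmul timing loop and
- # a ctypes experiment that calls lib_dnnl_op.so directly.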
- '''
-
- for i in range(1):
- ll=[500,20,30,40,50,60,70,80,90,100,200,300,400,500,600,700,800,900,1000,2000,3000,4000,5000,6000,7000,8000,9000]
-
- transA='N'
- transB='T'
-
-
- M = ll[i]
- N = ll[i]
- K = ll[i]
- A = np.random.randn(M, K)
- B = np.random.randn(K, N)
- C = np.zeros((M, N))
-
-
- start = time.time()
- ###numpy
- for i in range(100):
- C_np = np.dot(A, B)
- #print(C_np)
- time1 = time.time()
- print("numpy:total cost time:%.4f " % (time1 - start))
- M, K = A.shape
- K1, N = B.shape
- #print(B)
- linear_A = np.reshape(A, (-1))
- linear_B = np.reshape(B, (-1),order='F')
- linear_C = np.reshape(C, (-1))
- #print(linear_B)
- linear_A = np.ascontiguousarray(linear_A, dtype=np.float32)
- linear_B = np.ascontiguousarray(linear_B, dtype=np.float32)
- linear_C = np.ascontiguousarray(linear_C, dtype=np.float32)
- #print(linear_B)
- #print(linear_A)
- time4 = time.time()
- for i in range(100):
- #cpu_op.matmul(A,B,C)
- xx=cpu_op.matmul2(M,N,K,linear_A,linear_B,linear_C)
- #xx=cpu_op.sgemm(transA,transB,M,N,K,linear_A,linear_B,linear_C)
- time2 = time.time()
- print("dnnl:total cost time:%.4f " % (time2 - time4))
- #print(xx)
-
- #save_to_file("%d,%.4f,%.4f\n" % (M, time1 - start, time2 - time4), 'time.txt')
-
- '''
-
- '''
- ##c++
- linear_A = np.reshape(A, (-1))
- linear_B = np.reshape(B, (-1))
- linear_C = np.reshape(C, (-1))
-
- time4 = time.time()
- print("total cost time:%.4f " % (time4 - time1))
-
- so = ctypes.cdll.LoadLibrary
- lib = so("../../build/lib/lib_dnnl_op.so")
- star = time.time()
- print("total cost time:%.4f " % (star - time4))
-
- #if not linear_A.flags['C_CONTIGUOUS']:
- linear_A = np.ascontiguousarray(linear_A, dtype=np.float32)
- #if not linear_B.flags['C_CONTIGUOUS']:
- linear_B = np.ascontiguousarray(linear_B, dtype=np.float32)
- #if not linear_C.flags['C_CONTIGUOUS']:
- linear_C = np.ascontiguousarray(linear_C, dtype=np.float32)
-
- start2 = time.time()
- print("total cost time:%.4f " % (start2 - star))
- for i in range(100):
- lib.test(M, N, K, linear_A.ctypes.data_as(ctypes.c_void_p), linear_B.ctypes.data_as(ctypes.c_void_p),
- linear_C.ctypes.data_as(ctypes.c_void_p))
- time2 = time.time()
- print("last,total cost time:%.4f " % ((time2 - start2)/100))
-
- print(linear_C)
- '''
-
-
- def test_transpose():
- shape = (4321, 1234)
- ctx = ndarray.cpu(0)
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- y = np.empty((shape[1], shape[0]), dtype=np.float32)
-
- arr_x = ndarray.numpyasdlarrayhandle(x)
- arr_y = ndarray.numpyasdlarrayhandle(y)
- cpu_op.transpose(arr_x, arr_y, [1, 0])
- np.testing.assert_allclose(np.transpose(x), y)
-
- shape = (21, 43, 65, 11)
- x = np.random.uniform(-1, 1, shape).astype(np.float32)
- y = np.empty((65, 11, 43, 21), dtype=np.float32)
-
- arr_x = ndarray.numpyasdlarrayhandle(x)
- arr_y = ndarray.numpyasdlarrayhandle(y)
- cpu_op.transpose(arr_x, arr_y, perm=[2, 3, 1, 0])
- np.testing.assert_allclose(np.transpose(x, [2, 3, 1, 0]), y)
-
- # test_transpose()
-
-
- def test_embedding_lookup():
- emb = np.random.rand(5, 5)
- ctx = ndarray.cpu(0)
- print(emb)
- emb = ndarray.array(emb, ctx=ctx)
- ids = [[0, 1], [0, 1]]
- ids = np.array(ids)
- print(ids)
- ids = ndarray.array(ids, ctx=ctx)
-
- output = ndarray.empty((2, 2, 5), ctx=ctx)
- cpu_op.embedding_lookup(emb, ids, output)
- print(output.asnumpy())
-
-
- test_embedding_lookup()
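-
-
- # Note: test_embedding_lookup() above runs on import; the other tests are
- # toggled by (un)commenting their calls. A runner such as pytest would also
- # collect every test_* function here, including the sketches above.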