# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: skip-file
from __future__ import print_function
import numpy as np
import mxnet as mx
import copy
import math
import random
import itertools
from distutils.version import LooseVersion
from numpy.testing import assert_allclose, assert_array_equal
from mxnet.test_utils import *
from mxnet.base import py_str, MXNetError, _as_list
from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, assertRaises
import unittest


def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e-4):
    dshape = (N, T, I)
    data = mx.sym.Variable('data')

    Y1, _ = cell1.unroll(T, data, layout='NTC', merge_outputs=True)
    mod1 = mx.mod.Module(Y1, label_names=None, context=default_context())
    mod1.bind(data_shapes=[('data', dshape)], label_shapes=None, inputs_need_grad=True, grad_req=grad_req)

    Y2, _ = cell2.unroll(T, data, layout='NTC', merge_outputs=True)
    mod2 = mx.mod.Module(Y2, label_names=None, context=default_context())
    mod2.bind(data_shapes=[('data', dshape)], label_shapes=None, inputs_need_grad=True, grad_req=grad_req)

    mod1.init_params()
    args, auxs = mod1.get_params()
    args = cell1.unpack_weights(args)
    args = cell2.pack_weights(args)
    mod2.set_params(args, auxs)

    x = mx.random.uniform(shape=dshape)
    batch = mx.io.DataBatch(data=[x])
    # check inference
    mod1.forward(batch, is_train=False)
    mod2.forward(batch, is_train=False)
    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=rtol, atol=atol)

    # check training
    mod1.forward(batch, is_train=True)
    mod2.forward(batch, is_train=True)
    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=rtol, atol=atol)

    dy = mx.random.uniform(shape=mod1.get_outputs()[0].shape)
    mod1.backward(out_grads=[dy])
    mod2.backward(out_grads=[dy])
    if grad_req != 'null':
        assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol)
    else:
        assert(mod1.get_input_grads()[0] == None)
        assert(mod2.get_input_grads()[0] == None)
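
# NOTE: check_rnn_consistency compares a fused (cuDNN-backed) RNN cell against an
# equivalent stack of unfused cells. Both modules share one parameter set (moved
# across via unpack_weights/pack_weights) and must agree on forward outputs and on
# the input gradients for every grad_req mode. A minimal usage sketch, assuming a
# single-layer LSTM (illustrative only, not part of the test suite):
#
#   fused = mx.rnn.FusedRNNCell(100, num_layers=1, mode='lstm', get_next_state=True, prefix='')
#   stack = mx.rnn.SequentialRNNCell()
#   stack.add(mx.rnn.LSTMCell(100, prefix='l0_'))
#   check_rnn_consistency(fused, stack, T=5, N=8, I=100, H=100, grad_req='write')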

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_lstm_sym():
    T, N, I, H = 5, 32, 800, 800
    fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='lstm', get_next_state=True, prefix='')
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.LSTMCell(H, prefix='l0_'))
    stack.add(mx.rnn.LSTMCell(H, prefix='l1_'))
    stack.add(mx.rnn.LSTMCell(H, prefix='l2_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
    check_rnn_consistency(fused, stack, T, N, I, H, 'null')

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_lstm_bidirectional():
    T, N, I, H = 5, 20, 800, 800
    fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='lstm',
                                bidirectional=True, get_next_state=True, prefix='')

    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.LSTMCell(H, prefix='l0_'),
        mx.rnn.LSTMCell(H, prefix='r0_'),
        output_prefix='bi_lstm_0_'))
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.LSTMCell(H, prefix='l1_'),
        mx.rnn.LSTMCell(H, prefix='r1_'),
        output_prefix='bi_lstm_1_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
    check_rnn_consistency(fused, stack, T, N, I, H, 'null')

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_gru_sym():
    T, N, I, H = 5, 32, 800, 800
    fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='gru', get_next_state=True, prefix='')
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.GRUCell(H, prefix='l0_'))
    stack.add(mx.rnn.GRUCell(H, prefix='l1_'))
    stack.add(mx.rnn.GRUCell(H, prefix='l2_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
    check_rnn_consistency(fused, stack, T, N, I, H, 'null')

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_gru_bidirectional():
    T, N, I, H = 5, 20, 800, 800
    fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='gru',
                                bidirectional=True, get_next_state=True, prefix='')

    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.GRUCell(H, prefix='l0_'),
        mx.rnn.GRUCell(H, prefix='r0_'),
        output_prefix='bi_gru_0_'))
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.GRUCell(H, prefix='l1_'),
        mx.rnn.GRUCell(H, prefix='r1_'),
        output_prefix='bi_gru_1_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
    check_rnn_consistency(fused, stack, T, N, I, H, 'null')

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_rnntanh_sym():
    T, N, I, H = 5, 32, 800, 800
    fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_tanh', get_next_state=True, prefix='')
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l0_'))
    stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l1_'))
    stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l2_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
    check_rnn_consistency(fused, stack, T, N, I, H, 'null')

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_rnntanh_bidirectional():
    T, N, I, H = 5, 20, 800, 800
    fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_tanh',
                                bidirectional=True, get_next_state=True, prefix='')

    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.RNNCell(H, activation='tanh', prefix='l0_'),
        mx.rnn.RNNCell(H, activation='tanh', prefix='r0_'),
        output_prefix='bi_rnntanh_0_'))
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.RNNCell(H, activation='tanh', prefix='l1_'),
        mx.rnn.RNNCell(H, activation='tanh', prefix='r1_'),
        output_prefix='bi_rnntanh_1_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
    check_rnn_consistency(fused, stack, T, N, I, H, 'null')

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_rnnrelu_sym():
    T, N, I, H = 5, 32, 200, 200
    fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_relu', get_next_state=True, prefix='')
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l0_'))
    stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l1_'))
    stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l2_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
    check_rnn_consistency(fused, stack, T, N, I, H, 'null')

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_rnnrelu_bidirectional():
    T, N, I, H = 5, 20, 200, 200
    fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_relu',
                                bidirectional=True, get_next_state=True, prefix='')

    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.RNNCell(H, activation='relu', prefix='l0_'),
        mx.rnn.RNNCell(H, activation='relu', prefix='r0_'),
        output_prefix='bi_rnnrelu_0_'))
    stack.add(mx.rnn.BidirectionalCell(
        mx.rnn.RNNCell(H, activation='relu', prefix='l1_'),
        mx.rnn.RNNCell(H, activation='relu', prefix='r1_'),
        output_prefix='bi_rnnrelu_1_'))

    check_rnn_consistency(fused, stack, T, N, I, H, 'write', rtol=1e-2, atol=1e-2)
    check_rnn_consistency(fused, stack, T, N, I, H, 'add', rtol=1e-2, atol=1e-2)
    check_rnn_consistency(fused, stack, T, N, I, H, 'null', rtol=1e-2, atol=1e-2)

@with_seed()
def test_lstm_dropout():
    X = mx.sym.Variable('x')
    Params = mx.sym.Variable('params')
    HX = mx.sym.Variable('state')
    CX = mx.sym.Variable('state_cell')
    T, N, I, H = 300, 20, 800, 800
    rnn = mx.sym.RNN(data=X, parameters=Params, state=HX, state_cell=CX,
                     state_size=H, num_layers=5, mode='lstm', p=0.5, state_outputs=True, name='LSTM')
    exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I))
    out = exe.forward(is_train=True)
    out[0].wait_to_read()

@with_seed()
def test_gru_dropout():
    X = mx.sym.Variable('x')
    Params = mx.sym.Variable('params')
    HX = mx.sym.Variable('state')
    T, N, I, H = 300, 20, 800, 800
    rnn = mx.sym.RNN(data=X, parameters=Params, state=HX,
                     state_size=H, num_layers=5, mode='gru', p=0.5, state_outputs=True, name='GRU')
    exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I))
    out = exe.forward(is_train=True)
    out[0].wait_to_read()

@with_seed()
def test_rnntanh_dropout():
    X = mx.sym.Variable('x')
    Params = mx.sym.Variable('params')
    HX = mx.sym.Variable('state')
    T, N, I, H = 300, 20, 800, 800
    rnn = mx.sym.RNN(data=X, parameters=Params, state=HX,
                     state_size=H, num_layers=5, mode='rnn_tanh', p=0.5, state_outputs=True, name='RNN_TANH')
    exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I))
    out = exe.forward(is_train=True)
    out[0].wait_to_read()

@with_seed()
def test_rnnrelu_dropout():
    X = mx.sym.Variable('x')
    Params = mx.sym.Variable('params')
    HX = mx.sym.Variable('state')
    T, N, I, H = 300, 20, 800, 800
    rnn = mx.sym.RNN(data=X, parameters=Params, state=HX,
                     state_size=H, num_layers=5, mode='rnn_relu', p=0.5, state_outputs=True, name='RNN_RELU')
    exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I))
    out = exe.forward(is_train=True)
    out[0].wait_to_read()
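
# NOTE: the four dropout tests above are smoke tests only: with p=0.5 the RNN
# output is stochastic, so they simply run a forward pass with is_train=True and
# wait for the result, checking that the dropout path executes without error
# rather than comparing against a numeric reference.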

def np_softmax(x, axis=-1, temperature=1.0):
    x = x - np.max(x, axis=axis, keepdims=True)
    x = np.exp(x/temperature)
    x /= np.sum(x, axis=axis, keepdims=True)
    return x

def check_elementwise_sum_with_shape(shape, n):
    # forward
    inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)]
    out = mx.symbol.ElementWiseSum(*inputs, name='esum')
    arr = [mx.nd.empty(shape) for i in range(n)]
    arr_grad = [mx.nd.empty(shape) for i in range(n)]
    for i in range(n):
        arr[i][:] = np.random.uniform(-10, 10, shape)
    exec1 = out.bind(default_context(), args=arr, args_grad=arr_grad)
    exec1.forward(is_train=True)
    out1 = exec1.outputs[0].asnumpy()
    out = sum(a.asnumpy() for a in arr)
    assert_almost_equal(out, out1, rtol=1e-5, atol=1e-5)

    out_grad = mx.nd.empty(shape)
    out_grad[:] = np.random.uniform(-10, 10, shape)
    # backward
    exec1.backward([out_grad])
    for a in arr_grad:
        assert_almost_equal(a.asnumpy(), out_grad.asnumpy(), rtol=1e-5, atol=1e-5)

@with_seed()
def test_elementwise_sum():
    nrepeat = 2
    maxdim = 4
    for repeat in range(nrepeat):
        for dim in range(1, maxdim):
            shape = tuple(np.random.randint(1, int(1000**(1.0/dim)), size=dim))
            check_elementwise_sum_with_shape(shape, np.random.randint(1, 8))

def check_concat_with_shape(shapes, dimension, skip_second):
    # if skip_second is True, second argument will not have gradient.
    # it is to test #1130
    n = len(shapes)
    # forward
    target_dim = 0
    for shape in shapes:
        target_dim += shape[dimension]

    inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)]
    out = mx.symbol.Concat(*inputs, name='conc', dim=dimension)
    arr = [mx.nd.empty(shape) for shape in shapes]
    for i in range(n):
        arr[i][:] = shapes[i][dimension]
    arr_np = [np.copy(narray.asnumpy()) for narray in arr]
    arr_grad = [mx.nd.empty(shape) for shape in shapes]
    dict_grad = {}
    arg_names = out.list_arguments()

    for name, g in zip(arg_names, arr_grad):
        if not skip_second or name != 'arg1':
            dict_grad[name] = g

    args = out.list_arguments()
    arg_shapes, out_shapes, aux_shapes = out.infer_shape(**dict(zip(args, shapes)))
    out_grad = mx.nd.empty(out_shapes[0])
    exec1 = out.bind(default_context(), args=arr, args_grad=dict_grad)
    exec1.forward(is_train=True)
    out1 = exec1.outputs[0]
    ret = np.concatenate([narray.asnumpy() for narray in arr], axis=dimension)
    assert_almost_equal(out1.asnumpy(), ret)

    # backward
    out1.copyto(out_grad)
    out_grad[:] += 1
    exec1.backward([out_grad])

    for i, name in enumerate(arg_names):
        if not skip_second or name != 'arg1':
            grad = dict_grad[name]
            np_grad = arr_np[i]
            assert_almost_equal(grad.asnumpy(), np_grad + 1)

@with_seed()
def test_concat():
    for dimension in range(4):
        n = 2
        merge = [2, 3, 4, 5, 6]
        a = 2
        b = 3
        c = 4
        # test 2D
        if dimension < 2:
            for dim in range(2, 6):
                shapes = []
                for i in range(dim):
                    if dimension == 0:
                        shapes.append((merge[i], a))
                    elif dimension == 1:
                        shapes.append((a, merge[i]))
                check_concat_with_shape(shapes, dimension, True)
                check_concat_with_shape(shapes, dimension, False)
                # Test negative dim
                check_concat_with_shape(shapes, dimension - 2, True)
                check_concat_with_shape(shapes, dimension - 2, False)
        # test 3D
        if dimension < 3:
            for dim in range(2, 6):
                shapes = []
                for i in range(dim):
                    if dimension == 0:
                        shapes.append((merge[i], a, b))
                    elif dimension == 1:
                        shapes.append((a, merge[i], b))
                    elif dimension == 2:
                        shapes.append((a, b, merge[i]))
                check_concat_with_shape(shapes, dimension, True)
                check_concat_with_shape(shapes, dimension, False)
                # Test negative dim
                check_concat_with_shape(shapes, dimension - 3, True)
                check_concat_with_shape(shapes, dimension - 3, False)
        # test 4D
        for dim in range(2, 6):
            shapes = []
            for i in range(dim):
                if dimension == 0:
                    shapes.append((merge[i], a, b, c))
                elif dimension == 1:
                    shapes.append((a, merge[i], b, c))
                elif dimension == 2:
                    shapes.append((a, b, merge[i], c))
                elif dimension == 3:
                    shapes.append((a, b, c, merge[i]))
            check_concat_with_shape(shapes, dimension, True)
            check_concat_with_shape(shapes, dimension, False)
            # Test negative dim
            check_concat_with_shape(shapes, dimension - 4, True)
            check_concat_with_shape(shapes, dimension - 4, False)
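
# NOTE: in check_concat_with_shape above, each input is filled with a constant
# (its size along the concat axis) and the output gradient is set to the forward
# output plus one. Because Concat's backward simply slices the output gradient
# back onto its inputs, every surviving input gradient must equal the input's
# original constant value + 1, which is what the final loop asserts.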

@with_seed()
def test_slice_channel():
    def check_slice_channel(data_ndim, axis, num_outputs, squeeze_axis):
        ins = []
        if squeeze_axis:
            shape = np.random.randint(2, 5, data_ndim).tolist()
            shape[axis] = num_outputs
            out_ele_shape = [ele for ele in shape]
            del out_ele_shape[axis]
        else:
            shape = np.random.randint(1, 5, data_ndim).tolist()
            shape[axis] *= num_outputs
            out_ele_shape = [ele for ele in shape]
            out_ele_shape[axis] //= num_outputs
        data_npy = np.random.normal(size=shape)
        out_grads_npy = [np.random.normal(size=out_ele_shape) for i in range(num_outputs)]
        data = mx.sym.Variable('data')
        sym = mx.sym.SliceChannel(data=data, num_outputs=num_outputs, axis=axis, squeeze_axis=squeeze_axis)
        exe = sym.simple_bind(ctx=default_context(), data=data_npy.shape)
        assert len(exe.outputs) == num_outputs
        outputs = exe.forward(is_train=True, data=data_npy)
        for i in range(num_outputs):
            gt = data_npy.take(np.arange(i * shape[axis]/num_outputs,
                                         (i+1) * shape[axis]/num_outputs).astype(np.int), axis=axis)
            if squeeze_axis:
                assert_almost_equal(outputs[i].asnumpy(), gt.reshape(outputs[i].shape))
            else:
                assert_almost_equal(outputs[i].asnumpy(), gt)
        # test backward
        exe.backward(out_grads=[mx.nd.array(ele, ctx=default_context()) for ele in out_grads_npy])
        if squeeze_axis:
            assert_almost_equal(exe.grad_arrays[0].asnumpy(),
                                np.concatenate([np.expand_dims(ele, axis=axis) for ele in out_grads_npy],
                                               axis=axis))
        else:
            assert_almost_equal(exe.grad_arrays[0].asnumpy(),
                                np.concatenate(out_grads_npy, axis=axis))
    check_slice_channel(data_ndim=2, axis=1, num_outputs=3, squeeze_axis=True)
    check_slice_channel(data_ndim=4, axis=2, num_outputs=3, squeeze_axis=False)
    check_slice_channel(data_ndim=3, axis=-1, num_outputs=2, squeeze_axis=False)
    check_slice_channel(data_ndim=5, axis=-2, num_outputs=3, squeeze_axis=True)

@with_seed()
def test_regression():
    ''' test regression operator '''
    def check_regression(symbol, forward, backward, shape, stype='default', densities=[0, 0.5, 1]):
        # init executor
        data = mx.symbol.Variable('data')
        label = mx.symbol.Variable('label', stype=stype)
        out = symbol(data, label)
        grad_req = {'data': 'write', 'label': 'null'}
        out_exec = out.simple_bind(default_context(), grad_req=grad_req, data=shape, label=shape)
        arg_map = dict(zip(out.list_arguments(), out_exec.arg_arrays))
        grad_map = dict(zip(out.list_arguments(), out_exec.grad_arrays))
        # init data
        arr_data = mx.random.uniform(-1, 1, shape)
        arg_map["data"][:] = arr_data
        # init label based on density
        arr_label = arg_map["label"]
        atol = 1e-5
        for density in densities:
            arr_label[:] = rand_ndarray(shape, stype, density=density)
            out_exec.forward(is_train=True)
            out_exec.backward()
            np_out = forward(arr_data.asnumpy())
            out_grad = backward(np_out, arr_label.asnumpy().reshape(np_out.shape)) / shape[1]
            assert_almost_equal(out_exec.outputs[0].asnumpy(), np_out, atol=atol)
            assert_almost_equal(grad_map["data"].asnumpy(), out_grad, atol=atol)

    shape = (50, 30)

    check_regression(mx.symbol.LogisticRegressionOutput,
                     lambda x: 1.0 / (1.0 + np.exp(-x)),
                     lambda x, y: x - y,
                     shape)
    check_regression(mx.symbol.LinearRegressionOutput,
                     lambda x: x,
                     lambda x, y: x - y,
                     shape)
    check_regression(mx.symbol.MAERegressionOutput,
                     lambda x: x,
                     lambda x, y: np.where(x > y, np.ones(x.shape), -np.ones(x.shape)),
                     shape)
    check_regression(mx.symbol.LogisticRegressionOutput,
                     lambda x: 1.0 / (1.0 + np.exp(-x)),
                     lambda x, y: x - y,
                     shape, stype='csr')
    check_regression(mx.symbol.LinearRegressionOutput,
                     lambda x: x,
                     lambda x, y: x - y,
                     shape, stype='csr')

def check_softmax_grad(xpu):
    x = mx.sym.Variable('x')
    label = mx.sym.Variable('label')
    x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=xpu)
    grad_x = mx.nd.zeros((1,4), ctx=xpu)
    label_nd = mx.nd.array([1], ctx=xpu)

    sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False)
    ex = sym.bind(ctx=xpu, args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x})

    ex.forward(is_train=True)
    softmax_out = ex.outputs[0].asnumpy()
    expected_softmax_out = [[0.005806628, 0.861780069, 0.116629249, 0.015784052]]
    assert np.isclose(softmax_out, expected_softmax_out).all()

    ex.backward(is_train=True)
    grad_out = ex.grad_arrays[0].asnumpy()
    k = int(label_nd[0].asscalar())
    expected_grad_out = np.zeros((1,4))
    expected_grad_out[0, k] = -1
    assert np.isclose(grad_out - softmax_out, expected_grad_out).all()
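
# NOTE: the hard-coded expected_softmax_out above is simply softmax([1, 6, 4, 2]).
# As a sanity check (shown as a comment, not executed):
#   z = np.exp(np.array([1., 6., 4., 2.]) - 6.0)
#   z / z.sum()  ->  [0.00580663, 0.86178007, 0.11662925, 0.01578405]
# SoftmaxOutput's backward is softmax(x) - one_hot(label), hence the -1 at the
# label index in expected_grad_out.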

def check_smoothed_softmax_grad(xpu):
    alpha = 0.2
    x = mx.sym.Variable('x')
    label = mx.sym.Variable('label')
    x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=xpu)
    grad_x = mx.nd.zeros((1,4), ctx=xpu)
    label_nd = mx.nd.array([1], ctx=xpu)

    sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False, smooth_alpha=alpha)
    ex = sym.bind(ctx=xpu, args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x})

    ex.forward(is_train=True)
    softmax_out = ex.outputs[0].asnumpy()
    expected_softmax_out = [[0.005806628, 0.861780069, 0.116629249, 0.015784052]]
    assert np.isclose(softmax_out, expected_softmax_out).all()

    ex.backward(is_train=True)
    grad_out = ex.grad_arrays[0].asnumpy()
    k = int(label_nd[0].asscalar())
    expected_grad_out = np.full((1,4), fill_value=-alpha/float(4-1))
    expected_grad_out[0, k] = - (1 - alpha)
    assert np.isclose(grad_out - softmax_out, expected_grad_out).all()

def check_softmax_with_ignore_label(xpu):
    X = mx.symbol.Variable('X')
    L = mx.symbol.Variable('L')
    Y = mx.symbol.SoftmaxOutput(data=X, label=L, ignore_label=0, use_ignore=True)

    shape = (20, 10)
    x = mx.nd.empty(shape, ctx=xpu)
    l = mx.nd.empty((shape[0],), ctx=xpu)
    x_np = np.random.rand(*shape)
    l_np = np.random.randint(0, shape[1]-1, (shape[0],))
    x[:] = x_np
    l[:] = l_np

    grad = mx.nd.empty(shape, ctx=xpu)

    exec1 = Y.bind(xpu, args=[x, l], args_grad={'X': grad})
    exec1.forward(is_train=True)
    exec1.backward()

    grad0 = grad.asnumpy()

    for i in range(int(shape[0]/2)):
        l_np[i] = 0
    l[:] = l_np

    exec1.forward(is_train=True)
    exec1.backward()
    grad1 = grad.asnumpy()

    assert abs(np.sum(grad1[:int(shape[0]/2)])) < 1e-5
    assert_almost_equal(grad0[int(shape[0]/2):], grad1[int(shape[0]/2):])

def check_softmax_with_shape(shape, xpu, preserve_shape=False):
    # bind with label
    X = mx.symbol.Variable('X')
    L = mx.symbol.Variable('L')
    Y = mx.symbol.SoftmaxOutput(data=X, label=L, preserve_shape=preserve_shape)
    x = mx.random.uniform(-1, 1, shape, ctx=xpu)
    l = mx.random.uniform(-1, 1, shape, ctx=xpu)
    l[:] = np_softmax(l.asnumpy())
    grad = mx.nd.empty(shape, ctx=xpu)
    exec1 = Y.bind(xpu, args=[x, l], args_grad={'X': grad})
    exec1.forward(is_train=True)
    out = exec1.outputs[0].asnumpy()
    # Non-zero atol required by test_softmax with seed 781663739
    rtol = 1e-4
    atol = 1e-6
    assert_almost_equal(out, np_softmax(x.asnumpy()), rtol=rtol, atol=atol)
    exec1.backward()
    assert_almost_equal(grad.asnumpy(), np_softmax(x.asnumpy()) - l.asnumpy(), rtol=rtol, atol=atol)

def test_python_op():
    X = mx.symbol.Variable('X')
    op = mx.operator.NumpyOp()
    s = op.get_symbol(X, name='numpy_op')

    x = mx.ndarray.ones((10))*10
    dx = mx.ndarray.zeros((10))
    dy = mx.ndarray.ones((10))
    exec1 = s.bind(default_context(), args=[x], args_grad={'X': dx})
    exec1.forward(is_train=True)
    assert_almost_equal(x.asnumpy(), exec1.outputs[0].asnumpy())
    exec1.backward(dy)
    assert_almost_equal(dy.asnumpy(), dx.asnumpy())

def test_swapaxes():
    data = mx.symbol.Variable('data')
    shape = (2, 3, 4)
    data_tmp = np.ones(shape)
    data_tmp[0] = 1
    data_tmp[1] = 2
    arr_data = mx.nd.array(data_tmp)
    swap0 = mx.symbol.SwapAxis(data=data, dim1=0, dim2=2)
    swap = mx.symbol.SwapAxis(data=swap0, dim1=1, dim2=2)
    exe_c = swap.bind(default_context(), args=[arr_data])
    exe_c.forward(is_train=True)
    out = exe_c.outputs[0].asnumpy()

    swap0_ = np.swapaxes(data_tmp, 0, 2)
    swap_ = np.swapaxes(swap0_, 1, 2)

    assert_almost_equal(out, swap_)

@with_seed()
def test_scalarop():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)*5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3

    test = 2 / (4-((1+data+1)*2/5)-0.8-(data!=0))

    npout_1 = (4-((1+data_tmp+1)*2/5)-0.8-(data_tmp!=0))
    npout = 2/npout_1

    check_symbolic_forward(test, [data_tmp], [npout])

    npout_grad = 2.*2/5
    npout_grad = 2*npout_grad/(npout_1*npout_1)

    check_symbolic_backward(test, [data_tmp], [np.ones(shape)*2], [npout_grad])
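
# NOTE: in test_scalarop above, data_tmp is 5 everywhere, so (data != 0) is 1 and
# the expression reduces to 2 / (1.4 - 0.4*x). Its derivative is
# 0.8 / (1.4 - 0.4*x)**2 = 0.8 / npout_1**2; multiplied by the head gradient of 2
# passed to check_symbolic_backward this gives 1.6 / npout_1**2, which is exactly
# what npout_grad computes.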

@with_seed()
def test_scalar_pow():
    data = mx.symbol.Variable('data')
    shape = (1, 1)
    data_tmp = np.ones(shape)
    test = data ** 2
    check_numeric_gradient(test, [data_tmp])
    check_symbolic_forward(test, [data_tmp], [data_tmp ** 2])
    check_symbolic_backward(test, [data_tmp], [np.ones(shape)], [2 * data_tmp])

@with_seed()
def test_symbol_pow():
    shape = (1, 1)

    data = mx.symbol.Variable('data')
    data_tmp = np.ones(shape)*2

    exp = mx.symbol.Variable('exp')
    exp_tmp = np.ones(shape)*3

    test = data**exp

    check_numeric_gradient(test, [data_tmp, exp_tmp])
    check_symbolic_forward(test, [data_tmp, exp_tmp], [data_tmp**exp_tmp])

    data_dir = data_tmp**(exp_tmp - 1) * exp_tmp
    exp_dir = data_tmp**(exp_tmp) * np.log(data_tmp)
    check_symbolic_backward(test, [data_tmp, exp_tmp], [np.ones(shape)], [data_dir, exp_dir])

@with_seed()
def test_pow_fn():
    shape = (3, 4)
    exp = mx.symbol.Variable("exp")
    y = mx.sym.pow(2, exp)
    x = np.ones(shape)*3
    check_numeric_gradient(y, [x], numeric_eps=1E-3)
    check_symbolic_forward(y, [x], [2**x])
    check_symbolic_backward(y, [x], [np.ones(shape)], [np.log(2) * 2**x])

@with_seed()
def test_relu():
    def frelu(x):
        return np.maximum(x, 0.0)
    def frelu_grad(x):
        return 1.0 * (x > 0.0)
    shape = (3, 4)
    x = mx.symbol.Variable("x")
    y = mx.sym.relu(x)
    xa = np.random.uniform(low=-1.0, high=1.0, size=shape)
    eps = 1e-4
    # Avoid finite difference method inaccuracies due to discontinuous gradient at the origin.
    # Here we replace small problematic inputs with 1.0. Repro issue with seed 97264195.
    xa[abs(xa) < eps] = 1.0
    ya = frelu(xa)
    ga = frelu_grad(xa)
    check_numeric_gradient(y, [xa], numeric_eps=eps)
    check_symbolic_forward(y, [xa], [ya])
    check_symbolic_backward(y, [xa], [np.ones(shape)], [ga])

# NOTE(haojin2): Skipping the numeric check tests for float16 data type due to precision issues,
# the analytical checks are still performed on each and every data type to verify the correctness.
@with_seed()
def test_leaky_relu():
    def fleaky_relu(x, act_type, slope=0.25):
        neg_indices = x < 0
        out = x.copy()
        if act_type == 'elu':
            out[neg_indices] = slope * np.expm1(out[neg_indices])
        elif act_type == 'leaky':
            out[neg_indices] = slope * out[neg_indices]
        return out
    def fleaky_relu_grad(grad, x, y, act_type, slope=0.25):
        neg_indices = x < 0
        out = np.ones(x.shape)
        if act_type == 'elu':
            out[neg_indices] = y[neg_indices] + slope
        elif act_type == 'leaky':
            out[neg_indices] = slope
        return out * grad
    for ndim in range(1, 4):
        shape = rand_shape_nd(ndim)
        x = mx.symbol.Variable("x")
        slp = 0.25
        for dtype in [np.float16, np.float32, np.float64]:
            xa = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(dtype)
            eps = 1e-4
            rtol = 1e-2
            atol = 1e-3
            xa[abs(xa) < eps] = 1.0
            for act_type in ['elu', 'leaky']:
                y = mx.symbol.LeakyReLU(data=x, slope=slp, act_type=act_type)
                ya = fleaky_relu(xa, slope=slp, act_type=act_type)
                ga = fleaky_relu_grad(np.ones(shape), xa, ya, slope=slp, act_type=act_type)
                # Skip numeric check for float16 type to get rid of flaky behavior
                if dtype is not np.float16:
                    check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
                check_symbolic_forward(y, [xa], [ya], rtol=rtol, atol=atol, dtype=dtype)
                check_symbolic_backward(y, [xa], [np.ones(shape)], [ga], rtol=rtol, atol=atol, dtype=dtype)
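
# NOTE: the ELU branch of fleaky_relu_grad above uses the identity
# d/dx [slope * (exp(x) - 1)] = slope * exp(x) = elu(x) + slope for x < 0, which
# lets the gradient be written in terms of the forward output y instead of
# recomputing exp(x). fselu_grad further below uses a similar output-based form.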

# NOTE(haojin2): Skipping the numeric check tests for float16 data type due to precision issues,
# the analytical checks are still performed on each and every data type to verify the correctness.
@with_seed()
@unittest.skip("Flaky test tracked by https://github.com/apache/incubator-mxnet/issues/12885")
def test_prelu():
    def fprelu(x, gamma):
        pos_indices = x > 0
        out = x.copy()
        if len(x.shape) == 4:
            out = out.transpose(2,3,0,1)
            out = np.multiply(out, gamma)
            out = out.transpose(2,3,0,1)
        else:
            out = np.multiply(out, gamma)
        out[pos_indices] = x[pos_indices]
        return out
    def fprelu_grad(x, y, gamma):
        pos_indices = x > 0
        if len(x.shape) == 4:
            grad_x = np.multiply(np.ones(x.shape).transpose(2,3,0,1), gamma)
            grad_x = grad_x.transpose(2,3,0,1)
        else:
            grad_x = np.multiply(np.ones(x.shape), gamma)
        grad_gam = np.zeros(gamma.shape)
        copy_x = x.copy()
        copy_x[pos_indices] = 0.0
        grad_x[pos_indices] = 1.0
        if len(gamma.shape) > 1 and len(x.shape) != 4:
            grad_gam = copy_x
        elif len(gamma.shape) > 1 and len(x.shape) == 4:
            grad_gam = np.sum(copy_x, axis=(2,3))
        elif gamma.shape[0] == 1:
            grad_gam = np.sum(np.sum(copy_x))
        elif gamma.shape[0] > 1 and len(x.shape) != 4:
            grad_gam = np.sum(copy_x, axis=0)
        elif gamma.shape[0] > 1 and len(x.shape) == 4:
            grad_gam = np.sum(copy_x, axis=(0,2,3))
        return (grad_x, grad_gam)
    x = mx.symbol.Variable("x")
    gamma = mx.symbol.Variable("gamma")
    for shape in [(3,4), (3,4,4,5)]:
        for dtype in [np.float16, np.float32, np.float64]:
            for gam in [np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype)]:
                gam_full = np.array([gam, gam, gam])
                xa = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(dtype)
                rtol = 1e-2
                atol = 1e-3
                eps = 1e-4
                xa[abs(xa) < eps] = 1.0
                y = mx.symbol.LeakyReLU(data=x, gamma=gamma, act_type='prelu')
                ya = fprelu(xa, gam)
                ya_full = fprelu(xa, gam_full)
                g_xa, g_gam = fprelu_grad(xa, ya, gamma=gam)
                g_xa_full, g_gam_full = fprelu_grad(xa, ya_full, gamma=gam_full)
                # Skip numeric check for float16 type to get rid of flaky behavior
                if dtype is not np.float16:
                    check_numeric_gradient(y, [xa, gam], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
                    check_numeric_gradient(y, [xa, gam_full], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
                check_symbolic_forward(y, [xa, gam], [ya], rtol=rtol, atol=atol, dtype=dtype)
                check_symbolic_backward(y, [xa, gam], [np.ones(shape), np.ones(gam.shape)],
                                        [g_xa, g_gam], rtol=rtol, atol=atol, dtype=dtype)
                check_symbolic_forward(y, [xa, gam_full], [ya_full], rtol=rtol, atol=atol, dtype=dtype)
                check_symbolic_backward(y, [xa, gam_full], [np.ones(shape), np.ones(gam_full.shape)],
                                        [g_xa_full, g_gam_full], rtol=rtol, atol=atol, dtype=dtype)

@with_seed()
def test_selu():
    alpha = 1.6732632423543772848170429916717
    lamb = 1.0507009873554804934193349852946
    def fselu(x):
        neg_indices = x < 0
        out = x.copy()
        out[neg_indices] = alpha * np.expm1(out[neg_indices])
        return out * lamb
    def fselu_grad(grad, x, y):
        neg_indices = x < 0
        out = np.ones(x.shape).astype(x.dtype)
        out[neg_indices] = y[neg_indices] + alpha
        return out * lamb

    shape = (3, 4)
    x = mx.sym.Variable("x")
    y = mx.sym.LeakyReLU(data=x, act_type="selu")
    for dtype in [np.float16, np.float32, np.float64]:
        xa = np.random.uniform(low=-0.1, high=0.1, size=shape).astype(dtype)
        eps, rtol, atol = (7.5e-4, 1e-1, 1e-2) if dtype is np.float16 else (1e-4, 1e-2, 1e-4)
        if dtype is np.float16:
            xa /= 10.0
        xa[abs(xa) < eps] = 0.01
        ya = fselu(xa)
        ga = fselu_grad(np.ones(shape).astype(dtype), xa, ya)
        check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
        check_symbolic_forward(y, [xa], [ya], rtol=rtol, atol=atol, dtype=dtype)
        check_symbolic_backward(y, [xa], [np.ones(shape)], [ga], rtol=rtol, atol=atol, dtype=dtype)

@with_seed()
def test_sigmoid():
    def fsigmoid(a):
        return np.divide(1.0, (1.0 + np.exp(-a)))
    shape = (3, 4)
    x = mx.symbol.Variable("x")
    y = mx.sym.sigmoid(x)
    xa = np.random.uniform(low=-1.0, high=1.0, size=shape)
    ya = fsigmoid(xa)
    check_numeric_gradient(y, [xa], numeric_eps=1E-3)
    check_symbolic_forward(y, [xa], [ya])
    check_symbolic_backward(y, [xa], [np.ones(shape)], [ya * (1 - ya)])

@with_seed()
def test_shape_array():
    for i in range(1,6):
        shape = rand_shape_nd(i)
        x = mx.sym.var('x')
        y = mx.sym.shape_array(x)
        xa = mx.nd.array(np.random.ranf(shape))
        xg = mx.nd.empty(xa.shape)
        ya = np.shape(xa)
        yg = mx.nd.ones(ya)
        exe = y.bind(ctx=default_context(), args={'x': xa}, args_grad={'x': xg})
        exe.forward(is_train=True)
        exe.backward([yg])
        yo = exe.outputs[0].asnumpy()
        same(yo, ya)
        assert_almost_equal(xg.asnumpy(), np.zeros_like(xg.asnumpy()))

@with_seed()
def test_size_array():
    for i in range(1,6):
        shape = rand_shape_nd(i)
        x = mx.sym.var('x')
        y = mx.sym.size_array(x)
        xa = mx.nd.array(np.random.ranf(shape))
        xg = mx.nd.empty(xa.shape)
        ya = np.size(xa)
        yg = mx.nd.ones(ya)
        exe = y.bind(ctx=default_context(), args={'x': xa}, args_grad={'x': xg})
        exe.forward(is_train=True)
        exe.backward([yg])
        yo = exe.outputs[0].asnumpy()
        same(yo, ya)
        assert_almost_equal(xg.asnumpy(), np.zeros_like(xg.asnumpy()))

@with_seed()
def test_hard_sigmoid():
    def fhardsigmoid(a, alpha=0.2, beta=0.5):
        return np.maximum(np.zeros(a.shape, dtype=a.dtype),
                          np.minimum(np.ones(a.shape, dtype=a.dtype), alpha*a+beta))
    def fhardsigmoid_grad(a, out_grad, alpha=0.2, beta=0.5):
        orig_out = fhardsigmoid(a, alpha, beta)
        res = out_grad * alpha
        res[orig_out <= 0.0] = 0.0
        res[orig_out >= 1.0] = 0.0
        return res
    shape = (3, 4)
    x = mx.symbol.Variable("x")
    y = mx.sym.hard_sigmoid(x)
    for dtype in [np.float16, np.float32, np.float64]:
        if dtype is np.float16:
            rtol = 1e-2
        else:
            rtol = 1e-3
        atol = 1e-3
        eps = 1e-3
        xa = np.random.uniform(low=-3.0, high=3.0, size=shape).astype(dtype)
        # function not differentiable at x=2.5 and -2.5
        xa[abs(xa-2.5) < eps] -= 2 * eps
        xa[abs(xa+2.5) < eps] += 2 * eps
        ya = fhardsigmoid(xa)
        grad_xa = fhardsigmoid_grad(xa, np.ones(shape))
        if dtype is not np.float16:
            check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
        check_symbolic_forward(y, [xa], [ya], rtol=rtol, atol=atol, dtype=dtype)
        check_symbolic_backward(y, [xa], [np.ones(shape)], [grad_xa], rtol=rtol, atol=atol, dtype=dtype)

@with_seed()
def test_softsign():
    def fsoftsign(a):
        return np.divide(a, (1.0 + np.abs(a)))
    def fsoftsign_grad(a):
        return np.divide(1.0, np.square((1.0 + np.abs(a))))
    shape = (3, 4)
    x = mx.symbol.Variable("x")
    y = mx.sym.softsign(x)
    xa = np.random.uniform(low=-1.0, high=1.0, size=shape)
    ya = fsoftsign(xa)
    ya_grad = fsoftsign_grad(xa)
    check_numeric_gradient(y, [xa], numeric_eps=1E-3)
    check_symbolic_forward(y, [xa], [ya])
    check_symbolic_backward(y, [xa], [np.ones(shape)], [ya_grad])

@with_seed()
def test_binary_logic():
    def _inner_test(forward_gt, logic_sym, x_shape, y_shape, test_scalar=True):
        x = mx.symbol.Variable("x")
        y = mx.symbol.Variable("y")
        z = logic_sym(x, y)
        x_npy = np.random.randint(0, 4, size=x_shape).astype(np.float32)
        y_npy = np.random.randint(0, 4, size=y_shape).astype(np.float32)
        exe = z.simple_bind(ctx=default_context(), x=x_shape, y=y_shape)
        mx_out = exe.forward(is_train=True, x=x_npy, y=y_npy)[0].asnumpy()
        assert_almost_equal(mx_out, forward_gt(x_npy, y_npy))
        exe.backward()
        if test_scalar:
            z_lscalar = logic_sym(1, y)
            z_rscalar = logic_sym(x, 1)
            exe_lscalar = z_lscalar.simple_bind(ctx=default_context(), y=y_shape)
            exe_rscalar = z_rscalar.simple_bind(ctx=default_context(), x=x_shape)
            mx_lscalar_out = exe_lscalar.forward(is_train=True, y=y_npy)[0].asnumpy()
            mx_rscalar_out = exe_rscalar.forward(is_train=True, x=x_npy)[0].asnumpy()
            assert_almost_equal(mx_lscalar_out, forward_gt(1, y_npy))
            assert_almost_equal(mx_rscalar_out, forward_gt(x_npy, 1))
            exe_lscalar.backward()
            exe_rscalar.backward()
    # Test the no-broadcasting binary logic ops + scalar logic ops
    _inner_test(forward_gt=lambda x, y: x == y,
                logic_sym=lambda x, y: x == y, x_shape=(10, 10), y_shape=(10, 10))
    _inner_test(forward_gt=lambda x, y: x > y,
                logic_sym=lambda x, y: x > y, x_shape=(10, 10), y_shape=(10, 10))
    _inner_test(forward_gt=lambda x, y: x >= y,
                logic_sym=lambda x, y: x >= y, x_shape=(10, 10), y_shape=(10, 10))
    _inner_test(forward_gt=lambda x, y: x < y,
                logic_sym=lambda x, y: x < y, x_shape=(10, 10), y_shape=(10, 10))
    _inner_test(forward_gt=lambda x, y: x <= y,
                logic_sym=lambda x, y: x <= y, x_shape=(10, 10), y_shape=(10, 10))
    _inner_test(forward_gt=lambda x, y: x != y,
                logic_sym=lambda x, y: x != y, x_shape=(10, 10), y_shape=(10, 10))
    # Test the broadcasting binary logic ops
    _inner_test(forward_gt=lambda x, y: x == y,
                logic_sym=lambda x, y: mx.sym.broadcast_equal(x, y),
                x_shape=(1, 10), y_shape=(10, 1), test_scalar=False)
    _inner_test(forward_gt=lambda x, y: x > y,
                logic_sym=lambda x, y: mx.sym.broadcast_greater(x, y),
                x_shape=(1, 10), y_shape=(10, 1), test_scalar=False)
    _inner_test(forward_gt=lambda x, y: x >= y,
                logic_sym=lambda x, y: mx.sym.broadcast_greater_equal(x, y),
                x_shape=(1, 10), y_shape=(10, 1), test_scalar=False)
    _inner_test(forward_gt=lambda x, y: x < y,
                logic_sym=lambda x, y: mx.sym.broadcast_lesser(x, y),
                x_shape=(1, 10), y_shape=(10, 1), test_scalar=False)
    _inner_test(forward_gt=lambda x, y: x <= y,
                logic_sym=lambda x, y: mx.sym.broadcast_lesser_equal(x, y),
                x_shape=(1, 10), y_shape=(10, 1), test_scalar=False)
    _inner_test(forward_gt=lambda x, y: x != y,
                logic_sym=lambda x, y: mx.sym.broadcast_not_equal(x, y),
                x_shape=(1, 10), y_shape=(10, 1), test_scalar=False)

@with_seed()
def test_unary_logic():
    def reference(a, dtype):
        return np.logical_not(a).astype(dtype)
    shape = (3, 4)
    xa = np.random.randint(-2, 2, size=shape).astype(np.float32)
    mx_xa = mx.nd.array(xa)
    mx_out = mx.nd.logical_not(mx_xa)
    assert_almost_equal(mx_out.asnumpy(), reference(xa, dtype=xa.dtype))
    x = mx.sym.Variable('x')
    y = mx.sym.logical_not(data=x)
    exe = y.simple_bind(ctx=default_context(), x=shape)
    sym_out = exe.forward(is_train=True, x=mx_xa)[0]
    assert_almost_equal(sym_out.asnumpy(), reference(xa, dtype=xa.dtype))

@with_seed()
def test_embedding():
    in_dim = 10
    out_dim = 4
    batch = 24

    data = mx.sym.Variable("data")
    embed = mx.sym.Embedding(data=data, input_dim=in_dim, output_dim=out_dim, name="embed")
    exe_test = embed.simple_bind(default_context(), grad_req={'data': 'null', 'embed_weight': 'write'}, data=(batch,))
    arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays))
    grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays))
    np_data = np.random.randint(low=0, high=in_dim, size=batch)
    np_weight = np.random.uniform(-0.01, 0.01, arg_map["embed_weight"].shape)
    np_onehot = np.zeros((batch, in_dim))
    np_onehot[np.arange(batch), np_data] = 1.0
    # forward
    arg_map["data"][:] = np_data
    arg_map["embed_weight"][:] = np_weight
    exe_test.forward(is_train=True)
    # Non-zero atol required, as exposed by seed 781663739
    rtol = 1e-5
    atol = 1e-5
    assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, np_weight), rtol=rtol, atol=atol)
    # backward
    np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape)
    grad = mx.nd.zeros(np_grad.shape)
    grad[:] = np_grad
    exe_test.backward([grad])
    assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, np_grad), rtol=rtol, atol=atol)

# check ops handle duplicate input correctly.
@with_seed()
def test_binary_op_duplicate_input():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3
    out_grad = mx.nd.empty(shape)
    out_grad[:] = 1
    square = data * data
    exe_square = square.bind(default_context(), args=[arr_data], args_grad=[arr_grad])
    exe_square.forward(is_train=True)
    assert_almost_equal(exe_square.outputs[0].asnumpy(), data_tmp * data_tmp)
    exe_square.backward(out_grad)
    assert_almost_equal(arr_grad.asnumpy(), 2.0 * data_tmp)

@with_seed()
def test_sign():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3

    test = mx.sym.sign(data)
    exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad])
    exe_test.forward(is_train=True)
    out = exe_test.outputs[0].asnumpy()
    npout = np.sign(data_tmp)
    assert_almost_equal(out, npout)

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    npout_grad = out_grad.asnumpy()
    npout_grad = 0
    exe_test.backward(out_grad)
    assert_almost_equal(arr_grad.asnumpy(), npout_grad)

@with_seed()
def test_round_ceil_floor():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5.543
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 2

    test = mx.sym.round(data) + mx.sym.ceil(data) + mx.sym.floor(data)
    exe_test = test.bind(default_context(), args=[arr_data])
    exe_test.forward(is_train=True)
    out = exe_test.outputs[0].asnumpy()
    npout = np.round(data_tmp) + np.ceil(data_tmp) + np.floor(data_tmp)
    assert_almost_equal(out, npout)

@with_seed()
def test_trunc():
    data_tmp = np.random.rand(3, 4) * 10 - 5
    arr_data = mx.nd.array(data_tmp)
    data = mx.symbol.Variable('data')
    test = mx.sym.trunc(data)

    exe_test = test.bind(default_context(), args=[arr_data])
    exe_test.forward(is_train=True)
    out = exe_test.outputs[0].asnumpy()
    # 'trunc' is sensitive to the precision of the calculation. Force numpy to match mxnet's float32.
    # Repro issue with seed 1660190454
    npout = np.trunc(np.float32(data_tmp))

    assert_almost_equal(out, npout)

@with_seed()
def test_rsqrt_cos_sin():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3

    test = mx.sym.rsqrt(data) + mx.sym.cos(data) + mx.sym.sin(data)
    exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad])
    exe_test.forward(is_train=True)
    out = exe_test.outputs[0].asnumpy()
    npout = 1 / np.sqrt(data_tmp) + np.cos(data_tmp) + np.sin(data_tmp)
    assert_almost_equal(out, npout)

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    npout_grad = out_grad.asnumpy()
    npout_grad = npout_grad * -(1.0 / (2.0 * data_tmp * np.sqrt(data_tmp))) + npout_grad * -1 * np.sin(data_tmp) + npout_grad * np.cos(data_tmp)
    exe_test.backward(out_grad)
    assert_almost_equal(arr_grad.asnumpy(), npout_grad)

@with_seed()
def test_maximum_minimum():
    data1 = mx.symbol.Variable('data1')
    data2 = mx.symbol.Variable('data2')
    shape = (3, 4)
    data_tmp1 = np.random.rand(3,4)
    data_tmp2 = np.random.rand(3,4)
    data_tmp1[:] = 2
    data_tmp2[:] = 3

    arr_data1 = mx.nd.array(data_tmp1)
    arr_data2 = mx.nd.array(data_tmp2)

    arr_grad1 = mx.nd.empty(shape)
    arr_grad2 = mx.nd.empty(shape)

    test = mx.sym.maximum(data1, data2) + mx.sym.minimum(data1, data2)
    exe_test = test.bind(default_context(), args=[arr_data1, arr_data2], args_grad=[arr_grad1, arr_grad2])
    exe_test.forward(is_train=True)
    out = exe_test.outputs[0].asnumpy()
    npout = np.maximum(data_tmp1, data_tmp2) + np.minimum(data_tmp1, data_tmp2)
    assert_almost_equal(out, npout)

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    exe_test.backward(out_grad)

    npout_grad = np.ones(shape)
    npout_grad[:] = 2
    mask1 = (data_tmp1 > data_tmp2).astype('float')
    mask2 = (data_tmp1 < data_tmp2).astype('float')
    npout_grad1 = npout_grad * mask1 + npout_grad * mask2
    npout_grad2 = (npout_grad - npout_grad * mask1) + (npout_grad - npout_grad * mask2)

    assert_almost_equal(arr_grad1.asnumpy(), npout_grad1)
    assert_almost_equal(arr_grad2.asnumpy(), npout_grad2)

@with_seed()
def test_maximum_minimum_scalar():
    data1 = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp1 = np.random.rand(3,4)
    data_tmp1[:] = 2

    arr_data1 = mx.nd.array(data_tmp1)
    arr_grad1 = mx.nd.empty(shape)

    test = mx.sym.maximum(data1, 3) + mx.sym.maximum(9, data1) + mx.sym.minimum(5, data1) + mx.sym.minimum(data1, 4)
    exe_test = test.bind(default_context(), args=[arr_data1], args_grad=[arr_grad1])
    exe_test.forward(is_train=True)
    out = exe_test.outputs[0].asnumpy()
    npout = np.maximum(data_tmp1, 3) + np.maximum(9, data_tmp1) + np.minimum(5, data_tmp1) + np.minimum(data_tmp1, 4)
    assert_almost_equal(out, npout)

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    exe_test.backward(out_grad)

    npout_grad = np.ones(shape)
    npout_grad[:] = 2
    mask1 = (data_tmp1 > 3).astype('float')
    mask2 = (9 > data_tmp1).astype('float')
    mask3 = (5 < data_tmp1).astype('float')
    mask4 = (data_tmp1 < 4).astype('float')
    npout_grad1 = npout_grad * mask1 + (npout_grad - npout_grad * mask2) + (npout_grad - npout_grad * mask3) + npout_grad * mask4

    assert_almost_equal(arr_grad1.asnumpy(), npout_grad1)
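
# NOTE: in the two maximum/minimum tests above, each elementwise max/min passes
# the head gradient only to the selected input. In test_maximum_minimum,
# data_tmp1 is 2 and data_tmp2 is 3 everywhere, so data1 is picked by minimum and
# data2 by maximum; the mask arithmetic simply encodes that routing (and likewise
# against the scalar bounds in test_maximum_minimum_scalar).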

@with_seed()
def test_abs():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3

    test = mx.sym.abs(data)
    exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad])
    exe_test.forward(is_train=True)
    out = exe_test.outputs[0].asnumpy()
    npout = abs(data_tmp)
    assert_almost_equal(out, npout)

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    npout_grad = out_grad.asnumpy()
    npout_grad = npout_grad * np.sign(data_tmp)
    exe_test.backward(out_grad)
    assert_almost_equal(arr_grad.asnumpy(), npout_grad)

def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride, pad):
    """Configuration A: input --> conv --> deconv --> output.

    The convolution and the deconvolution use matching parameters, which ensures
    that the output shape equals the input shape, and they share the same weights.
    Because the deconvolution's backward pass w.r.t. its data is the matching
    convolution's forward pass (and vice versa), feeding the same values through
    forward() and backward() must produce the same result.
    """
    assert input_shape[1] == num_filter
    data = mx.sym.Variable(name="data")
    conv = mx.sym.Convolution(
        data=data, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="conv")
    deconv = mx.sym.Deconvolution(
        data=conv, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="deconv")

    arg_names = deconv.list_arguments()
    arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape)
    input_data = mx.random.uniform(-5, 5, input_shape, ctx=mx.cpu()).copyto(default_context())
    out_grad = input_data
    args = {}
    args["data"] = input_data
    args['conv_weight'] = args['deconv_weight'] = mx.random.normal(0, 1,
        (num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_context())
    args_grad = [mx.nd.empty(s) for s in arg_shapes]

    exe = deconv.bind(default_context(), args=args, args_grad=args_grad)
    exe.forward(is_train=True)
    out = exe.outputs[0].asnumpy()
    exe.backward(out_grad)
    assert_almost_equal(out, args_grad[0].asnumpy(), rtol=1E-3, atol=1e-3)

    args_grad_addto_npy = [np.random.normal(size=s) for s in arg_shapes]
    args_grad_addto = [mx.nd.array(ele) for ele in args_grad_addto_npy]
    exe = deconv.bind(default_context(), args=args, args_grad=args_grad_addto, grad_req="add")
    exe.forward(is_train=True)
    out = exe.outputs[0].asnumpy()
    exe.backward(out_grad)
    assert_almost_equal(out + args_grad_addto_npy[0], args_grad_addto[0].asnumpy(), rtol=1e-3, atol=1e-3)

def check_deconvolution_gradient(input_shape, num_filter, pad):
    """Configuration A: input --> conv --> output.
    Configuration B: input --> deconv --> output.

    The convolution and the deconvolution use matching parameters, which ensures
    that the output shape equals the input shape. During backward(), if the input
    of A equals the output of B and the output of A equals the input of B, then
    the weight gradients of the two configurations must be the same.
    """
    ndim = len(pad)
    stride = (1,) * ndim
    kernel = tuple(2 * np.array(pad) + 1)
    data_conv = mx.sym.Variable(name="data_conv")
    conv = mx.sym.Convolution(
        data=data_conv, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="conv")
    data_deconv = mx.sym.Variable(name="data_deconv")
    deconv = mx.sym.Deconvolution(
        data=data_deconv, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="deconv")

    conv_data = mx.random.uniform(-5, 5, input_shape, ctx=mx.cpu()).copyto(default_context())
    conv_args = {}
    conv_args["data_conv"] = conv_data
    conv_args['conv_weight'] = \
        mx.random.normal(0, 1, (num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_context())
    conv_args_grad = [mx.nd.zeros(conv_data.shape),
                      mx.nd.zeros((num_filter, input_shape[1]) + kernel)]
    exe_conv = conv.bind(default_context(), args=conv_args, args_grad=conv_args_grad)
    exe_conv.forward(is_train=True)
    conv_out_grad = mx.random.normal(0, 2, exe_conv.outputs[0].shape, ctx=mx.cpu()).copyto(default_context())
    exe_conv.backward(conv_out_grad)

    deconv_data = conv_out_grad
    deconv_args = {}
    deconv_args['data_deconv'] = deconv_data
    deconv_args['deconv_weight'] = conv_args['conv_weight']
    deconv_args_grad = [mx.nd.zeros(deconv_data.shape),
                        mx.nd.zeros((num_filter, input_shape[1]) + kernel)]
    deconv_addto_args_grad_npy = [np.random.normal(size=deconv_data.shape),
                                  np.random.normal(size=(num_filter, input_shape[1]) + kernel)]
    deconv_addto_args_grad = [mx.nd.array(deconv_addto_args_grad_npy[0]),
                              mx.nd.array(deconv_addto_args_grad_npy[1])]
    exe_deconv = deconv.bind(default_context(), args=deconv_args, args_grad=deconv_args_grad)
    exe_deconv.forward(is_train=True)
    deconv_out_grad = conv_data[:]
    exe_deconv.backward(deconv_out_grad)
    assert_almost_equal(conv_args_grad[1].asnumpy(), deconv_args_grad[1].asnumpy(), rtol=1e-3, atol=1e-2)
    # Test AddTo
    exe_deconv_addto = deconv.bind(default_context(), args=deconv_args,
                                   args_grad=deconv_addto_args_grad, grad_req="add")
    exe_deconv_addto.forward(is_train=True)
    deconv_out_grad = conv_data[:]
    exe_deconv_addto.backward(deconv_out_grad)
    assert_almost_equal(conv_args_grad[1].asnumpy() + deconv_addto_args_grad_npy[1],
                        deconv_addto_args_grad[1].asnumpy(), rtol=1e-3, atol=1e-2)

def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, target_shape=None):
    data = mx.sym.Variable(name="data")
    if target_shape:
        deconv = mx.sym.Deconvolution(
            data=data, kernel=kernel, stride=stride, pad=pad, adj=adj, num_filter=5,
            target_shape=target_shape)
    else:
        deconv = mx.sym.Deconvolution(
            data=data, kernel=kernel, stride=stride, pad=pad, adj=adj, num_filter=5)
    arg_names = deconv.list_arguments()
    arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape)
    default_target_size = 8
    if target_shape is None:
        target_shape = (default_target_size,) * len(kernel)
    assert out_shapes[0] == (input_shape[0], 5) + target_shape
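
# NOTE: about the default_target_size of 8 used above: a deconvolution output
# edge is (in - 1) * stride - 2 * pad + kernel + adj, so for the input size 4,
# kernel 3, stride 2, pad 1 and adj 1 used in test_deconvolution below this is
# (4 - 1) * 2 - 2 + 3 + 1 = 8; when target_shape is given explicitly, pad and adj
# are ignored in favor of the requested output size.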

@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/10973")
@with_seed()
def test_deconvolution():
    # 2D
    check_deconvolution_target_shape(
        input_shape = (2,3,4,4),
        kernel = (3,3),
        stride = (2,2),
        target_shape = (8,8),
        pad = (99,99),  # will be ignored
        adj = (101,101),  # will be ignored
    )
    check_deconvolution_target_shape(
        input_shape = (2,3,4,4),
        kernel = (3,3),
        stride = (2,2),
        pad = (1,1),
        adj = (1,1),
    )
    check_deconvolution_forward_backward(
        input_shape = (1,1,5,5),
        num_filter = 1,
        kernel = (3,3),
        stride = (1,1),
        pad = (1,1)
    )
    check_deconvolution_forward_backward(
        input_shape = (32,3,28,28),
        num_filter = 3,
        kernel = (3,3),
        stride = (1,1),
        pad = (1,1)
    )
    check_deconvolution_forward_backward(
        input_shape = (10, 3, 403, 403),
        num_filter = 3,
        kernel = (7,7),
        stride = (5,5),
        pad = (2,2)
    )
    check_deconvolution_gradient(
        input_shape = (1,3,5,5),
        num_filter = 3,
        pad = (1,1)
    )
    check_deconvolution_gradient(
        input_shape = (5,3,100,100),
        num_filter = 3,
        pad = (3,3)
    )
    # 1D
    check_deconvolution_target_shape(
        input_shape = (2,3,4),
        kernel = (3,),
        stride = (2,),
        target_shape = (8,),
        pad = (99,),  # will be ignored
        adj = (101,),  # will be ignored
    )
    check_deconvolution_target_shape(
        input_shape = (2,3,4),
        kernel = (3,),
        stride = (2,),
        pad = (1,),
        adj = (1,),
    )
    check_deconvolution_forward_backward(
        input_shape = (1,1,5),
        num_filter = 1,
        kernel = (3,),
        stride = (1,),
        pad = (1,)
    )
    check_deconvolution_forward_backward(
        input_shape = (32,3,28),
        num_filter = 3,
        kernel = (3,),
        stride = (1,),
        pad = (1,)
    )
    check_deconvolution_forward_backward(
        input_shape = (10, 3, 403),
        num_filter = 3,
        kernel = (7,),
        stride = (5,),
        pad = (2,)
    )
    check_deconvolution_gradient(
        input_shape = (1,3,5),
        num_filter = 3,
        pad = (1,)
    )
    check_deconvolution_gradient(
        input_shape = (5,3,100),
        num_filter = 3,
        pad = (3,)
    )

def check_nearest_upsampling_with_shape(shapes, scale, root_scale):
    arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context())
           for i, shape in zip(range(len(shapes)), shapes)}
    arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)}

    up = mx.sym.UpSampling(*[mx.sym.Variable('arg_%d'%i) for i in range(len(shapes))],
                           sample_type='nearest', scale=root_scale)
    exe = up.bind(default_context(), args=arr, args_grad=arr_grad)
    exe.forward(is_train=True)
    exe.backward(exe.outputs)
    for k in range(len(shapes)):
        name = 'arg_%d'%k
        assert_allclose(arr[name].asnumpy()*root_scale**2*scale**(2*k), arr_grad[name].asnumpy(), rtol=1e-4)

def check_bilinear_upsampling_with_shape(shapes, scale, root_scale):
    arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context())
           for i, shape in zip(range(len(shapes)), shapes)}
    arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)}

    up = mx.sym.UpSampling(*[mx.sym.Variable('arg_%d'%i) for i in range(len(shapes))],
                           sample_type='bilinear', scale=root_scale)
    exe = up.bind(default_context(), args=arr, args_grad=arr_grad)
    exe.forward(is_train=True)
    exe.backward(exe.outputs)
    for k in range(len(shapes)):
        name = 'arg_%d'%k
        assert_allclose(arr[name].asnumpy()*root_scale**2*scale**(2*k), arr_grad[name].asnumpy(), rtol=1e-4)

@with_seed()
def test_nearest_upsampling():
    for root_scale in [1,2,3]:
        for scale in [1,2,3]:
            for num_shape in [1,2,3]:
                for base in [1,2,3]:
                    shapes = [(1,3,base*root_scale*scale**(num_shape-1-i),base*root_scale*scale**(num_shape-1-i))
                              for i in range(num_shape)]
                    check_nearest_upsampling_with_shape(shapes, scale, root_scale)

@with_seed()
def test_batchnorm_training():
    def check_batchnorm_training(stype):
        for shape in [(2, 3), (2, 3, 2, 2)]:
            data_tmp = np.random.normal(-0.1, 0.1, size=shape)
            s = shape[1],
            gamma = np.ones(s)
            beta = np.ones(s)
            gamma[1] = 3
            beta[0] = 3

            rolling_mean = np.random.uniform(size=s)
            rolling_std = np.random.uniform(size=s)

            data = mx.symbol.Variable('data', stype=stype)
            in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype),
                           mx.nd.array(beta).tostype(stype)]
            mean_std = [mx.nd.array(rolling_mean).tostype(stype), mx.nd.array(rolling_std).tostype(stype)]

            test = mx.symbol.BatchNorm_v1(data, fix_gamma=True)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            test = mx.symbol.BatchNorm(data, fix_gamma=True)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            test = mx.symbol.BatchNorm_v1(data, fix_gamma=True, use_global_stats=True)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            test = mx.symbol.BatchNorm_v1(data, fix_gamma=False)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            test = mx.symbol.BatchNorm(data, fix_gamma=False)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            test = mx.symbol.BatchNorm_v1(data, fix_gamma=False, use_global_stats=True)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True)
            check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16, atol=1e-2)

            # Test varying channel axis
            dim = len(shape)
            for chaxis in range(-dim, dim):
                chaxis_true = chaxis
                if chaxis < 0:
                    chaxis_true = dim + chaxis

                shapex = shape

                channel_count = shapex[chaxis_true]
                data_tmp = np.random.normal(-0.1, 0.1, size=shapex)

                gamma = np.ones(channel_count)
                beta = np.ones(channel_count)
                if channel_count > 1:
                    gamma[1] = 3
                beta[0] = 3

                in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype),
                               mx.nd.array(beta).tostype(stype)]

                xrolling_mean = np.random.uniform(size=channel_count)
                xrolling_std = np.random.uniform(size=channel_count)
                xmean_std = [mx.nd.array(xrolling_mean).tostype(stype),
                             mx.nd.array(xrolling_std).tostype(stype)]

                test = mx.symbol.BatchNorm(data, fix_gamma=True, axis=chaxis)
                check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01)

                test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True, axis=chaxis)
                check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01)

                test = mx.symbol.BatchNorm(data, fix_gamma=False, axis=chaxis)
                check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01)

                test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True, axis=chaxis)
                check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01)

    check_batchnorm_training('default')

@with_seed()
def test_convolution_grouping():
    for dim in [1, 2, 3]:
        num_filter = 4
        num_group = 2
        kernel = (3,) * dim
        shape = (1, 4) + (9,) * dim

        x = mx.sym.Variable('x')
        w = mx.sym.Variable('w')
        b = mx.sym.Variable('b')
        y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, kernel=kernel)
        xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
        wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
        bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
        y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
                                                num_filter=num_filter//num_group, kernel=kernel)
                             for i in range(num_group)])

        exe1 = y1.simple_bind(default_context(), x=shape)
        exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,))
        for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
            arr1[:] = np.float32(np.random.normal(size=arr1.shape))
            arr2[:] = arr1
        exe1.forward(is_train=True)
        exe1.backward(exe1.outputs[0])
        exe2.forward(is_train=True)
        exe2.backward(exe2.outputs[0])

        for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
            np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)

@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12203")
@with_seed()
def test_depthwise_convolution():
    for dim in [1,2]:
        for num_base in [1, 4, 16, 32, 64]:
            for kernel_x in [3, 5]:
                for stride_x in [1, 2]:
                    for pad_x in [0, 1]:
                        for in_size in [7, 32]:
                            kernel = (kernel_x,) * dim
                            stride = (stride_x,) * dim
                            pad = (pad_x,) * dim
                            num_filter = num_base
                            num_group = num_base
                            shape = (2, num_base) + (in_size,) * dim

                            x = mx.sym.Variable('x')
                            w = mx.sym.Variable('w')
                            b = mx.sym.Variable('b')
                            y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter,
                                                    num_group=num_group, kernel=kernel, stride=stride, pad=pad)
                            xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
                            wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
                            bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
                            y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
                                                                    num_filter=num_filter//num_group, kernel=kernel,
                                                                    stride=stride, pad=pad)
                                                 for i in range(num_group)])
                            dev = default_context()
                            exe1 = y1.simple_bind(dev, x=shape)
                            exe2 = y2.simple_bind(mx.cpu(), x=shape, w=(num_filter, shape[1]//num_group)+kernel,
                                                  b=(num_filter,))
                            for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
                                arr1[:] = np.random.normal(size=arr1.shape)
                                arr2[:] = arr1
                            exe1.forward(is_train=True)
                            exe1.backward(exe1.outputs[0])
                            exe2.forward(is_train=True)
                            exe2.backward(exe2.outputs[0])
                            for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
                                np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)
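
# NOTE: the two convolution tests above verify grouped (and, with
# num_group == num_channels, depthwise) convolution by construction: slicing the
# input and the weights into num_group channel blocks, convolving each block
# independently and concatenating the results must match the single grouped
# Convolution call, for the outputs as well as for all argument gradients.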
[[1, 431, 6, 2, 225], [1, 1, 6, 2, 225]], [[1, 15, 1, 28, 1], [1, 15, 1, 28, 463]], [[1, 129, 2, 48, 96], [1, 129, 2, 1, 1]], [[1, 1, 403, 17, 2], [1, 44, 403, 17, 2]], [[1, 1, 65, 2, 22], [1, 1, 65, 1, 1]], [[1, 24, 103, 17, 18], [1, 24, 1, 1, 1]], [[1, 1, 1, 1, 2], [1, 24, 194, 50, 1]], [[1, 1, 107, 84, 9], [1, 1, 1, 1, 1]]]) if idx < binary_op_data_shape.shape[0]: l_shape = binary_op_data_shape[idx][0] r_shape = binary_op_data_shape[idx][1] else: # Generate random data that has ndim between 1-7 and all the shape dims between 1-5 ndim = np.random.randint(1, 6) shape = np.random.randint(1, 6, size=(ndim,)) l_same_dim = np.random.randint(0, 5) r_same_dim = np.random.randint(0, 5) l_axis_flags = np.random.randint(0, 2, size=ndim) r_axis_flags = np.random.randint(0, 2, size=ndim) if l_same_dim == 4: l_axis_flags = np.ones(ndim) if r_same_dim == 4: r_axis_flags = np.ones(ndim) l_shape = shape.copy() r_shape = shape.copy() l_shape[np.where(l_axis_flags == 0)] = 1 r_shape[np.where(r_axis_flags == 0)] = 1 return [np.random.random(l_shape), np.random.random(r_shape)] def gen_broadcast_data_int(idx): d = gen_broadcast_data(idx); return [np.round(d[0]*100).astype(int), np.round(d[1]*100).astype(int)] def gen_binary_data(dummy): ndim = np.random.randint(1, 6) shape = np.random.randint(1, 6, size=(ndim,)) #print("gen shape {}".format(shape)) return [np.random.random(shape), np.random.random(shape)] def gen_binary_data_int(dummy): d = gen_binary_data(dummy); return [np.round(d[0]*100).astype(int), np.round(d[1]*100).astype(int)] def check_binary_op_forward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5, mx_nd_func=None): sample_num = 200 for i in range(sample_num): d = gen_data(i) y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}) y.forward(is_train=True) y = y.outputs[0].asnumpy() x = baseline(d[0], d[1]).astype(y.dtype) #np.set_printoptions(precision=20) a = d[0] b = d[1] #print("a: {} {}".format(a.dtype, a)) #print("a: {} {}".format(b.dtype, b)) #print("x: {} {}".format(x.dtype, x)) #print("y: {} {}".format(y.dtype, y)) if mx_nd_func is not None: d0 = mx.nd.array(d[0], dtype=d[0].dtype) d1 = mx.nd.array(d[1], dtype=d[1].dtype) assert_almost_equal(y, mx_nd_func(d0, d1).asnumpy(), rtol=rtol, atol=atol) idx = np.abs(x-y) > atol+rtol*np.abs(x) if idx.any(): import binascii np.set_printoptions(precision=20) logging.error('found precision problem:') d[0] = np.broadcast_to(d[0], x.shape) d[1] = np.broadcast_to(d[1], x.shape) logging.error('input a: {}'.format(d[0][idx])) logging.error('input b: {}'.format(d[1][idx])) logging.error("output x: {} {}".format(x.dtype, x)) logging.error("output y: {} {}".format(y.dtype, y)) def ftohex(xs): import struct return list(map(lambda x: binascii.hexlify(struct.pack('d', x)), xs.flatten())) logging.error('output x in baseline(a, b): {}'.format(x[idx])) logging.error('output y in symbol(a, b): {}'.format(y[idx])) logging.error('output x in baseline(a,b) hex: {}'.format(ftohex(x[idx]))) logging.error('output y in symbol(a,b) hex: {}'.format(ftohex(y[idx]))) logging.error('input a hex: {}'.format(ftohex(d[0][idx]))) logging.error('input a hex: {}'.format(ftohex(d[1][idx]))) logging.error('diff: {}'.format(np.abs(x-y)[idx] - atol-rtol*np.abs(x)[idx])) assert_allclose(y, x, rtol=rtol, atol=atol) def check_binary_op_backward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5): sample_num = 200 for i in range(sample_num): d = gen_data(i) out = np.random.random((d[0] + d[1]).shape) def reduce_op(shape, x): if shape == x.shape: return x 
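# reduce_op maps the full (broadcast) output gradient back to one operand's shape: for every
# axis where that operand had size 1, the gradient is summed over the axis with keepdims, which
# is the adjoint of numpy broadcasting. E.g. an out-gradient of shape (2, 3) against an operand
# of shape (1, 3) is summed over axis 0 back to shape (1, 3).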
keepdims_shape = list(x.shape) for i in range(len(shape)): if x.shape[i] != shape[i]: keepdims_shape[i] = 1 x = np.sum(x, axis=i).reshape(keepdims_shape) return x baseline_grad1, baseline_grad2 = baseline(out, d[0], d[1]) x_1 = reduce_op(d[0].shape, baseline_grad1) x_2 = reduce_op(d[1].shape, baseline_grad2) y_1 = mx.nd.empty(d[0].shape) y_2 = mx.nd.empty(d[1].shape) y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}, args_grad=[y_1, y_2]) y.forward(is_train=True) y.backward([mx.nd.array(out)]) assert_allclose(y_1.asnumpy(), x_1, rtol=rtol, atol=atol) assert_allclose(y_2.asnumpy(), x_2, rtol=rtol, atol=atol) @with_seed() def test_binary_op(): a = mx.sym.Variable('a') b = mx.sym.Variable('b') def test_bplus(a, b): c = a + b check_binary_op_forward(c, lambda a, b: a + b, gen_binary_data) check_binary_op_backward(c, lambda g_out, a, b: (g_out, g_out), gen_binary_data) def test_bminus(a, b): c = a - b check_binary_op_forward(c, lambda a, b: a - b, gen_binary_data) check_binary_op_backward(c, lambda g_out, a, b: (g_out, - g_out), gen_binary_data) def test_bmul(a, b): c = a * b check_binary_op_forward(c, lambda a, b: a * b, gen_binary_data) check_binary_op_backward(c, lambda g_out, a, b: (g_out * b, g_out * a), gen_binary_data) def test_bdiv(a, b): c = a / b check_binary_op_forward(c, lambda a, b: a / b, gen_binary_data) check_binary_op_backward(c, lambda g_out, a, b: (g_out / b, - g_out * a / (b * b)), gen_binary_data) def test_bmod(a, b): # Python and numpy operate only in double so to avoid numerical errors we have to use # doubles as well. This was a flaky test before when using float32. seed 1688524483, 1768433044 #c = a % b c = mx.sym.cast(a, dtype='float64') % mx.sym.cast(b, dtype='float64') # '%' is sensitive to the precision of the calculation. Force numpy to match mxnet's float32. check_binary_op_forward(c, lambda a, b: np.float32(a) % np.float32(b), gen_binary_data, rtol=0, atol=0) check_binary_op_backward(c, lambda g_out, a, b: (g_out, - g_out * (np.float32(a) // np.float32(b))), gen_binary_data) def test_bmod_int(a, b): c = mx.sym.cast(a, dtype='int32') % mx.sym.cast(b, dtype='int32') check_binary_op_forward(c, lambda a, b: a % b, gen_binary_data_int) check_binary_op_backward(c, lambda g_out, a, b: (np.zeros_like(a), np.zeros_like(b)), gen_binary_data_int) def test_bpow(a, b): c = a ** b check_binary_op_forward(c, lambda a, b: a ** b, gen_binary_data) check_binary_op_backward(c, lambda g_out, a, b: (g_out * a **(b - 1) * b, g_out * a ** b * np.log(a)), gen_binary_data) def test_bneq(a, b): c = a != b # '!=' is sensitive to the precision of the comparison. Force numpy to match mxnet's float32. 
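# Without the cast the baseline would compare float64 values: two numbers that agree at float32
# resolution but differ in float64 would make numpy report "not equal" while MXNet, computing in
# float32, reports "equal".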
# Issue exposed with seed 1644387363 check_binary_op_forward(c, lambda a, b: (np.float32(a) != np.float32(b)).astype(a.dtype), gen_binary_data) check_binary_op_backward(c, lambda g_out, a, b: (np.zeros_like(a), np.zeros_like(b)), gen_binary_data) test_bplus(a, b) test_bminus(a, b) test_bmul(a, b) test_bdiv(a, b) test_bmod(a, b) test_bmod_int(a, b) test_bpow(a, b) test_bneq(a, b) @with_seed() def test_broadcast_binary_op(): def check_bmaxmin_gradient(test_sym, x, y, delta, rtol, atol): """This function ensures that checking the numerical gradient of broadcast_max/min is not crossing the boundary y=x where there is no gradient definition at those sigularities.""" x_max = np.max(x) y = x_max + 2 * delta + np.random.random(y.shape) check_numeric_gradient(test_sym, [x, y], numeric_eps=delta, rtol=rtol, atol=atol) x_min = np.min(x) y = x_min - 2 * delta - np.random.random(y.shape) check_numeric_gradient(test_sym, [x, y], numeric_eps=delta, rtol=rtol, atol=atol) a = mx.sym.Variable('a') b = mx.sym.Variable('b') def test_bplus(a, b): c = mx.sym.broadcast_plus(a, b) check_binary_op_forward(c, lambda a, b: a + b, gen_broadcast_data, mx_nd_func=mx.nd.add) check_binary_op_backward(c, lambda g_out, a, b: (g_out, g_out), gen_broadcast_data) def test_bminus(a, b): c = mx.sym.broadcast_minus(a, b) check_binary_op_forward(c, lambda a, b: a - b, gen_broadcast_data, mx_nd_func=mx.nd.subtract) check_binary_op_backward(c, lambda g_out, a, b: (g_out, - g_out), gen_broadcast_data) def test_bmul(a, b): c = mx.sym.broadcast_mul(a, b) check_binary_op_forward(c, lambda a, b: a * b, gen_broadcast_data, mx_nd_func=mx.nd.multiply) check_binary_op_backward(c, lambda g_out, a, b: (g_out * b, g_out * a), gen_broadcast_data) def test_bdiv(a, b): c = mx.sym.broadcast_div(a, b) check_binary_op_forward(c, lambda a, b: a / b, gen_broadcast_data, mx_nd_func=mx.nd.divide) check_binary_op_backward(c, lambda g_out, a, b: (g_out / b, - g_out * a / (b * b)), gen_broadcast_data) def test_bmod(a_, b_): # Python and numpy operate only in double so to avoid numerical errors we have to use # doubles as well. This was a flaky test before when using float32. seed 1688524483, 1768433044 a = mx.sym.cast(a_, dtype='float64') b = mx.sym.cast(b_, dtype='float64') # '%' is sensitive to the precision of the calculation. Force numpy to match mxnet's float32. 
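# Near an exact multiple of b, a one-ulp rounding difference flips a % b between ~0 and ~b,
# which is why the broadcast_mod forward check below allows a loose absolute tolerance (atol=1).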
c = mx.sym.broadcast_mod(a, b) check_binary_op_forward(c, lambda a, b: a % b, gen_broadcast_data, atol=1, mx_nd_func=mx.nd.modulo) check_binary_op_backward(c, lambda g_out, a, b: (g_out, - g_out * (np.float32(a) // np.float32(b))), gen_binary_data) def test_bmod_int(a, b): c = mx.sym.broadcast_mod(mx.sym.cast(a, dtype='int32'), mx.sym.cast(b, dtype='int32')) check_binary_op_forward(c, lambda a, b: a % b, gen_broadcast_data_int, mx_nd_func=mx.nd.modulo) check_binary_op_backward(c, lambda g_out, a, b: (np.zeros_like(a), np.zeros_like(b)), gen_broadcast_data_int) def test_bpow(a, b): c = mx.sym.broadcast_power(a, b) check_binary_op_forward(c, lambda a, b: a ** b, gen_broadcast_data, mx_nd_func=mx.nd.power) check_binary_op_backward(c, lambda g_out, a, b: (g_out * a **(b - 1) * b, g_out * a ** b * np.log(a)), gen_broadcast_data) def test_bequal(a, b): c = mx.sym.broadcast_equal(a, b) check_binary_op_forward(c, lambda a, b: (a == b).astype(a.dtype), gen_broadcast_data_int, mx_nd_func=mx.nd.equal) check_binary_op_backward(c, lambda g_out, a, b: (np.zeros_like(a), np.zeros_like(b)), gen_broadcast_data_int) def test_bmax(a, b): c = mx.sym.broadcast_maximum(a, b) check_binary_op_forward(c, lambda x, y: np.maximum(x, y), gen_broadcast_data, mx_nd_func=mx.nd.maximum) # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big data = gen_broadcast_data(idx=200) check_bmaxmin_gradient(c, data[0], data[1], 0.001, 1e-2, 1e-3) def test_bmin(a, b): c = mx.sym.broadcast_minimum(a, b) check_binary_op_forward(c, lambda x, y: np.minimum(x, y), gen_broadcast_data, mx_nd_func=mx.nd.minimum) # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big data = gen_broadcast_data(idx=200) check_bmaxmin_gradient(c, data[0], data[1], 0.001, 1e-2, 1e-3) def test_band(a, b): c = mx.sym.broadcast_logical_and(a, b) check_binary_op_forward(c, lambda x, y: np.logical_and(x, y), gen_broadcast_data, mx_nd_func=mx.nd.logical_and) # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big data = gen_broadcast_data(idx=200) check_bmaxmin_gradient(c, data[0], data[1], 0.001, 1e-2, 1e-3) def test_bor(a, b): c = mx.sym.broadcast_logical_or(a, b) check_binary_op_forward(c, lambda x, y: np.logical_or(x, y), gen_broadcast_data, mx_nd_func=mx.nd.logical_or) # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big data = gen_broadcast_data(idx=200) check_bmaxmin_gradient(c, data[0], data[1], 0.001, 1e-2, 1e-3) def test_bxor(a, b): c = mx.sym.broadcast_logical_xor(a, b) check_binary_op_forward(c, lambda x, y: np.logical_xor(x, y), gen_broadcast_data, mx_nd_func=mx.nd.logical_xor) # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big data = gen_broadcast_data(idx=200) check_bmaxmin_gradient(c, data[0], data[1], 0.001, 1e-2, 1e-3) test_bplus(a, b) test_bminus(a, b) test_bmul(a, b) test_bdiv(a, b) test_bmod(a, b) test_bmod_int(a, b) test_bpow(a, b) test_bequal(a, b) test_bmax(a, b) test_bmin(a, b) test_band(a, b) test_bor(a, b) test_bxor(a, b) @with_seed() def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False): dim = len(dil) assert(len(kernel_shape) == dim) # Input for spike response data_size = 33 data_shape = (1, 1) + (data_size,) * dim center = (0,0) + (data_size // 2,) * dim spike_imgs = np.zeros(shape=data_shape, dtype=np.float32) spike_imgs[center] = 1.0 spike_img = mx.nd.array(spike_imgs) spike_img2 = mx.nd.array(spike_imgs) kernel_weights = 
mx.nd.ones(shape=tuple([1,1]+list(kernel_shape)), dtype=np.float32) kernel_weights2 = mx.nd.ones(shape=tuple([1,1]+list(kernel_shape)), dtype=np.float32) kernel = mx.symbol.Variable('kernel') in_img = mx.symbol.Variable('input') net = mx.symbol.Convolution(in_img, num_filter=1,kernel=kernel_shape, dilate=dil, no_bias="true", name='test_convolution') net.list_arguments() be = net.bind(default_context(), args={ 'input' : spike_img, 'test_convolution_weight' : kernel_weights}, args_grad={'input' : spike_img2, 'test_convolution_weight' : kernel_weights2 } ) be.forward(True) out_o = be.outputs[0].asnumpy() ndo = be.outputs[0] out_grads = np.zeros(shape=be.outputs[0].shape, dtype=np.float32) out_grads[center] = 1.0 out_grad = mx.nd.array(out_grads) be.backward([out_grad]) vgrad = be.grad_arrays[0].asnumpy() out = out_o.reshape(out_o.shape[2:]) nz_loc = np.nonzero(out) assert_allclose(np.sum(out),np.prod(kernel_shape),atol=1e-5) assert_allclose(np.sum(vgrad),np.prod(kernel_shape),atol=1e-5) # Now check whether the input gradient was computed correctly input_grad = mx.nd.array(vgrad) be = net.bind(default_context(), args={ 'input' : input_grad, 'test_convolution_weight' : kernel_weights}) be.forward(True) out_o = be.outputs[0].asnumpy() assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5) rnd_kernel_s = np.random.uniform(low=0.0, high=1.0, size=tuple([1,1]+list(kernel_shape))).astype(np.float32) impulse_error = mx.nd.array(out_o/np.sum(out_o)) # This should be 1.0 at [0,0,16,16] rnd_kernel = mx.nd.array(rnd_kernel_s) rnd_kernel2 = mx.nd.array(rnd_kernel_s) white_in = mx.nd.ones(shape=data_shape) white_in2 = mx.nd.ones(shape=data_shape) be = net.bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : rnd_kernel}, args_grad={'input' : white_in2, 'test_convolution_weight' : rnd_kernel2 } ) be.forward(True) be.backward([impulse_error]) out_orig = be.outputs[0].asnumpy() kernel_gradient = be.grad_arrays[1].asnumpy() dkernel = mx.nd.array(rnd_kernel_s + kernel_gradient) be = net.bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : dkernel}) be.forward(True) out = be.outputs[0].asnumpy() # Now do a simple check of the kernel gradient assert(out[center] - np.sum(kernel_gradient) - out_orig[center] < 0.001) @with_seed() def test_convolution_dilated_impulse_response(): # 1D for dil in [ (1,), (2,), (3,) ]: for ks in [ (1,), (2,), (3,), (4,)]: test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) # 2D for dil in [ (1,1), (2,2), (3,3) ]: for ks in [ (3,3), (4,4), (2,3), (3,2), (1,1) ]: test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) @with_seed() def test_reshape(): def test_reshape_new(src_shape, shape_args, reverse, dst_shape): net = mx.sym.Variable("data") net = mx.sym.Reshape(net, shape=shape_args, reverse=reverse) js = net.tojson() net = mx.sym.load_json(js) _, output_shape, __ = net.infer_shape(data=src_shape) assert output_shape[0] == dst_shape, \ 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \ 'Output Shape = %s' %(str(src_shape), str(shape_args), str(reverse), str(dst_shape), str(output_shape[0])) dat_npy = np.random.rand(*src_shape) grad_npy = np.random.rand(*dst_shape) exe = net.simple_bind(default_context(), data=src_shape) exe.arg_dict['data'][:] = dat_npy exe.forward(is_train=True) assert np.square(exe.outputs[0].asnumpy() - dat_npy.reshape(dst_shape)).mean() < 1E-7, \ 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s'\ %(str(src_shape), str(shape_args), 
str(reverse), str(dst_shape)) exe.backward(out_grads=mx.nd.array(grad_npy)) assert np.square(exe.grad_dict['data'].asnumpy() - grad_npy.reshape(src_shape)).mean() < 1E-7, \ 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s'\ %(str(src_shape), str(shape_args), str(reverse), str(dst_shape)) for i in range(len(src_shape)): holdout_src_shape = list(src_shape) holdout_src_shape[i] = 0 holdout_src_shape = tuple(holdout_src_shape) net = mx.sym.Variable('data') net = mx.sym.elemwise_add(net.reshape(shape_args, reverse=reverse), mx.sym.ones(shape=dst_shape)) input_shape, output_shape, __ = net.infer_shape(data=holdout_src_shape) assert output_shape[0] == dst_shape, \ 'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \ 'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse), str(dst_shape), str(output_shape[0])) assert input_shape[0] == src_shape, \ 'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \ 'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse), str(dst_shape), str(output_shape[0])) # Test new api (Using shape) test_cases = [ [(2, 3, 5, 5), (0, -1), False, (2, 75)], [(2, 3, 5, 5), (0, 0, -1), False, (2, 3, 25)], [(5, 3, 4, 5), (0, -1, 0), False, (5, 15, 4)], [(2, 3, 5, 4), (-1, 0, 0), False, (8, 3, 5)], [(2, 3, 5, 5), (0, 0, 0, 0), False, (2, 3, 5, 5)], [(2, 4, 5, 3), (-1, 2, 2, 1), False, (30, 2, 2, 1)], [(2, 3, 5, 6), (-2,), False, (2, 3, 5, 6)], [(2, 3, 5, 6), (6, 1, -2), False, (6, 1, 5, 6)], [(2, 3, 5, 6), (-3, -3), False, (6, 30)], [(2, 3, 5, 6), (-3, -1), False, (6, 30)], [(64,), (-4, 16, 4), False, (16, 4)], [(64,), (-4, 16, -1), False, (16, 4)], [(64, 1, 2, 3), (-4, 16, -1, -2), False, (16, 4, 1, 2, 3)], [(2, 3, 5, 5), (0, -1), True, (5, 30)], [(2, 3, 5, 5), (0, 0, -1), True, (3, 5, 10)], [(5, 3, 4, 5), (0, -1, 0), True, (3, 20, 5)], [(2, 3, 5, 4), (-1, 0, 0), True, (6, 5, 4)], [(2, 3, 4, 5), (3, -1, 0), True, (3, 8, 5)], [(2, 3, 5, 5), (5, 3, 0, -1), True, (5, 3, 5, 2)], [(2, 3, 5, 5), (0, 0, 0, 0), True, (2, 3, 5, 5)], [(2, 3, 5, 6), (-2,), True, (2, 3, 5, 6)], [(2, 3, 5, 6), (-2, 1, 30), True, (2, 3, 1, 30)], [(2, 3, 5, 6), (-3, -3), True, (6, 30)], [(64,), (16, 4, -4), True, (16, 4)], [(64,), (16, -1, -4), True, (16, 4)], [(1, 2, 3, 64), (-2, -1, 16, -4), True, (1, 2, 3, 4, 16)]] for test_case in test_cases: test_reshape_new(*test_case) # Test old api net = mx.sym.Variable("data") net = mx.sym.Reshape(net, target_shape=(2, 0)) js = net.tojson() net = mx.sym.load_json(js) _, output_shape, __ = net.infer_shape(data=(2, 3, 5, 5)) assert(output_shape[0] == (2, 75)) # Test for Flatten data = mx.sym.Variable("data") net = mx.sym.Flatten(data) exe = net.simple_bind(ctx=default_context(), data=(5, 4, 3, 7)) data_npy = np.random.normal(size=(5, 4, 3, 7)) out_grad_npy = np.random.normal(size=(5, 4 * 3 * 7)) outputs = exe.forward(is_train=True, data=data_npy)[0].asnumpy() assert_allclose(outputs, data_npy.reshape((5, 4 * 3 * 7))) exe.backward(out_grads=[mx.nd.array(out_grad_npy, ctx=default_context())]) assert_allclose(exe.grad_arrays[0].asnumpy(), out_grad_npy.reshape((5, 4, 3, 7))) @with_seed() def test_reshape_like(): def test_reshape_like_new(lhs_shape, rhs_shape, lbeg, lend, rbeg, rend, dst_shape): lhs = mx.sym.Variable("lhs") rhs = mx.sym.Variable("rhs") net = mx.sym.reshape_like(lhs, rhs, lhs_begin=lbeg, lhs_end=lend, rhs_begin=rbeg, rhs_end=rend) js = net.tojson() net = mx.sym.load_json(js) _, output_shape, __ = net.infer_shape(lhs=lhs_shape, rhs=rhs_shape) assert 
output_shape[0] == dst_shape, \ 'LHS Shape = %s, RHS Shape = %s, lhs_begin = %s, lhs_end = %s, rhs_begin= %s, rhs_end= %s'\ %(str(lhs_shape), str(rhs_shape), str(lbeg), str(lend), str(rbeg), str(rend)) lhs_npy = np.random.rand(*lhs_shape) rhs_npy = np.random.rand(*rhs_shape) grad_npy = np.random.rand(*dst_shape) exe = net.simple_bind(default_context(), lhs=lhs_shape, rhs=rhs_shape) exe.arg_dict['lhs'][:] = lhs_npy exe.arg_dict['rhs'][:] = rhs_npy exe.forward(is_train=True) assert np.square(exe.outputs[0].asnumpy() - lhs_npy.reshape(dst_shape)).mean() < 1E-7, \ 'LHS Shape = %s, RHS Shape = %s, lhs_begin = %s, lhs_end = %s, rhs_begin= %s, rhs_end= %s'\ %(str(lhs_shape), str(rhs_shape), str(lbeg), str(lend), str(rbeg), str(rend)) exe.backward(out_grads=mx.nd.array(grad_npy)) assert np.square(exe.grad_dict['lhs'].asnumpy() - grad_npy.reshape(lhs_shape)).mean() < 1E-7, \ 'LHS Shape = %s, RHS Shape = %s, lhs_begin = %s, lhs_end = %s, rhs_begin= %s, rhs_end= %s'\ %(str(lhs_shape), str(rhs_shape), str(lbeg), str(lend), str(rbeg), str(rend)) # Test new api (Using shape) test_cases = [ [(30,), (15,2,4), 0, None, 0, 2, (15,2)], [(30,), (15,2,4), None, 1, None, 2, (15,2)], [(30,7), (15,2,4), 0, 1, 0, 2, (15,2,7)], [(3,5), (1,15,4), 0, 2, 1, 2, (15,)], [(3,5), (1,15,4), 0, None, 1, -1, (15,)], [(30,12), (4,2,2,3), -1, None, 1, None, (30,2,2,3)], [(1,1,7,3,1,1), (81,1,1,21), 1, -1, 1, None, (1,1,1,21,1)] ] # for test_case in test_cases: for test_case in test_cases: test_reshape_like_new(*test_case) # Test old api lhs = mx.sym.Variable("lhs") rhs = mx.sym.Variable("rhs") net = mx.sym.reshape_like(lhs, rhs) js = net.tojson() net = mx.sym.load_json(js) _, output_shape, __ = net.infer_shape(lhs=(40, 30), rhs=(30,20,2)) assert(output_shape[0] == (30,20,2)) @with_seed() def test_reduce(): sample_num = 500 def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym, nan_prob=0, test_exclude=True, test_none_axis=False): for i in range(sample_num): # Generate random data that has ndim between 1-7 and all the shape dims between 1-5 # Insert a NaN with probability equal to nan_prob ndim = np.random.randint(1, 6) shape = np.random.randint(1, 6, size=(ndim,)) axis_num = np.random.randint(0, ndim, size=1) axis_flags = np.random.randint(0, 2, size=ndim) if test_exclude: exclude = np.random.randint(0, 2) else: exclude = False axes = [] for (axis, flag) in enumerate(axis_flags): if flag: axes.append(axis) if 0 == len(axes): axes = None elif 1 == len(axes): axes = axes[0] else: axes = tuple(axes) keepdims = np.random.randint(0, 2) a = mx.symbol.Variable('a') if axes is None: if test_none_axis: b = mx_reduce_sym(a, keepdims=keepdims, axis=axes) else: b = mx_reduce_sym(a, keepdims=keepdims) elif exclude and isinstance(axes, tuple) and len(axes) < ndim: naxes = [i for i in range(ndim) if i not in axes] b = mx_reduce_sym(a, axis=naxes, keepdims=keepdims, exclude=True) else: b = mx_reduce_sym(a, axis=axes, keepdims=keepdims) dat_npy = np.random.rand(*shape) # Test with both negative and positive values (randomly). Avoid having both in the same # test, which can be problematic for error checking due to near-zero values. 
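# (With mixed signs a sum or mean can land arbitrarily close to zero, where relative-tolerance
# comparisons stop being meaningful.)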
if np.random.rand() > 0.5: dat_npy = -dat_npy if nan_prob > 0: dat_npy[np.random.rand(*shape) < nan_prob] = np.nan sum_groundtruth = np.array(numpy_reduce_func(dat_npy, axis=axes, keepdims=keepdims)) if sum_groundtruth.shape == (): sum_groundtruth = np.array([sum_groundtruth]) grad_nd = mx.nd.empty(shape) outgrad_npy = np.array(np.random.rand(*sum_groundtruth.shape)) keepdim_shape = np_reduce(dat_npy, axes, 1, np.sum).shape grad_groundtruth = numpy_reduce_grad_func(outgrad=outgrad_npy, data=dat_npy, outdata=sum_groundtruth, axis=axes, keepdims=keepdims, keepdim_shape=keepdim_shape) net = b.bind(default_context(), args={'a': mx.nd.array(dat_npy)}, args_grad={'a': grad_nd}) net.forward(is_train=True) equal_forward = almost_equal_ignore_nan(net.outputs[0].asnumpy(), sum_groundtruth, 1E-4, 1E-4) assert equal_forward net.backward(out_grads=mx.nd.array(outgrad_npy)) bc_grad_groundtruth = np.broadcast_to(grad_groundtruth, grad_nd.shape) equal_backward = almost_equal_ignore_nan(grad_nd.asnumpy(), bc_grad_groundtruth, 1E-4, 1E-4) assert equal_backward test_none_axis = [True, False] for test_none in test_none_axis: test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.sum), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: outgrad.reshape(keepdim_shape), mx.symbol.sum, test_none_axis=test_none) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.mean), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: outgrad.reshape(keepdim_shape)/(data.size/outdata.size), mx.symbol.mean, test_none_axis=test_none) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.prod), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: outgrad.reshape(keepdim_shape) * (outdata.reshape(keepdim_shape) / data), mx.symbol.prod, test_none_axis=test_none) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.nansum), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: np.where(np.isnan(data), 0, outgrad.reshape(keepdim_shape)), mx.symbol.nansum, 0.3, test_none_axis=test_none) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.nanprod), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: np.where(np.isnan(data), 0, outgrad.reshape(keepdim_shape) * (outdata.reshape(keepdim_shape) / data)), mx.symbol.nanprod, 0.3, test_none_axis=test_none) # grad of max and min are sensitive to the precision of the calculation. # Force numpy to match mxnet's float32. 
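# (The backward pass of max/min routes the out-gradient only to elements that compare equal to
# the reduced value, so a float64-vs-float32 rounding difference can select different elements.)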
test_reduce_inner(lambda data, axis, keepdims:np_reduce(np.float32(data), axis, keepdims, np.max), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: outgrad.reshape(keepdim_shape) * (np.equal(np.float32(data), outdata.reshape(keepdim_shape))), mx.symbol.max) test_reduce_inner(lambda data, axis, keepdims:np_reduce(np.float32(data), axis, keepdims, np.min), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: outgrad.reshape(keepdim_shape) * (np.equal(np.float32(data), outdata.reshape(keepdim_shape))), mx.symbol.min) test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.linalg.norm), lambda outgrad, data, outdata, axis, keepdims, keepdim_shape: outgrad.reshape(keepdim_shape) * (data / outdata.reshape(keepdim_shape)), mx.symbol.norm, test_exclude=False, test_none_axis=test_none) @with_seed() def test_broadcast(): sample_num = 200 for i in range(sample_num): # Generate random data that has ndim between 1-7 and all the shape dims between 1-5 ndim = np.random.randint(1, 6) target_shape = np.random.randint(1, 6, size=(ndim,)) axis = tuple(set(np.random.randint(0, ndim, np.random.randint(1, ndim + 1)))) shape = target_shape.copy() size = tuple([shape[ele] for ele in axis]) for ele in axis: shape[ele] = 1 a = mx.symbol.Variable('a') sym_bcast_axis = mx.symbol.broadcast_axis(a, axis=axis, size=size) sym_bcast_to = mx.symbol.broadcast_to(a, shape=tuple(target_shape)) sym_bcast_like = mx.symbol.broadcast_like(a, sym_bcast_to) def test_broadcasting_ele(sym_bcast): dat_npy = np.random.rand(*shape) groundtruth = dat_npy grad_nd = mx.nd.empty(shape) outgrad_npy = np.random.rand(*target_shape) grad_groundtruth = np_reduce(outgrad_npy, axis=axis, keepdims=True, numpy_reduce_func=np.sum) net = sym_bcast.bind(default_context(), args={'a': mx.nd.array(dat_npy)}, args_grad={'a': grad_nd}) net.forward(is_train=True) assert (net.outputs[0].shape == target_shape).all() assert_almost_equal(net.outputs[0].asnumpy(), groundtruth, rtol=1e-4) net.backward(out_grads=mx.nd.array(outgrad_npy)) assert_almost_equal(grad_nd.asnumpy(), grad_groundtruth, rtol=1e-4) test_broadcasting_ele(sym_bcast_axis) test_broadcasting_ele(sym_bcast_to) test_broadcasting_ele(sym_bcast_like) @with_seed() def test_transpose(): for ndim in range(1, 7): for t in range(5): dims = list(np.random.randint(1, 10, size=ndim)) axes = list(range(ndim)) random.shuffle(axes) axes = tuple(axes) x = mx.nd.array(np.random.normal(size=dims)) y = mx.nd.transpose(x, axes=axes) assert_allclose(np.transpose(x.asnumpy(), axes=axes), y.asnumpy()) y = mx.nd.transpose(x) assert_allclose(np.transpose(x.asnumpy()), y.asnumpy()) @with_seed() def test_expand_dims(): for ndim in range(1, 6): for axis in range(-ndim + 1, ndim): x = np.random.normal(size=list(np.random.randint(1, 10, size=ndim))) y = mx.nd.array(x) x1 = np.expand_dims(x, axis=axis) y1 = mx.nd.expand_dims(y, axis=axis) assert_allclose(x1, y1.asnumpy()) assert_allclose(x1.shape, y1.shape) @with_seed() def test_crop(): for ndim in range(1, 6): for t in range(5): dims = [] begin = [] end = [] idx = [] for i in range(ndim): d = random.randint(1, 5) b = random.randint(0, d-1) e = random.randint(b+1, d) if b == 0 and random.randint(0, 1): b = None elif b != 0 and random.randint(0, 1): b -= d if e == d and random.randint(0, 1): e = None elif e != d and random.randint(0, 1): e -= d dims.append(d) begin.append(b) end.append(e) idx.append(slice(b, e)) x = mx.nd.array(np.random.normal(size=dims)) y = mx.nd.crop(x, begin=tuple(begin), end=tuple(end)) 
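# mx.nd.crop should agree with numpy basic slicing using the same per-axis begin/end values,
# including the None and negative variants constructed above.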
assert_allclose(x.asnumpy()[idx], y.asnumpy()) vx = mx.sym.Variable('x') vy = mx.sym.crop(vx, begin=tuple(begin), end=tuple(end)) check_numeric_gradient(vy, [x.asnumpy()]) @with_seed() def test_slice_axis(): for ndim in range(1, 6): shape = np.random.randint(1, 11, size=(ndim,)) for t in range(ndim): d = shape[t] b = random.randint(0, d-1) e = random.randint(b+1, d) if np.random.rand() > 0.6: e = None else: if e < d and np.random.rand() > 0.5: e = e - d if np.random.rand() > 0.5: b = b - d idx = [] for i in range(ndim): idx.append(slice(0, shape[i])) idx[t] = slice(b, e) X = mx.symbol.Variable('X') x = mx.nd.array(np.random.normal(size=shape)) Y = mx.symbol.slice_axis(data=X, axis=t, begin=b, end=e) xgrad = mx.nd.empty(x.shape) exec1 = Y.bind(default_context(), args = [x], args_grad = {'X': xgrad}) exec1.forward(is_train=True) y = exec1.outputs[0] assert_allclose(x.asnumpy()[idx], y.asnumpy()) exec1.backward([y]) xx = x.asnumpy() xx[:] = 0.0 xx[idx] = x.asnumpy()[idx] assert_allclose(xx, xgrad.asnumpy()) x_grad_npy = np.random.normal(size=x.shape) xgrad = mx.nd.array(x_grad_npy) exec2 = Y.bind(default_context(), args=[x], args_grad={'X': xgrad}, grad_req="add") exec2.forward(is_train=True) exec2.backward([exec2.outputs[0]]) xx = np.zeros(shape=x.shape, dtype=np.float32) xx[idx] = x.asnumpy()[idx] assert_allclose(xx + x_grad_npy, xgrad.asnumpy(), atol=1E-5) @with_seed() def test_slice_like(): for ndim in range(1, 6): from_shape = np.random.randint(1, 11, size=(ndim,)) shape = [s + np.random.randint(0, 3) for s in from_shape] for t in range(ndim): if t > 0: axes = np.random.randint(0, ndim, size=t).tolist() else: axes = [] idx = [] for i in range(ndim): idx.append(slice(0, shape[i])) if i in axes or not axes: idx[i] = slice(0, from_shape[i]) if axes: pos = np.random.randint(0, t) if axes[pos] > 0: axes[pos] -= ndim # negative index X = mx.symbol.Variable('X') X_1 = mx.symbol.Variable('X1') x = mx.nd.array(np.random.normal(size=shape)) x1 = mx.nd.array(np.random.normal(size=from_shape)) Y = mx.symbol.slice_like(data=X, shape_like=X_1, axes=axes) xgrad = mx.nd.empty(x.shape) xgrad1 = mx.nd.empty(x1.shape) exec1 = Y.bind(default_context(), args = [x, x1], args_grad = {'X': xgrad, 'X1': xgrad1}) exec1.forward(is_train=True) y = exec1.outputs[0] assert_allclose(x.asnumpy()[idx], y.asnumpy()) exec1.backward([y]) xx = x.asnumpy() xx[:] = 0.0 xx[idx] = x.asnumpy()[idx] assert_allclose(xx, xgrad.asnumpy()) assert_allclose(xgrad1.asnumpy(), mx.nd.zeros_like(xgrad1).asnumpy()) @with_seed() def test_flip(): for ndim in range(1, 6): for t in range(5): dims = [random.randint(1,10) for i in range(ndim)] axis = random.randint(0, ndim-1) idx = [slice(None, None, -1) if i == axis else slice(None, None) for i in range(ndim)] x = mx.nd.array(np.random.normal(size=dims)) y = mx.nd.flip(x, axis=axis) assert_allclose(x.asnumpy()[idx], y.asnumpy()) @with_seed() def test_stn(): np.set_printoptions(threshold=np.nan) num_filter = 2 # conv of loc net kernel = (3, 3) # conv of loc net num_hidden = 6 # fc of loc net for n in [1, 2, 3, 4]: for c in [1, 2, 3, 4]: for h in [5, 9, 13, 17]: # for convenience test, this third and forth input dim should be 4x + 1 for w in [5, 9, 13, 17]: data_shape = (n, c, h, w) target_shape = (int((data_shape[2]+1)/2), int((data_shape[3]+1)/2)) data = mx.sym.Variable(name="data") loc = mx.sym.Convolution(data=data, kernel=kernel, pad=(1, 1), num_filter=num_filter, name="loc_conv") loc = mx.sym.Flatten(data=loc) loc = mx.sym.FullyConnected(data=loc, num_hidden=num_hidden, name="loc_fc") stn = 
mx.sym.SpatialTransformer(data=data, loc=loc, target_shape=target_shape, transform_type="affine", sampler_type="bilinear") arg_names = stn.list_arguments() arg_shapes, out_shapes, _ = stn.infer_shape(data=data_shape) # check shape assert out_shapes[0] == (data_shape[0], data_shape[1], target_shape[0], target_shape[1]) dev = default_context() #dev = mx.gpu(0) args = {} args['data'] = mx.random.normal(0, 1, data_shape, ctx=mx.cpu()).copyto(dev) args['loc_conv_weight'] = mx.nd.zeros((num_filter, data_shape[1], kernel[0], kernel[1]), ctx=dev) args['loc_conv_bias'] = mx.nd.zeros((num_filter,), ctx=dev) args['loc_fc_weight'] = mx.nd.zeros((6, num_filter*data_shape[2]*data_shape[3]), ctx=dev) args['loc_fc_bias'] = mx.nd.array([0.5, 0, 0, 0, 0.5, 0], ctx=dev) grad_grad = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes] exe = stn.bind(dev, args=args, args_grad=grad_grad) exe.forward(is_train=True) out = exe.outputs[0].asnumpy() # check forward assert_almost_equal(out, args['data'].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4], rtol=1e-2, atol=1e-4) out_grad = mx.nd.ones(out.shape, ctx=dev) exe.backward([out_grad]) # check backward assert_almost_equal(out_grad.asnumpy(), grad_grad[0].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4], rtol=1e-2, atol=1e-4) def test_stn_valid_sampling(): target_shape = ( 28, 28, ) src_shape = ( 42, 42, ) data = mx.sym.Variable(name="data") loc = mx.sym.Variable(name="loc") data_array = np.zeros(( 1, 1, ) + src_shape) # Have an ever so slight rotation. loc_array = np.array( [[9.03887e-05, 1.00015, 0.00174931, 1.0003, 0.000311901, -0.000919065]]) stn = mx.sym.SpatialTransformer( data=data, loc=loc, target_shape=target_shape, transform_type="affine", sampler_type="bilinear") grad_req = {k: 'write' for k in stn.list_arguments()} grads = { 'data': mx.nd.array(np.zeros_like(data_array)), 'loc': mx.nd.array(np.zeros_like(loc_array)) } executor = stn.bind( ctx=default_context(), args={'data': mx.nd.array(data_array), 'loc': mx.nd.array(loc_array)}, grad_req=grad_req, args_grad=grads) executor.forward(is_train=True) executor.backward(mx.nd.ones(( 1, 1, ) + target_shape)) # @haojin2: Getting rid of fixed seed as flakiness could not be reproduced, # tracked at https://github.com/apache/incubator-mxnet/issues/11714 @with_seed() def test_dot(): ctx=default_context() dtypes = ['float32', 'float64'] if ctx.device_type == 'gpu': dtypes += ['float16'] # Test normal dot. 
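# For C = A.dot(B) with out-gradient dC, the analytic gradients are dA = dC.dot(B.T) and
# dB = A.T.dot(dC); the numpy reference arrays below are filled with exactly these products.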
for data_type in dtypes: for m in range(1, 5): for k in range(1, 5): for n in range(1, 5): a_npy = np.random.normal(0, 1, (m, k)) a_npy = a_npy.astype(data_type) b_npy = np.random.normal(0, 1, (k, n)) b_npy = b_npy.astype(data_type) c_npy = np.empty((m, n), dtype=data_type) ograd_npy = np.random.normal(0, 1, (m, n)) ograd_npy = ograd_npy.astype(data_type) agrad_npy = np.empty((m, k), dtype=data_type) bgrad_npy = np.empty((k, n), dtype=data_type) c_npy[:, :] = np.dot(a_npy[:, :], b_npy[:, :]) bgrad_npy[:, :] = np.dot(a_npy[:, :].T, ograd_npy[:, :]) agrad_npy[:, :] = np.dot(ograd_npy[:, :], b_npy[:, :].T) a = mx.sym.Variable('a', dtype=data_type) b = mx.sym.Variable('b', dtype=data_type) c = mx.sym.dot(a, b) exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape) outputs = exe.forward(is_train=True, a=a_npy, b=b_npy) assert_almost_equal(outputs[0].asnumpy(), c_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-3) exe.backward(out_grads=[mx.nd.array(ograd_npy, mx.cpu()).astype(data_type)]) assert_almost_equal(exe.grad_dict['a'].asnumpy(), agrad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-3) assert_almost_equal(exe.grad_dict['b'].asnumpy(), bgrad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-3) # Test dot with transpose flag using gradient checker. def dot_sym(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y) def dot_sym_xT(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y, transpose_a=True) def dot_sym_yT(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y, transpose_b=True) def dot_sym_xT_yT(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y, transpose_a=True, transpose_b=True) for data_type in dtypes: for ashape, bshape in [((3, 4), (4, 5)), ((2, 3, 4), (4, 5, 6))]: m1_npy = np.random.uniform(-1, 1, ashape) m1_npy = m1_npy.astype(data_type) m2_npy = np.random.uniform(-1, 1, bshape) m2_npy = m2_npy.astype(data_type) check_numeric_gradient(dot_sym(data_type), [m1_npy, m2_npy], numeric_eps=1e-1, rtol=2e-2, atol=1e-3) check_numeric_gradient(dot_sym_xT(data_type), [m1_npy.T, m2_npy], numeric_eps=1e-1, rtol=2e-2, atol=1e-3) check_numeric_gradient(dot_sym_yT(data_type), [m1_npy, m2_npy.T], numeric_eps=1e-1, rtol=2e-2, atol=1e-3) check_numeric_gradient(dot_sym_xT_yT(data_type), [m1_npy.T, m2_npy.T], numeric_eps=1e-1, rtol=2e-2, atol=1e-3) @with_seed() def test_batch_dot(): dtypes = ['float32', 'float64'] if default_context().device_type == 'gpu': dtypes += ['float16'] for data_type in dtypes: for batch_size in range(1, 5): for m in range(1, 5): for k in range(1, 5): for n in range(1, 5): transpose_a = (np.random.rand() > 0.5) transpose_b = (np.random.rand() > 0.5) a_npy = np.random.normal(0, 1, (batch_size, m, k)) a_npy = a_npy.astype(data_type) b_npy = np.random.normal(0, 1, (batch_size, k, n)) b_npy = b_npy.astype(data_type) c_npy = np.empty((batch_size, m, n), dtype=data_type) ograd_npy = np.random.normal(0, 1, (batch_size, m, n)) ograd_npy = ograd_npy.astype(data_type) agrad_npy = np.empty((batch_size, m, k), dtype=data_type) bgrad_npy = np.empty((batch_size, k, n), dtype=data_type) a_init_grad_npy = np.random.normal(size=(batch_size, m, k)) a_init_grad_npy = 
a_npy.astype(data_type) b_init_grad_npy = np.random.normal(size=(batch_size, k, n)) b_init_grad_npy = b_npy.astype(data_type) for i in range(batch_size): c_npy[i, :, :] = np.dot(a_npy[i, :, :], b_npy[i, :, :]) bgrad_npy[i, :, :] = np.dot(a_npy[i, :, :].T, ograd_npy[i, :, :]) agrad_npy[i, :, :] = np.dot(ograd_npy[i, :, :], b_npy[i, :, :].T) a = mx.sym.Variable('a', dtype=data_type) b = mx.sym.Variable('b', dtype=data_type) c = mx.sym.batch_dot(a, b, transpose_a=transpose_a, transpose_b=transpose_b) if transpose_a: a_npy = np.transpose(a_npy, axes=(0, 2, 1)) agrad_npy = np.transpose(agrad_npy, axes=(0, 2, 1)) a_init_grad_npy = np.transpose(a_init_grad_npy, axes=(0, 2, 1)) if transpose_b: b_npy = np.transpose(b_npy, axes=(0, 2, 1)) bgrad_npy = np.transpose(bgrad_npy, axes=(0, 2, 1)) b_init_grad_npy = np.transpose(b_init_grad_npy, axes=(0, 2, 1)) exe = c.simple_bind(ctx=default_context(), a=a_npy.shape, b=b_npy.shape, grad_req='write') exe_add = c.simple_bind(ctx=default_context(), a=a_npy.shape, b=b_npy.shape, grad_req='add') exe_add.grad_dict['a'][:] = a_init_grad_npy exe_add.grad_dict['b'][:] = b_init_grad_npy outputs = exe.forward(is_train=True, a=a_npy, b=b_npy) assert_almost_equal(outputs[0].asnumpy(), c_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)]) assert_almost_equal(exe.grad_dict['a'].asnumpy(), agrad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) assert_almost_equal(exe.grad_dict['b'].asnumpy(), bgrad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) exe_add.forward(is_train=True, a=a_npy, b=b_npy) exe_add.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)]) assert_almost_equal(exe_add.grad_dict['a'].asnumpy(), agrad_npy + a_init_grad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) assert_almost_equal(exe_add.grad_dict['b'].asnumpy(), bgrad_npy + b_init_grad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) def get_correlation(data1,data2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply): img1 = mx.sym.Variable('img1') img2 = mx.sym.Variable('img2') return mx.sym.Correlation(data1=img1,data2=img2,kernel_size =kernel_size,max_displacement = max_displacement, stride1 = stride1,stride2 = stride2,pad_size= pad_size,is_multiply = is_multiply) def correlation_forward(data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply): # compute output's dimension paddedbottomheight = data1.shape[2] + 2 * pad_size paddedbottomwidth = data1.shape[3] + 2 * pad_size kernel_radius = (kernel_size - 1) // 2 border_size = max_displacement + kernel_radius top_width = (paddedbottomwidth - border_size * 2) // stride1 top_height = (paddedbottomheight - border_size * 2) // stride1 neighborhood_grid_radius = max_displacement // stride2 neighborhood_grid_width = neighborhood_grid_radius * 2 + 1 top_channels = neighborhood_grid_width * neighborhood_grid_width out = np.zeros((data1.shape[0], top_channels, top_height, top_width)) tmp1 = np.zeros((data1.shape[0],data1.shape[1],paddedbottomheight, paddedbottomwidth)) tmp2 = np.zeros((data1.shape[0],data1.shape[1],paddedbottomheight, paddedbottomwidth)) tmp1[:, :, pad_size:pad_size + data1.shape[2], pad_size:pad_size + data1.shape[3]] = data1[:,:,:,:] tmp2[:, :, pad_size:pad_size + data2.shape[2], pad_size:pad_size 
+ data2.shape[3]] = data2[:,:,:,:] for i in range(top_height): for j in range(top_width): for nbatch in range(data1.shape[0]): # x1,y1 is the location in data1 , i,j is the location in output x1 = j * stride1 + max_displacement y1 = i * stride1 + max_displacement for top_channel in range(top_channels): s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2 s2p = (top_channel // neighborhood_grid_width - neighborhood_grid_radius) * stride2 # location in data2 x2 = x1 + s2o y2 = y1 + s2p for h in range(kernel_size): for w in range(kernel_size): for channel in range(data1.shape[1]): if is_multiply: out[nbatch, top_channel, i, j] += tmp1[nbatch, channel,y1 + h, x1 + w] * tmp2[nbatch, channel, y2 + h,x2 + w] else: out[nbatch, top_channel, i, j] += abs(tmp1[nbatch, channel, y1 + h, x1 + w] - tmp2[nbatch, channel, y2 + h, x2 + w]) out /= float(kernel_size**2*data1.shape[1]) return out,tmp1,tmp2 def correlation_backward(out_grad,tmp1,tmp2,data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply): # compute output's dimension paddedbottomheight = data1.shape[2] + 2 * pad_size paddedbottomwidth = data1.shape[3] + 2 * pad_size kernel_radius = (kernel_size - 1) // 2 border_size = max_displacement + kernel_radius top_width = (paddedbottomwidth - border_size * 2) // stride1 top_height = (paddedbottomheight - border_size * 2) // stride1 neighborhood_grid_radius = max_displacement // stride2 neighborhood_grid_width = neighborhood_grid_radius * 2 + 1 top_channels = neighborhood_grid_width * neighborhood_grid_width out = np.zeros((data1.shape[0], top_channels, top_height, top_width)) tmp1_grad = np.zeros(tmp1.shape) tmp2_grad = np.zeros(tmp2.shape) for i in range(top_height): for j in range(top_width): for nbatch in range(data1.shape[0]): # x1,y1 is the location in data1 , i,j is the location in output x1 = j * stride1 + max_displacement y1 = i * stride1 + max_displacement for top_channel in range(top_channels): s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2 s2p = (top_channel // neighborhood_grid_width - neighborhood_grid_radius) * stride2 # location in data2 x2 = x1 + s2o y2 = y1 + s2p for h in range(kernel_size): for w in range(kernel_size): for channel in range(data1.shape[1]): if is_multiply: tmp1_grad[nbatch,channel,y1+h,x1+w]+= out_grad[nbatch,top_channel,i,j]*tmp2[nbatch, channel, y2 + h,x2 + w] tmp2_grad[nbatch,channel,y2+h,x2+w]+= out_grad[nbatch,top_channel,i,j]*tmp1[nbatch, channel, y1 + h,x1 + w] else: sgn = 1 if (tmp1[nbatch, channel, y1 + h,x1 + w]>=tmp2[nbatch, channel, y2 + h,x2 + w]) else -1 tmp1_grad[nbatch,channel,y1+h,x1+w]+= out_grad[nbatch,top_channel,i,j]*sgn tmp2_grad[nbatch,channel,y2+h,x2+w]+= out_grad[nbatch,top_channel,i,j]*(-sgn) tmp1_grad = tmp1_grad / float(kernel_size**2*data1.shape[1]) tmp2_grad = tmp2_grad / float(kernel_size**2*data1.shape[1]) return tmp1_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]],tmp2_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]], def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply,dtype): img1 = np.random.random(data_shape) img1 = img1.astype(dtype) img2 = np.random.random(data_shape) img2 = img2.astype(dtype) net1 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply) net2 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply ) exe1 = 
    net1.simple_bind(default_context(), img1=img1.shape, img2=img1.shape)
    exe1.arg_dict['img1'][:] = img1
    exe1.arg_dict['img2'][:] = img2
    # cpu forward
    exe1.forward(is_train=True)
    # python forward
    forward_result, tmp1, tmp2 = correlation_forward(img1, img2, pad_size, kernel_size, stride1, stride2, max_displacement, is_multiply)
    # forward error
    assert_almost_equal(exe1.outputs[0].asnumpy(), forward_result, rtol=1e-4, atol=1e-4)
    # out_grad
    a = np.ones(forward_result.shape)
    out_grad1 = mx.nd.array(a, default_context())
    # cpu backward
    exe1.backward(out_grads=out_grad1)
    # python backward
    grad1, grad2 = correlation_backward(a, tmp1, tmp2, img1, img2, pad_size, kernel_size, stride1, stride2, max_displacement, is_multiply)
    # backward error
    assert_almost_equal(exe1.grad_dict['img1'].asnumpy(), grad1, rtol=1e-3, atol=1e-4)
    assert_almost_equal(exe1.grad_dict['img2'].asnumpy(), grad2, rtol=1e-3, atol=1e-4)

@with_seed()
def test_correlation():
    def test_infer_type(dtype):
        a = mx.sym.Variable('a')
        b = mx.sym.Variable('b')
        corr = mx.sym.Correlation(data1=a, data2=b)
        arg_type1, out_type1, _ = corr.infer_type(a=dtype)
        if arg_type1[0] != np.dtype(dtype) and arg_type1[1] != np.dtype(dtype) and out_type1[0] != np.dtype(dtype):
            msg = npt.build_err_msg([a, b],
                                    err_msg="Inferred type from a is not as expected, "
                                            "Expected :%s %s %s, Got: %s %s %s"
                                            % (dtype, dtype, dtype, arg_type1[0], arg_type1[1], out_type1[0]),
                                    names=['a', 'b'])
            raise AssertionError(msg)
        arg_type2, out_type2, _ = corr.infer_type(b=dtype)
        if arg_type2[0] != np.dtype(dtype) and arg_type2[1] != np.dtype(dtype) and out_type2[0] != np.dtype(dtype):
            msg = npt.build_err_msg([a, b],
                                    err_msg="Inferred type from b is not as expected, "
                                            "Expected :%s %s %s, Got: %s %s %s"
                                            % (dtype, dtype, dtype, arg_type2[0], arg_type2[1], out_type2[0]),
                                    names=['a', 'b'])
            raise AssertionError(msg)

    for dtype in ['float16', 'float32']:
        test_infer_type(dtype)
        unittest_correlation((1,3,10,10), kernel_size=1, max_displacement=4, stride1=1, stride2=1, pad_size=4, is_multiply=False, dtype=dtype)
        unittest_correlation((5,1,15,15), kernel_size=1, max_displacement=5, stride1=1, stride2=1, pad_size=5, is_multiply=False, dtype=dtype)
        unittest_correlation((5,1,15,15), kernel_size=1, max_displacement=5, stride1=1, stride2=1, pad_size=5, is_multiply=True, dtype=dtype)
        unittest_correlation((5,1,15,15), kernel_size=1, max_displacement=10, stride1=1, stride2=2, pad_size=10, is_multiply=True, dtype=dtype)
        unittest_correlation((5,1,4,4), kernel_size=3, max_displacement=1, stride1=1, stride2=1, pad_size=2, is_multiply=True, dtype=dtype)
        unittest_correlation((5,1,4,4), kernel_size=3, max_displacement=1, stride1=2, stride2=1, pad_size=2, is_multiply=True, dtype=dtype)
        unittest_correlation((5,1,4,4), kernel_size=3, max_displacement=1, stride1=2, stride2=1, pad_size=2, is_multiply=False, dtype=dtype)
        unittest_correlation((5,1,6,4), kernel_size=3, max_displacement=1, stride1=2, stride2=1, pad_size=2, is_multiply=False, dtype=dtype)
        unittest_correlation((5,1,11,11), kernel_size=5, max_displacement=1, stride1=1, stride2=1, pad_size=2, is_multiply=False, dtype=dtype)

@with_seed()
def test_support_vector_machine_l1_svm():
    xpu = default_context()
    shape = (20, 10)
    X = mx.symbol.Variable('X')
    L = mx.symbol.Variable('L')
    Y = mx.symbol.SVMOutput(data=X, label=L, use_linear=True)
    x = mx.nd.empty(shape, ctx=xpu)
    l = mx.nd.empty((shape[0],), ctx=xpu)
    x_np = np.random.rand(*shape)
    l_np = np.random.randint(0, shape[1], (shape[0],))
    x[:] = x_np
    l[:]
= l_np grad = mx.nd.empty(shape, ctx = xpu) exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) exec1.forward(is_train=True) assert_almost_equal(x_np, exec1.outputs[0].asnumpy()) exec1.backward() l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1])) l_mask = np.array(l_mask, dtype=np.float32)*2 -1 grad_np = (-1) * l_mask * np.greater(1 - l_mask * x_np, 0) assert_almost_equal(grad_np, grad.asnumpy()) @with_seed() def test_support_vector_machine_l2_svm(): xpu = default_context() shape = (20, 10) X = mx.symbol.Variable('X') L = mx.symbol.Variable('L') Y = mx.symbol.SVMOutput(data=X, label=L) x = mx.nd.empty(shape, ctx = xpu) l = mx.nd.empty((shape[0],), ctx = xpu) x_np = np.random.rand(*shape) x_np = x_np.astype(np.float32) l_np = np.random.randint(0, shape[1], (shape[0],)) x[:] = x_np l[:] = l_np grad = mx.nd.empty(shape, ctx = xpu) exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) exec1.forward(is_train=True) assert_almost_equal(x_np, exec1.outputs[0].asnumpy()) exec1.backward() l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1])) l_mask = np.array(l_mask, dtype=np.float32)*2 -1 grad_np = (-2)*l_mask*np.maximum(1-l_mask*x_np,0) grad_np = grad_np.astype(np.float32) assert_almost_equal(grad_np, grad.asnumpy()) # Seed set because the test is not robust enough to operate on random data @with_seed(1234) def test_roipooling(): data = mx.symbol.Variable(name='data') rois = mx.symbol.Variable(name='rois') test = mx.symbol.ROIPooling(data=data, rois=rois, pooled_size=(4, 4), spatial_scale=1) x1 = np.random.rand(4, 3, 12, 8).astype('float32') x2 = np.array([[0, 1.1, 1.1, 6.2, 6.2], [2, 6.1, 2.1, 8.2, 11.2], [1, 3.1, 1.1, 5.2, 10.2], [0, 3, 3, 3, 3]], dtype='float32') check_numeric_gradient(sym=test, location=[x1, x2], grad_nodes={'data':'write', 'rois':'null'}, numeric_eps=1e-4, rtol=1e-1, atol=1e-4) check_numeric_gradient(sym=test, location=[x1, x2], grad_nodes={'data':'add', 'rois':'null'}, numeric_eps=1e-4, rtol=1e-1, atol=1E-4) def check_pad_with_shape(shape, xpu, pad_width, mode, dtype="float64"): # bind with label X = mx.symbol.Variable('X', dtype=dtype) Y = mx.symbol.Pad(data=X, mode=mode, pad_width=pad_width) x = mx.random.uniform(-1, 1, shape, ctx=mx.cpu(), dtype=dtype).copyto(xpu) # numpy result pad_grouped = list(zip(*[iter(list(pad_width))] * 2)) np_out = np.pad(x.asnumpy(), pad_grouped, mode) # mxnet result grad = mx.nd.empty(shape, ctx = xpu, dtype=dtype) exec1 = Y.bind(xpu, args = [x], args_grad = {'X': grad}) exec1.forward(is_train=True) out = exec1.outputs[0].asnumpy() # compare numpy + mxnet assert_almost_equal(out, np_out) # grad check check_numeric_gradient(Y, [x.asnumpy()], numeric_eps=1e-2, rtol=1e-2) @with_seed() def test_pad(): ctx = default_context() shape1 = (2, 3, 3, 5) pad1 = (0, 0, 0, 0, 1, 2, 3, 4) shape2 = (2, 3, 3, 5, 4) pad2 = (0, 0, 0, 0, 1, 2, 3, 4, 3, 1) # note: this op doesn't support ints yet. 
Add tests when supported dtypes = ["float16", "float32", "float64"] for dtype in dtypes: check_pad_with_shape(shape1, ctx, pad1, 'constant', dtype) check_pad_with_shape(shape1, ctx, pad1, 'edge', dtype) check_pad_with_shape(shape2, ctx, pad2, 'constant', dtype) check_pad_with_shape(shape2, ctx, pad2, 'edge', dtype) check_pad_with_shape(shape1, ctx, pad1, 'reflect', dtype) check_pad_with_shape(shape2, ctx, pad2, 'reflect', dtype) def np_instance_norm(data, weight, bias, eps): spatial_dims = data.shape[2::] num_spatial_vals = np.prod(np.array(spatial_dims)) scale = 1/float(num_spatial_vals) sum_axis = tuple(range(2, data.ndim)) mean = scale * np.sum(data, axis = sum_axis) mean = np.reshape(np.repeat(mean, num_spatial_vals), data.shape) var = scale * np.sum((data - mean)**2, axis = sum_axis) var = np.reshape(np.repeat(var, num_spatial_vals), data.shape) weightBatch = np.tile(weight, (data.shape[0], 1)) weightBatch = np.reshape(np.repeat(weightBatch, num_spatial_vals), data.shape) biasBatch = np.tile(bias, (data.shape[0], 1)) biasBatch = np.reshape(np.repeat(biasBatch, num_spatial_vals), data.shape) return weightBatch * (data - mean)/np.sqrt(var + eps) + biasBatch def check_instance_norm_with_shape(shape, xpu): # bind with label eps = 0.001 X = mx.symbol.Variable('X') G = mx.symbol.Variable('G') B = mx.symbol.Variable('B') Y = mx.symbol.InstanceNorm(data=X, beta=B, gamma=G, eps=eps) x = mx.random.normal(0, 1, shape, ctx=mx.cpu()).copyto(xpu) gamma = mx.random.normal(0, 1, shape[1], ctx=mx.cpu()).copyto(xpu) beta = mx.random.normal(0, 1, shape[1], ctx=mx.cpu()).copyto(xpu) np_out = np_instance_norm(x.asnumpy(), gamma.asnumpy(), beta.asnumpy(), eps) exec1 = Y.bind(xpu, args = {'X':x, 'G':gamma, 'B':beta}) exec1.forward(is_train=False) out = exec1.outputs[0].asnumpy() assert_almost_equal(out, np_out, rtol=1e-4, atol=1e-4) check_numeric_gradient(Y, {'X':x.asnumpy(), 'G':gamma.asnumpy(), 'B':beta.asnumpy()}, numeric_eps=1e-2, rtol=1e-2, atol=1e-2) @with_seed() def test_instance_normalization(): check_instance_norm_with_shape((1, 1, 1), default_context()) check_instance_norm_with_shape((2, 1, 2), default_context()) check_instance_norm_with_shape((2,4,5,6), default_context()) check_instance_norm_with_shape((3,3,2,3,2,1,1), default_context()) def check_l2_normalization(in_shape, mode, dtype, norm_eps=1e-10): ctx = default_context() data = mx.symbol.Variable('data') out = mx.symbol.L2Normalization(data=data, mode=mode, eps=norm_eps) in_data = np.random.uniform(-1, 1, in_shape).astype(dtype) # calculate numpy results if mode == 'channel': assert in_data.ndim > 2 np_norm = np.linalg.norm(in_data, axis=1) + norm_eps np_norm = np.repeat(1. / np.expand_dims(np_norm, axis=1), in_data.shape[1], axis=1) np_out = np.multiply(in_data, np_norm) elif mode == 'spatial': assert in_data.ndim > 2 s = in_data.shape np_norm = np.linalg.norm(in_data.reshape((s[0], s[1], -1)), axis=2) + norm_eps np_norm = np.repeat(1. / np_norm[:, np.newaxis], in_data.size / s[0] / s[1], axis=2) np_out = np.multiply(in_data, np_norm.reshape(s)) elif mode == 'instance': assert in_data.ndim > 1 s = in_data.shape np_norm = np.linalg.norm(in_data.reshape((s[0], -1)), axis=1) + norm_eps np_norm = np.repeat(1. 
/ np_norm[:, np.newaxis], in_data.size / s[0], axis=1) np_out = np.multiply(in_data, np_norm.reshape(s)) else: raise RuntimeError('Unknown l2 normalization mode') exe = out.simple_bind(ctx=ctx, data=in_data.shape) output = exe.forward(is_train=True, data=in_data) # compare numpy + mxnet assert_almost_equal(exe.outputs[0].asnumpy(), np_out, rtol=1e-2 if dtype is 'float16' else 1e-5, atol=1e-5) # check gradient check_numeric_gradient(out, [in_data], numeric_eps=1e-3, rtol=1e-2, atol=5e-3) @with_seed() def test_l2_normalization(): for dtype in ['float16', 'float32', 'float64']: for mode in ['channel', 'spatial', 'instance']: nbatch = random.randint(1, 4) nchannel = random.randint(3, 5) height = random.randint(4, 6) check_l2_normalization((nbatch, nchannel, height), mode, dtype) width = random.randint(5, 7) check_l2_normalization((nbatch, nchannel, height, width), mode, dtype) def check_layer_normalization(in_shape, axis, eps, dtype=np.float32, forward_check_eps=1E-3): def npy_layer_norm(data, gamma, beta, axis=1, eps=1E-5): if axis < 0: axis += data.ndim broadcast_shape = [1 for _ in range(data.ndim)] broadcast_shape[axis] = data.shape[axis] mean = data.mean(axis=axis, keepdims=True).astype(dtype) var = data.var(axis=axis, keepdims=True).astype(dtype) std = np.sqrt(var + dtype(eps)).astype(dtype) out = np.reshape(gamma, broadcast_shape) * (data - mean) / std + \ np.reshape(beta, broadcast_shape) return out ctx = default_context() data = np.random.normal(0, 1, in_shape).astype(dtype) gamma = np.random.normal(0, 1, (in_shape[axis],)).astype(dtype) beta = np.random.normal(0, 1, (in_shape[axis],)).astype(dtype) data_s = mx.symbol.Variable('data') gamma_s = mx.symbol.Variable('gamma') beta_s = mx.symbol.Variable('beta') out_s = mx.symbol.LayerNorm(data=data_s, gamma=gamma_s, beta=beta_s, axis=axis, eps=eps) exe = out_s.simple_bind(ctx, data=in_shape) exe.arg_dict['data'][:] = data exe.arg_dict['gamma'][:] = gamma exe.arg_dict['beta'][:] = beta out_nd = exe.forward()[0] out = npy_layer_norm(data, gamma, beta, axis, eps) assert_almost_equal(out, out_nd.asnumpy(), forward_check_eps, forward_check_eps) for req in ['write', 'add']: check_numeric_gradient(out_s, {'data': data, 'gamma': gamma, 'beta': beta}, grad_nodes={'data': req, 'gamma': req, 'beta': req}, numeric_eps=1e-2, rtol=1e-2, atol=1e-2) @with_seed() def test_norm(): try: import scipy assert LooseVersion(scipy.__version__) >= LooseVersion('0.1') from scipy.linalg import norm as sp_norm except (AssertionError, ImportError): print("Could not import scipy.linalg.norm or scipy is too old. 
" "Falling back to numpy.linalg.norm which is not numerically stable.") from numpy.linalg import norm as sp_norm def l1norm(input_data, axis=0, keepdims=True): return np.sum(abs(input_data), axis=axis, keepdims=keepdims) def l2norm(input_data, axis=0, keepdims=True): return sp_norm(input_data, axis=axis, keepdims=keepdims) ctx = default_context() data = mx.symbol.Variable('data') in_data_dim = random_sample([4,5,6], 1)[0] in_shape = rand_shape_nd(in_data_dim) epsilon = 1e-3 for order in [1, 2]: for dtype in [np.float16, np.float32, np.float64]: in_data = np.random.uniform(-1, 1, in_shape).astype(dtype) in_data[abs(in_data) < epsilon] = 2 * epsilon for i in range(in_data_dim): norm_sym = mx.symbol.norm(data=data, ord=order, axis=i, keepdims=True) npy_out = l1norm(in_data, i) if order is 1 else l2norm(in_data, i) npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out check_symbolic_forward(norm_sym, [in_data], [npy_out], rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) check_symbolic_backward(norm_sym, [in_data], [np.ones(npy_out.shape)], [npy_out_backward], rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) # Disable numeric gradient https://github.com/apache/incubator-mxnet/issues/11509 # # check gradient # if dtype is not np.float16: # check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-1, atol=1e-3) if i < in_data_dim-1: norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, i+1), keepdims=True) npy_out = l1norm(in_data, (i, i+1)) if order is 1 else l2norm(in_data, (i, i+1)) npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out check_symbolic_forward(norm_sym, [in_data], [npy_out], rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) check_symbolic_backward(norm_sym, [in_data], [np.ones(npy_out.shape)], [npy_out_backward], rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) # # check gradient # if dtype is not np.float16: # check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-1, atol=1e-3) def test_layer_norm(): for dtype, forward_check_eps in zip([np.float16, np.float32, np.float64], [1E-2, 1E-3, 1E-4]): for in_shape in [(10, 6, 5), (10, 10)]: for axis in range(-len(in_shape), len(in_shape)): for eps in [1E-2, 1E-3]: check_layer_normalization(in_shape, axis, eps, dtype=dtype, forward_check_eps=forward_check_eps) # Numpy Implementation of Sequence Ops def sequence_last_numpy(array, lengths, axis): # create new array of dims [batch, seqlen, ...] array2 = np.moveaxis(array, axis, 1) dims = array2.shape if lengths is None: return array2[:, -1] lengths = list(lengths) return np.array([array2[i, int(lengths[i]) - 1] for i in range(dims[0])]) def sequence_mask_numpy(array, lengths, axis, value): if lengths is None: return array arrayMask = array.copy() # conform to [batch, seqlen, ...] arrayMask = np.moveaxis(arrayMask, axis, 1) shape = arrayMask.shape lengths = list(lengths) for i in range(shape[0]): arrayMask[i, int(lengths[i]):] = value return np.moveaxis(arrayMask, 1, axis) def sequence_reverse_numpy(array, lengths, axis): rarray = array.copy() # conform to [batch, seqlen, ...] 
rarray = np.moveaxis(rarray, axis, 1) shape = rarray.shape if lengths is None: lengths = [shape[1]] * shape[0] lengths = list(lengths) for i in range(shape[0]): j = int(lengths[i]) rarray[i,:j] = rarray[i,:j][::-1] return np.moveaxis(rarray, 1, axis) def check_sequence_func(ftype, mask_value=0, axis=0): # bind with label xpu = default_context() X = mx.symbol.Variable('X') L = mx.symbol.Variable('L') # lengths shapes = [(3, 4), (1, 1), (3, 4, 3, 1, 1)] for seqlenQ in [True, False]: for s in shapes: x = mx.random.uniform(-1, 1, s, ctx=mx.cpu()).copyto(xpu) batch = s[1] if (axis == 0) else s[0] seqlen = s[axis] l_np = np.random.randint(1, seqlen + 1, batch) l = mx.nd.array(l_np, ctx=mx.cpu()).copyto(xpu) if not seqlenQ: l_np = None args = {'data':X, 'use_sequence_length':seqlenQ, "axis":axis} if seqlenQ: args['sequence_length'] = L if ftype == "last": Y = mx.symbol.SequenceLast(**args) np_out = sequence_last_numpy(x.asnumpy(), l_np, axis) elif ftype == "mask": args['value'] = mask_value Y = mx.symbol.SequenceMask(**args) np_out = sequence_mask_numpy(x.asnumpy(), l_np, axis, mask_value) elif ftype == "reverse": Y = mx.symbol.SequenceReverse(**args) np_out = sequence_reverse_numpy(x.asnumpy(), l_np, axis) fargs = [x, l] if seqlenQ else [x] gargs = [x.asnumpy(), l_np] if seqlenQ else [x.asnumpy()] check_symbolic_forward(Y, fargs, [np_out]) check_numeric_gradient(Y, gargs, grad_nodes={'X':'write'}, numeric_eps=1e-2, rtol=1e-2) check_numeric_gradient(Y, gargs, grad_nodes={'X':'add'}, numeric_eps=1e-3, rtol=1e-2, atol=1E-4) check_numeric_gradient(Y, gargs, grad_nodes={'X':'null'}, numeric_eps=1e-3, rtol=1e-2, atol=1E-4) @with_seed() @unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/11395") def test_sequence_last(): check_sequence_func("last", axis=0) check_sequence_func("last", axis=1) @with_seed() def test_sequence_mask(): check_sequence_func("mask", axis = 0, mask_value=-2.3) check_sequence_func("mask", axis = 1, mask_value=0.3) def check_sequence_reverse(xpu): # sample data arr = np.array( [[[ 1., 2., 3.], [ 4., 5., 6.]], [[ 7., 8., 9.], [ 10., 11., 12.]], [[ 13., 14., 15.], [ 16., 17., 18.]]]) arr1 = np.array( [[[ 13., 14., 15.], [ 16., 17., 18.]], [[ 7., 8., 9.], [ 10., 11., 12.]], [[ 1., 2., 3.], [ 4., 5., 6.]]]) arr2 = np.array( [[[ 7., 8., 9.], [ 10., 11., 12.]], [[ 1., 2., 3.], [ 4., 5., 6.]], [[ 13., 14., 15.], [ 16., 17., 18.]]]) arr3 = np.array( [[[ 7., 8., 9.], [ 16., 17., 18.]], [[ 1., 2., 3.], [ 10., 11., 12.]], [[ 13., 14., 15.], [ 4., 5., 6.]]]) # test for matrix case seq_len_1 = [1, 2, 2] arr_4 = np.array([[7., 8., 9.], [16., 17., 5.4]], dtype=np.float32) arr_5 = np.array([[7., 17., 5.4], [16., 8., 9.]], dtype=np.float32) def test_wrapper(arr, xpu, sequence_length=None, use_sequence_length=False): # MxNet symbol creation seq = mx.sym.Variable('seq') if sequence_length and use_sequence_length: seq_len = mx.sym.Variable('seq_len') else: # ensure that both are disabled, not just one seq_len=None use_sequence_length=False rev = mx.sym.SequenceReverse(data=seq, sequence_length=seq_len, use_sequence_length=use_sequence_length) # MxNet symbol execution if sequence_length: bound = rev.bind(xpu, {'seq': mx.nd.array(arr), 'seq_len': mx.nd.array(sequence_length)}) else: bound = rev.bind(xpu, {'seq': mx.nd.array(arr)}) fwd = bound.forward() return fwd[0].asnumpy() # test cases assert_array_equal(test_wrapper(arr, xpu, use_sequence_length=False), arr1) assert_array_equal(test_wrapper(arr, xpu, sequence_length=[3, 3], use_sequence_length=True), arr1) 
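# The sequence-op checks around this point rely on the time-major layout
# convention used in this file (axis 0 = sequence, axis 1 = batch when axis=0
# is passed).  The following standalone sketch is illustrative only and is not
# invoked by any test; the helper name _example_sequence_reverse_sketch is
# introduced here and exists nowhere else.  It shows what reversing with
# per-example lengths means on a tiny time-major array.
def _example_sequence_reverse_sketch():
    import numpy as np
    x = np.arange(6, dtype=np.float32).reshape(3, 2, 1)  # (seq_len=3, batch=2, feat=1)
    lengths = [2, 3]  # batch element 0 has 2 valid steps, element 1 has 3
    out = x.copy()
    for b, l in enumerate(lengths):
        # reverse only the valid prefix of each sequence; padding steps stay put
        out[:l, b] = out[:l, b][::-1]
    return out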
assert_array_equal(test_wrapper(arr, xpu, sequence_length=[2, 2], use_sequence_length=True), arr2) assert_array_equal(test_wrapper(arr, xpu, sequence_length=[2, 3], use_sequence_length=True), arr3) assert_array_equal(test_wrapper(arr_4, xpu, sequence_length=seq_len_1, use_sequence_length=True), arr_5) @with_seed() def test_sequence_reverse(): check_sequence_func("reverse", axis=0) check_sequence_reverse(mx.cpu()) def mathematical_core_binary(name, forward_mxnet_call, forward_numpy_call, backward_numpy_call1, backward_numpy_call2, data1_init=2., data2_init=3., grad_init=2.): data1 = mx.symbol.Variable('data') data2 = mx.symbol.Variable('data') shape = (3, 4) data_tmp1 = np.random.rand(3, 4) data_tmp2 = np.random.rand(3, 4) data_tmp1[:] = data1_init data_tmp2[:] = data2_init arr_data1 = mx.nd.array(data_tmp1) arr_data2 = mx.nd.array(data_tmp2) arr_grad1 = mx.nd.empty(shape) arr_grad2 = mx.nd.empty(shape) test = forward_mxnet_call(data1, data2) exe_test = test.bind(default_context(), args=[arr_data1, arr_data2], args_grad=[arr_grad1, arr_grad2]) exe_test.forward(is_train=True) out = exe_test.outputs[0].asnumpy() npout = forward_numpy_call(data_tmp1, data_tmp2) assert_almost_equal(out, npout) out_grad = mx.nd.empty(shape) out_grad[:] = grad_init exe_test.backward(out_grad) npout_grad = np.ones(shape) npout_grad[:] = grad_init npout_grad1 = npout_grad * backward_numpy_call1(data_tmp1, data_tmp2) npout_grad2 = npout_grad * backward_numpy_call2(data_tmp1, data_tmp2) arr_grad1 = arr_grad1.asnumpy() arr_grad2 = arr_grad2.asnumpy() assert_almost_equal(arr_grad1, npout_grad1) assert_almost_equal(arr_grad2, npout_grad2) def mathematical_core(name, forward_mxnet_call, forward_numpy_call, backward_numpy_call, data_init=5., grad_init=2.): data = mx.symbol.Variable('data') shape = (3, 4) data_tmp = np.ones(shape) data_tmp[:] = data_init arr_data = mx.nd.array(data_tmp) arr_grad = mx.nd.empty(shape) arr_grad[:] = 3 test = forward_mxnet_call(data) exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0].asnumpy() npout = forward_numpy_call(data_tmp) assert_almost_equal(out, npout) out_grad = mx.nd.empty(shape) out_grad[:] = grad_init npout_grad = out_grad.asnumpy() temp = backward_numpy_call(data_tmp) npout_grad = npout_grad * temp exe_test.backward(out_grad) arr_grad = arr_grad.asnumpy() # print(name) # print(arr_grad) # print(npout_grad) assert_almost_equal(arr_grad, npout_grad) @with_seed() def test_special_functions_using_scipy(): try: from scipy import special as scipy_special except: print("Could not import scipy. 
Skipping unit tests for special functions") return # gamma mathematical_core("gamma", lambda x: mx.sym.gamma(x), lambda x: scipy_special.gamma(x), lambda x: scipy_special.gamma(x) * scipy_special.psi(x), 0.5, 0.5) # gammaln mathematical_core("gammaln", lambda x: mx.sym.gammaln(x), lambda x: scipy_special.gammaln(x), lambda x: scipy_special.psi(x), 0.5, 0.5) # erf mathematical_core("erf", lambda x: mx.sym.erf(x), lambda x: scipy_special.erf(x), lambda x: 2.0 / math.sqrt(math.pi) * math.exp(-(x ** 2)), 0.5, 0.5) def rounding(name, forward_mxnet_call, forward_numpy_call, data_init=5., grad_init=2.): data = mx.symbol.Variable('data') shape = (3, 4) data_tmp = np.ones(shape) data_tmp[:] = data_init arr_data = mx.nd.array(data_tmp) test = forward_mxnet_call(data) exe_test = test.bind(default_context(), args=[arr_data]) exe_test.forward(is_train=True) out = exe_test.outputs[0].asnumpy() npout = forward_numpy_call(data_tmp) assert_almost_equal(out, npout) @with_seed() def test_mathematical(): # rsqrt mathematical_core("rsqrt", lambda x: mx.sym.rsqrt(x), lambda x: 1 / np.sqrt(x), lambda x: -(1.0 / (2.0 * x * np.sqrt(x)))) # tan mathematical_core("tan", lambda x: mx.sym.tan(x), lambda x: np.tan(x), lambda x: np.tan(x) ** 2 + 1) # arcsin mathematical_core("arcsin", lambda x: mx.sym.arcsin(x), lambda x: np.arcsin(x), lambda x: 1. / (1. - x ** 2) ** (1. / 2.), 0.5, 0.5) # arccos mathematical_core("arccos", lambda x: mx.sym.arccos(x), lambda x: np.arccos(x), lambda x: -1. / (1. - x ** 2.) ** (1. / 2.), 0.5, 0.5) # arctan mathematical_core("arctan", lambda x: mx.sym.arctan(x), lambda x: np.arctan(x), lambda x: 1. / (x ** 2. + 1.), 0.5, 0.5) # hypot mathematical_core_binary("hypot", lambda x, y: mx.sym.hypot(x, y), lambda x, y: np.hypot(x, y), lambda x, y: x / np.hypot(x, y), lambda x, y: y / np.hypot(x, y), 0.5, 0.5, 0.5) # hypot scalar mathematical_core("hypot scalar", lambda x: mx.sym.hypot(x, 3), lambda x: np.hypot(x, 3), lambda x: x / np.hypot(x, 3), 0.5, 0.5) # degrees mathematical_core("degrees", lambda x: mx.sym.degrees(x), lambda x: np.degrees(x), lambda x: 180./np.pi, 0.5, 0.5) # radians mathematical_core("radians", lambda x: mx.sym.radians(x), lambda x: np.radians(x), lambda x: np.pi / 180., 0.6, 1) # sinh mathematical_core("sinh", lambda x: mx.sym.sinh(x), lambda x: np.sinh(x), lambda x: np.cosh(x)) # cosh mathematical_core("cosh", lambda x: mx.sym.cosh(x), lambda x: np.cosh(x), lambda x: np.sinh(x), 5, 5) # tanh mathematical_core("tanh", lambda x: mx.sym.tanh(x), lambda x: np.tanh(x), lambda x: 1. - np.tanh(x) ** 2, 0.5, 1) # arcsinh mathematical_core("arcsinh", lambda x: mx.sym.arcsinh(x), lambda x: np.arcsinh(x), lambda x: 1./(x**2 + 1.)**(1./2.)) # arccosh mathematical_core("arccosh", lambda x: mx.sym.arccosh(x), lambda x: np.arccosh(x), lambda x: 1./(x**2 - 1.)**(1./2.)) # arctanh mathematical_core("arctanh", lambda x: mx.sym.arctanh(x), lambda x: np.arctanh(x), lambda x: -1./(x**2 - 1.), 0.5) # log1p mathematical_core("log1p", lambda x: mx.sym.log1p(x), lambda x: np.log1p(x), lambda x: 1. / (1.0 + x), 0.5, 0.5) # expm1 mathematical_core("expm1", lambda x: mx.sym.expm1(x), lambda x: np.expm1(x), lambda x: np.exp(x), 0.5, 0.5) # log10 mathematical_core("log10", lambda x: mx.sym.log10(x), lambda x: np.log10(x), lambda x: 1. / (x * np.log(10.))) # log2 mathematical_core("log2", lambda x: mx.sym.log2(x), lambda x: np.log2(x), lambda x: 1. 
                      / (x * np.log(2.)))

    # rint
    rounding("rint", lambda x: mx.sym.rint(x), lambda x: np.rint(x))

    # fix
    rounding("fix", lambda x: mx.sym.fix(x), lambda x: np.fix(x))


@with_seed()
def test_clip():
    data = mx.symbol.Variable('data')
    shape = (30, 30)
    data_tmp = np.random.uniform(-1, 1, shape)
    test = mx.sym.clip(data, a_max=0.6, a_min=-0.6)
    check_symbolic_forward(test, [data_tmp], [np.clip(data_tmp, -0.6, 0.6)])
    check_symbolic_backward(test, [data_tmp], [np.ones(shape)],
                            [np.where(data_tmp < 0.6, [1], [0]) * np.where(data_tmp > -0.6, [1], [0])])


@with_seed()
def test_init():
    def test_basic_val_init(sym_func, np_func, shape, dtype):
        x = sym_func(shape=shape, dtype=dtype)
        exe = x.bind(default_context(), args=[], args_grad=[])
        exe.forward(is_train=True)
        assert_almost_equal(exe.outputs[0].asnumpy(), np_func(shape=shape, dtype=dtype))
        assert exe.outputs[0].asnumpy().dtype == dtype

    def test_arange():
        # General Random Tests
        dtype_list = [np.float32, np.float64, np.int32, np.uint8]
        config_list = [(10,),
                       (0, 10),
                       (5, 100, 4),
                       (50, -50, -2),
                       (-100, 100, 1),
                       (1.3, 456.6, 1.3)]
        for dtype in dtype_list:
            for config in config_list:
                repeats = random.choice([1, 3])
                np_out = np.repeat(np.arange(*config, dtype=dtype), repeats)
                nd_out = mx.nd.arange(*config, repeat=repeats, dtype=dtype)
                assert_almost_equal(np_out, nd_out.asnumpy())

    def test_arange_inferstop():
        s = mx.sym.arange(start=0, stop=None, infer_range=True)
        s = mx.sym.elemwise_add(s, mx.sym.zeros(shape=[5]))
        exe = s.bind(ctx=mx.cpu(), args={})
        exe.forward()
        assert_almost_equal(exe.outputs[0].asnumpy(), np.array([0,1,2,3,4]))

    test_basic_val_init(mx.sym.zeros, np.zeros, (3, 4), np.float32)
    test_basic_val_init(mx.sym.ones, np.ones, 3, np.int32)
    test_basic_val_init(mx.sym.ones, np.ones, (2, 2, 3), np.float16)
    test_arange()
    test_arange_inferstop()


@with_seed()
def test_order():
    ctx = default_context()

    def gt_topk(dat, axis, ret_typ, k, is_ascend):
        if ret_typ == "indices":
            if is_ascend:
                indices = np.arange(k)
            else:
                indices = np.arange(-1, -k-1, -1)
            ret = np.take(dat.argsort(axis=axis), axis=axis, indices=indices, mode='wrap')
        elif ret_typ == "value":
            if is_ascend:
                indices = np.arange(k)
            else:
                indices = np.arange(-1, -k-1, -1)
            ret = np.take(np.sort(dat, axis=axis), axis=axis, indices=indices, mode='wrap')
        else:
            assert dat.shape == (5, 5, 5, 5)
            assert axis is None or axis == 1
            ret = np.zeros(dat.shape)
            if is_ascend:
                indices = np.arange(k)
            else:
                indices = np.arange(-1, -k-1, -1)
            gt_argsort = np.take(dat.argsort(axis=axis), axis=axis, indices=indices, mode='wrap')
            if axis is None:
                ret.ravel()[gt_argsort] = 1
            else:
                for i in range(5):
                    for j in range(5):
                        for k in range(5):
                            ret[i, gt_argsort[i, :, j, k], j, k] = 1
        return ret

    dshape = (5, 5, 5, 5)
    a_npy = np.arange(np.prod(dshape)).astype(np.float32)
    np.random.shuffle(a_npy)
    a_npy = a_npy.reshape(dshape)
    a = mx.sym.Variable('a')

    def get_large_matrix():
        data = np.array([np.arange(300096).astype(np.float32)])
        data = np.repeat(data, 100, axis=0)
        np.apply_along_axis(np.random.shuffle, 1, data)
        return data

    large_matrix_npy = \
get_large_matrix() for axis in [1, 3, None]: K = [1, 3, 5, 7] if axis is None else [1, 3, 5] for k in K: for is_ascend in [True, False]: b = mx.sym.topk(a, axis=axis, is_ascend=is_ascend, ret_typ="value", k=k) out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=k, is_ascend=is_ascend) check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, ctx=ctx) check_symbolic_forward(b, location={'a': a_npy}, expected=[out_npy]) for axis in [1, 3, None]: for is_ascend in [True, False]: b = mx.sym.sort(a, axis=axis, is_ascend=is_ascend) if axis is None: out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=a_npy.size, is_ascend=is_ascend) else: out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=5, is_ascend=is_ascend) check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, ctx=ctx) check_symbolic_forward(b, location={'a': a_npy}, expected=[out_npy]) b = mx.sym.topk(a, axis=1, is_ascend=is_ascend, ret_typ="indices", k=5) check_symbolic_backward(sym=b, location={'a': large_matrix_npy}, out_grads=[np.random.normal(size=(100, 5))], expected=[np.zeros((100, 300096))]) check_symbolic_forward(b, location={'a': large_matrix_npy}, expected=[gt_topk(dat=large_matrix_npy, axis=1, ret_typ="indices", k=5, is_ascend=is_ascend)]) b = mx.sym.topk(a, axis=3, is_ascend=is_ascend, ret_typ="indices", k=3) check_symbolic_backward(sym=b, location={'a': a_npy}, out_grads=[np.random.normal(size=(5, 5, 5, 3))], expected=[np.zeros((5, 5, 5, 5))]) check_symbolic_forward(b, location={'a': a_npy}, expected=[gt_topk(dat=a_npy, axis=3, ret_typ="indices", k=3, is_ascend=False)]) b = mx.sym.topk(a, axis=1, is_ascend=True, ret_typ="mask", k=3) check_symbolic_backward(sym=b, location={'a': a_npy}, out_grads=[np.random.normal(size=(5, 5, 5, 5))], expected=[np.zeros((5, 5, 5, 5))]) check_symbolic_forward(b, location={'a': a_npy}, expected=[gt_topk(dat=a_npy, axis=1, ret_typ="mask", k=3, is_ascend=True)]) b = mx.sym.argsort(a, axis=1, is_ascend=False) check_symbolic_backward(sym=b, location={'a': a_npy}, out_grads=[np.random.normal(size=(5, 5, 5, 5))], expected=[np.zeros((5, 5, 5, 5))]) check_symbolic_forward(b, location={'a': a_npy}, expected=[gt_topk(dat=a_npy, axis=1, ret_typ="indices", k=5, is_ascend=False)]) b = mx.sym.argmax(a, axis=1, keepdims=True) check_symbolic_backward(sym=b, location={'a': a_npy}, out_grads=[np.random.normal(size=(5, 5, 5, 5))], expected=[np.zeros((5, 5, 5, 5))]) check_symbolic_forward(b, location={'a': a_npy}, expected=[gt_topk(dat=a_npy, axis=1, ret_typ="indices", k=1, is_ascend=False)]) b = mx.sym.argmin(a, axis=1, keepdims=True) check_symbolic_backward(sym=b, location={'a': a_npy}, out_grads=[np.random.normal(size=(5, 5, 5, 5))], expected=[np.zeros((5, 5, 5, 5))]) check_symbolic_forward(b, location={'a': a_npy}, expected=[gt_topk(dat=a_npy, axis=1, ret_typ="indices", k=1, is_ascend=True)]) @with_seed() def test_blockgrad(): a = mx.sym.Variable('a') b = mx.sym.BlockGrad(a) exe = b.simple_bind(ctx=default_context(), a=(10, 10)) a_npy = np.random.rand(10, 10) exe.forward(is_train=True, a=a_npy) assert_almost_equal(exe.outputs[0].asnumpy(), a_npy) exe.backward() # No error if BlockGrad works @with_seed() def test_take(): def grad_helper(grad_in, axis, idx): if axis == 0: if axis == len(grad_in.shape) - 1: grad_in[idx] += 1.0 else: grad_in[idx, :] += 1.0 elif axis == 1: if axis == len(grad_in.shape) - 1: grad_in[:, idx] += 1.0 else: grad_in[:, idx, :] += 1.0 elif axis == 2: if axis == len(grad_in.shape) - 1: grad_in[:, :, idx] += 1.0 else: grad_in[:, :, idx, :] += 1.0 
elif axis == 3: if axis == len(grad_in.shape) - 1: grad_in[:, :, :, idx] += 1.0 else: grad_in[:, :, :, idx, :] += 1.0 elif axis == 4: grad_in[:, :, :, :, idx] += 1.0 else: raise ValueError("axis %d is not supported..." % axis) def check_output_n_grad(data_shape, idx_shape, axis, mode): data = mx.sym.Variable('a') idx = mx.sym.Variable('indices') idx = mx.sym.BlockGrad(idx) result = mx.sym.take(a=data, indices=idx, axis=axis, mode=mode) exe = result.simple_bind(default_context(), a=data_shape, indices=idx_shape, axis=axis, mode=mode) data_real = np.random.normal(size=data_shape).astype('float32') idx_real = np.random.randint(low=0, high=data_shape[axis], size=idx_shape) if axis < 0: axis += len(data_shape) grad_out = np.ones((data_shape[0:axis] if axis > 0 else ()) + idx_shape + (data_shape[axis+1:] if axis < len(data_shape) - 1 else ()), dtype='float32') grad_in = np.zeros(data_shape, dtype='float32') exe.arg_dict['a'][:] = mx.nd.array(data_real) exe.arg_dict['indices'][:] = mx.nd.array(idx_real) exe.forward(is_train=True) assert_almost_equal(exe.outputs[0].asnumpy(), np.take(data_real, idx_real, axis=axis, mode=mode)) for i in np.nditer(idx_real): grad_helper(grad_in, axis, i) exe.backward([mx.nd.array(grad_out)]) assert_almost_equal(exe.grad_dict['a'].asnumpy(), grad_in) def check_autograd_req(): row_len = 2 col_len = 8 shape = (row_len, col_len) sc = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype="float32") sc.attach_grad() i = mx.nd.array([0], dtype="int64") j = mx.nd.array([0], dtype="int64") with mx.autograd.record(train_mode=True): xs = [] for _ in range(row_len): x_i = [] for _ in range(col_len): x_ij = sc.take(i).squeeze(axis=0).take(j).squeeze(axis=0) x_i.append(x_ij) j = j + 1 i = i + 1 j = j - col_len # reset j xs.append(mx.nd.stack(*x_i)) x = mx.nd.stack(*xs) x = x.sum() x.backward() assert_almost_equal(np.ones(sc.grad.shape), sc.grad.asnumpy()) for mode in ['clip', 'wrap']: for data_ndim in range(1, 5): for idx_ndim in range(1, 4): for axis in range(-data_ndim, data_ndim): data_shape = () for _ in range(data_ndim): data_shape += (np.random.randint(low=1, high=5), ) idx_shape = () for _ in range(idx_ndim): idx_shape += (np.random.randint(low=1, high=5), ) check_output_n_grad(data_shape, idx_shape, axis, mode) check_autograd_req() @with_seed() def test_grid_generator(): # transform_type = affine test_case = [(20,21),(4,3),(6,12),(15,17)] for target_shape in test_case: affine_matrix = mx.sym.Variable('affine') grid = mx.sym.GridGenerator(data=affine_matrix,transform_type='affine', target_shape=target_shape) exe = grid.simple_bind(ctx=default_context(), affine=(1,6), grad_req='write') # check forward exe.arg_dict['affine'][:] = np.array([[1.0,0,0,0,1.0,0]]) exe.forward(is_train=True) output = exe.outputs[0].asnumpy() output[0,0,:,:] = (output[0,0,:,:] + 1) * (target_shape[1] - 1) / 2.0 output[0,1,:,:] = (output[0,1,:,:] + 1) * (target_shape[0] - 1) / 2.0 xv, yv = np.meshgrid(np.arange(target_shape[0]), np.arange(target_shape[1])) assert_almost_equal(output[0,0], yv.T) assert_almost_equal(output[0,1], xv.T) # check backward out_grad = np.random.normal(size=(1,2)+target_shape) exe.backward(mx.nd.array(out_grad)) tmp = np.zeros((3,target_shape[0]*target_shape[1])) tmp[0] = -1.0 + (np.arange(target_shape[0]*target_shape[1]) % target_shape[1]) * (2.0 / (target_shape[1]-1)) tmp[1] = -1.0 + (np.arange(target_shape[0]*target_shape[1]) // target_shape[1]) * (2.0 / (target_shape[0]-1)) tmp[2] = 1 grad_est = 
np.dot(out_grad[0].reshape(2,target_shape[0]*target_shape[1]),tmp.T).reshape(1,6) assert_almost_equal(exe.grad_dict['affine'].asnumpy(), grad_est, rtol=1e-3, atol=1e-5) # check addto exe = grid.simple_bind(ctx=default_context(), affine=(1,6), grad_req='add') grid_grad_npy = np.random.normal(size=exe.grad_dict['affine'].shape) exe.grad_dict['affine'][:] = grid_grad_npy exe.arg_dict['affine'][:] = np.array([[1.0, 0, 0, 0, 1.0, 0]]) exe.forward(is_train=True) exe.backward(mx.nd.array(out_grad)) assert_almost_equal(exe.grad_dict['affine'].asnumpy(), grad_est + grid_grad_npy, rtol=1e-2, atol=1e-5) # transform_type = warp test_case = [(12,21),(4,3),(6,12)] for target_shape in test_case: flow = mx.sym.Variable('flow') grid = mx.sym.GridGenerator(data=flow,transform_type='warp', target_shape=target_shape) exe = grid.simple_bind(ctx=default_context(), flow=(1,2)+target_shape, grad_req='write') # check forward exe.arg_dict['flow'][:] = np.ones((1,2)+target_shape) exe.forward(is_train=True) output = exe.outputs[0].asnumpy() output[0,0,:,:] = (output[0,0,:,:] + 1) * (target_shape[1] - 1) / 2.0 output[0,1,:,:] = (output[0,1,:,:] + 1) * (target_shape[0] - 1) / 2.0 xv, yv = np.meshgrid(np.arange(target_shape[0])+1, np.arange(target_shape[1])+1) assert_almost_equal(output[0,0], yv.T) assert_almost_equal(output[0,1], xv.T) # check backward out_grad = np.random.normal(size=(1,2)+target_shape) exe.backward(mx.nd.array(out_grad)) grad_est = np.zeros((1,2)+target_shape) grad_est[0,0] = out_grad[0,0] / ((target_shape[1]-1.0) / 2.0) grad_est[0,1] = out_grad[0,1] / ((target_shape[0]-1.0) / 2.0) assert_almost_equal(exe.grad_dict['flow'].asnumpy(), grad_est, rtol=1e-3) # check addto exe_add = grid.simple_bind(ctx=default_context(), flow=(1, 2) + target_shape, grad_req='add') flow_grad_npy = np.random.normal(size=exe_add.grad_dict['flow'].shape) exe_add.arg_dict['flow'][:] = np.ones((1, 2) + target_shape) exe_add.grad_dict['flow'][:] = flow_grad_npy exe_add.forward(is_train=True) exe_add.backward(mx.nd.array(out_grad)) assert_almost_equal(exe_add.grad_dict['flow'].asnumpy(), grad_est + flow_grad_npy, rtol=1e-3, atol=1e-5) @with_seed() def test_index2d(): for _ in range(30): n = np.random.randint(1, 100) m = np.random.randint(1, 500) data = mx.random.uniform(-1, 1, shape=(n, m), ctx=default_context()) x = mx.nd.array(np.random.randint(0, m, size=n), ctx=default_context(), dtype='int32') r = mx.nd.batch_take(data, x) assert_almost_equal(r.asnumpy(), data.asnumpy()[np.arange(n), x.asnumpy()]) @with_seed() def test_cast(): for srctype in [np.int32, np.float32, np.float16]: for dsttype in [np.float32, np.int32, np.float16]: x = mx.sym.Variable('x', dtype=srctype) y = mx.sym.Cast(x, dtype=dsttype) exe = y.simple_bind(ctx=default_context(), x=(10, 10)) assert exe.arg_arrays[0].dtype == srctype assert exe.outputs[0].dtype == dsttype X = np.random.uniform(-10, 10, size=(10, 10)) exe.arg_arrays[0][:] = X exe.forward(is_train=True) exe.backward(mx.nd.array(X, dtype=dsttype, ctx=default_context())) assert_almost_equal(exe.outputs[0].asnumpy(), X.astype(srctype).astype(dsttype), rtol=1e-3, atol=1e-5) assert_almost_equal(exe.grad_arrays[0].asnumpy(), X.astype(dsttype).astype(srctype), rtol=1e-3, atol=1e-5) @with_seed() def test_repeat(): def test_repeat_forward(): ndim_max = 6 # max number of dims of the ndarray size_max = 10 # max number of elements in each dim repeats = 3 for ndim in range(1, ndim_max+1): shape = () for i in range(0, ndim): shape += (np.random.randint(1, size_max+1), ) a = np.random.random_sample(size=shape) aa 
= np.repeat(a, repeats) b = mx.nd.array(a, ctx=default_context()) bb = mx.nd.repeat(b, repeats).asnumpy() assert_almost_equal(aa, bb) for axis in range(0, ndim): aa = np.repeat(a, repeats, axis) bb = mx.nd.repeat(b, repeats, axis).asnumpy() assert_almost_equal(aa, bb) def test_repeat_backward(axis): data = mx.sym.Variable('data') n1 = 3 n2 = 4 shape = (n1, n2) data_tmp = np.random.randint(0, 10, n1 * n2).reshape(shape) arr_data = mx.nd.array(data_tmp) arr_grad = mx.nd.empty(shape) repeats = 2 test = mx.sym.repeat(data, repeats=repeats, axis=axis) exe = test.bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) npout_grad = np.random.randint(0, 10, n1 * n2 * repeats) if axis == 0: npout_grad = npout_grad.reshape(n1 * repeats, n2) elif axis == 1: npout_grad = npout_grad.reshape(n1, n2 * repeats) else: raise RuntimeError("Invalid axis value") out_grad = mx.nd.array(npout_grad) exe.backward(out_grad) expected_grad = np.zeros(shape) if axis == 0: for i in range(shape[0]): for j in range(shape[1]): k = i * repeats expected_grad[i][j] = sum(npout_grad[k:k + repeats, j]) elif axis == 1: for j in range(shape[1]): for i in range(shape[0]): k = j * repeats expected_grad[i][j] = sum(npout_grad[i, k:k + repeats]) else: raise RuntimeError("Invalid axis value") assert_almost_equal(expected_grad, arr_grad.asnumpy(), rtol=1e-3) def test_repeat_numeric_gradient(): data = mx.sym.Variable('data') n1 = 3 n2 = 4 shape = (n1, n2) data_tmp = np.random.randint(0, 10, n1 * n2).reshape(shape) repeats = 2 test = mx.sym.repeat(data, repeats=repeats, axis=0) check_numeric_gradient(test, [data_tmp], numeric_eps=1e-3, rtol=1e-2) test_repeat_forward() test_repeat_backward(axis=0) test_repeat_backward(axis=1) test_repeat_numeric_gradient() @with_seed() def test_reverse(): data = mx.symbol.Variable('data') shape = (5, 5, 5) data_tmp = np.random.uniform(-1, 1, shape) test = mx.sym.reverse(data, axis=[1, 2]) grad = np.random.uniform(-1, 1, shape) check_numeric_gradient(test, [data_tmp], numeric_eps=2E-2) check_symbolic_forward(test, [data_tmp], [data_tmp[:, ::-1, ::-1]]) check_symbolic_backward(test, [data_tmp], [grad], [grad[:, ::-1, ::-1]]) @with_seed() def test_tile(): def test_normal_case(): ndim_min = 1 ndim_max = 5 # max number of dims of the ndarray size_max = 10 # max number of elements in each dim length_max = 3 # max length of reps rep_max = 10 # max number of tiling in each dim for ndim in range(ndim_min, ndim_max+1): shape = [] for i in range(1, ndim+1): shape.append(np.random.randint(1, size_max+1)) shape = tuple(shape) a = np.random.randint(0, 100, shape) b = mx.nd.array(a, dtype=a.dtype) reps_len = np.random.randint(1, length_max+1) reps_tuple = () for i in range(1, reps_len): reps_tuple += (np.random.randint(1, rep_max), ) reps_array = np.asarray(reps_tuple) a_tiled = np.tile(a, reps_array) b_tiled = mx.nd.tile(b, reps_tuple).asnumpy() assert same(a_tiled, b_tiled) def test_empty_tensor(): shape = (2, 3, 0, 4) a = np.array([], dtype=np.int32).reshape(shape) b = mx.nd.array(a, ctx=default_context(), dtype=a.dtype) reps = (2, 4, 6) a_tiled = np.tile(a, reps) b_tiled = mx.nd.tile(b, reps).asnumpy() assert same(a_tiled, b_tiled) def test_empty_reps(): a = np.array([[2, 3, 4], [5, 6, 7]], dtype=np.int32) b = mx.nd.array(a, ctx=default_context(), dtype=a.dtype) a_tiled = np.tile(a, ()) b_tiled = mx.nd.tile(b, ()).asnumpy() assert same(a_tiled, b_tiled) def test_tile_backward(): data = mx.sym.Variable('data') n1 = 2 n2 = 2 shape = (n1, n2) data_tmp = np.random.randint(0, 10, n1 * n2).reshape(shape) 
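# The repeat and tile backward checks in this file accumulate the output
# gradient over every position that was copied from the same input element.
# As a purely illustrative, standalone sketch (not called by the tests; the
# name _example_tile_grad_sketch is introduced only here), the same reduction
# can be written with plain numpy for a 2x2 input tiled by reps=(2, 2):
def _example_tile_grad_sketch():
    import numpy as np
    n1, n2, reps = 2, 2, (2, 2)
    out_grad = np.arange(n1 * reps[0] * n2 * reps[1], dtype=np.float64).reshape(n1 * reps[0], n2 * reps[1])
    in_grad = np.zeros((n1, n2))
    for i in range(n1):
        for j in range(n2):
            # output position (p, q) reads input (p % n1, q % n2), so every copy
            # of element (i, j) sits at rows i, i+n1, ... and cols j, j+n2, ...
            in_grad[i, j] = out_grad[i::n1, j::n2].sum()
    return in_grad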
arr_data = mx.nd.array(data_tmp) arr_grad = mx.nd.empty(shape) reps1 = 2 reps2 = 2 reps = (reps1, reps2) test = mx.sym.tile(data, reps=reps) exe = test.bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) npout_grad = np.random.randint(0, 10, n1 * n2 * reps1 * reps2).reshape(n1 * reps1, n2 * reps2) out_grad = mx.nd.array(npout_grad) exe.backward(out_grad) expected_grad = np.zeros(shape) for i in range(shape[0]): for j in range(shape[1]): expected_grad[i][j] += sum(sum(npout_grad[i:(n1 * reps1):reps1, j:(n2 * reps2):reps2])) assert_almost_equal(expected_grad, arr_grad.asnumpy(), rtol=1e-3) def test_tile_numeric_gradient(): data = mx.sym.Variable('data') n1 = 2 n2 = 2 shape = (n1, n2) data_tmp = np.random.randint(0, 10, n1 * n2).reshape(shape) reps1 = 2 reps2 = 2 reps = (reps1, reps2) test = mx.sym.tile(data, reps=reps) check_numeric_gradient(test, [data_tmp], numeric_eps=1e-2, rtol=1e-2) def test_invalid_reps(): data = mx.nd.arange(16).reshape((4, 4)) assert_exception(mx.nd.tile, MXNetError, data, (1, 2, -3)) assert_exception(mx.nd.tile, MXNetError, data, (1, 0, 3)) test_normal_case() test_empty_tensor() test_empty_reps() test_tile_backward() test_tile_numeric_gradient() test_invalid_reps() @with_seed() def test_one_hot(): def test_normal_case(index_type=np.int32): ndim_max = 6 dim_size_max = 20 depth = int(dim_size_max / 2) on_value = 1 off_value = 0 for ndim in range(1, ndim_max+1): shape = () for i in range(1, ndim+1): shape += (np.random.randint(1, dim_size_max+1), ) indices = np.random.randint(-dim_size_max, dim_size_max+1, size=np.prod(shape)).reshape(shape) mx_one_hot_array = mx.nd.one_hot( mx.nd.array(indices, ctx=default_context(), dtype=index_type), depth=depth, dtype=np.int32) expected_array = np.zeros((np.prod(shape), depth), dtype=np.int32) expected_array[:] = off_value indices_1d = indices.flatten() row = 0 for idx in indices_1d: if 0 <= idx < depth: expected_array[row, idx] = on_value row += 1 expected_array = expected_array.reshape(shape + (depth, )) one_hot_array = mx_one_hot_array.asnumpy() assert same(expected_array, one_hot_array) def test_empty_indices(): shape = (2, 0, 9, 3) indices = np.array([]).reshape(shape) depth = 10 mx_one_hot_array = mx.nd.one_hot( mx.nd.array(indices, ctx=default_context(), dtype=np.int32), depth=depth, dtype=np.int32).asnumpy() expected_array = np.array([], dtype=np.int32).reshape(shape + (depth, )) assert same(expected_array, mx_one_hot_array) def test_zero_depth(): shape = (2, 4, 9, 3) indices = np.ones(shape) depth = 0 mx_one_hot_array = mx.nd.one_hot( mx.nd.array(indices, ctx=default_context(), dtype=np.int32), depth=depth, dtype=np.int32).asnumpy() expected_array = np.array([], dtype=np.int32).reshape(shape + (depth, )) assert same(expected_array, mx_one_hot_array) test_normal_case(index_type=np.int32) test_normal_case(index_type=np.float64) test_normal_case(index_type=np.float32) test_normal_case(index_type=np.float16) test_empty_indices() test_zero_depth() @with_seed() def test_where(): def get_forward_expected_output(condition, x, y): original_shape = x.shape out = np.zeros(original_shape) if condition.shape == x.shape: for index, c in np.ndenumerate(condition): if c != 0: out[index] = x[index] else: out[index] = y[index] elif condition.shape == (x.shape[0], ): s = x.shape m = s[0] n = int(np.prod(s)/s[0]) x2d = x.reshape((m, n)) y2d = y.reshape((m, n)) out = out.reshape((m, n)) for i in range(0, m): if condition[i] != 0: for j in range(0, n): out[i, j] = x2d[i, j] else: for j in range(0, n): out[i, j] = y2d[i, j] else: 
raise RuntimeError("Invalid condition shape for where op") out = out.reshape(original_shape) return out def get_forward_inputs_same_shape(shape): condition_np = np.random.randint(0, 2, np.prod(shape)).reshape(shape) x_np = np.random.randint(1, 6, np.prod(shape)).reshape(shape) y_np = np.random.randint(7, 11, np.prod(shape)).reshape(shape) return condition_np, x_np, y_np def get_forward_inputs_condition_vector(shape): condition_np = np.random.randint(0, 2, shape[0]) x_np = np.random.randint(1, 6, np.prod(shape)).reshape(shape) y_np = np.random.randint(7, 11, np.prod(shape)).reshape(shape) return condition_np, x_np, y_np def get_backward_input(shape): return np.random.randint(20, 30, np.prod(shape)).reshape(shape) def get_backward_expected_outputs(grad_in, condition): shape = grad_in.shape grad_cond = np.zeros(condition.shape) grad_x = np.empty(shape) grad_y = np.empty(shape) for index, c in np.ndenumerate(condition): if 0 != c: grad_x[index] = grad_in[index] grad_y[index] = 0 else: grad_x[index] = 0 grad_y[index] = grad_in[index] return grad_cond, grad_x, grad_y def test_where_helper(shape, same_shape): if same_shape: condition_np, x_np, y_np = get_forward_inputs_same_shape(shape) else: condition_np, x_np, y_np = get_forward_inputs_condition_vector(shape) out_expected = get_forward_expected_output(condition_np, x_np, y_np) grad_in_np = get_backward_input(shape) grad_expected_cond, grad_expected_x, grad_expected_y\ = get_backward_expected_outputs(grad_in_np, condition_np) condition = mx.sym.Variable('condition') x = mx.sym.Variable('x') y = mx.sym.Variable('y') grad_in_mx = mx.nd.array(grad_in_np, dtype=np.int32) where_sym = mx.sym.where(condition, x, y) # test req='write' where_exe_write = where_sym.simple_bind(ctx=default_context(), condition=condition_np.shape, x=x_np.shape, y=y_np.shape, grad_req='write') # test forward req='write' outputs = where_exe_write.forward(is_train=True, condition=condition_np, x=x_np, y=y_np) assert same(outputs[0].asnumpy(), out_expected) # test backward req='write' where_exe_write.backward(grad_in_mx) assert same(where_exe_write.grad_dict['x'].asnumpy(), grad_expected_x) assert same(where_exe_write.grad_dict['y'].asnumpy(), grad_expected_y) assert same(where_exe_write.grad_dict['condition'].asnumpy(), grad_expected_cond) # test req='add' x_grad_init = np.random.randint(30, 40, np.prod(shape)).reshape(shape) y_grad_init = np.random.randint(40, 50, np.prod(shape)).reshape(shape) where_exe_add = where_sym.simple_bind(ctx=default_context(), condition=condition_np.shape, x=x_np.shape, y=y_np.shape, grad_req='add') where_exe_add.grad_dict['x'][:] = x_grad_init where_exe_add.grad_dict['y'][:] = y_grad_init # test forward req='add' outputs = where_exe_add.forward(is_train=True, condition=condition_np, x=x_np, y=y_np) assert same(outputs[0].asnumpy(), out_expected) # test backward req='add' where_exe_add.backward(grad_in_mx) x_ograd = where_exe_add.grad_dict['x'].asnumpy() y_ograd = where_exe_add.grad_dict['y'].asnumpy() assert same(x_ograd, grad_expected_x+x_grad_init) assert same(y_ograd, grad_expected_y+y_grad_init) def test_where_numeric_gradient(shape, same_shape): condition = mx.sym.Variable('condition') x = mx.sym.Variable('x') y = mx.sym.Variable('y') where_sym = mx.sym.where(condition, x, y) if same_shape: condition_np, x_np, y_np = get_forward_inputs_same_shape(shape) else: condition_np, x_np, y_np = get_forward_inputs_condition_vector(shape) check_numeric_gradient(where_sym, [condition_np, x_np, y_np], grad_nodes=['x', 'y']) def test_invalid_shape(): 
condition = mx.sym.Variable('condition') x = mx.sym.Variable('x') y = mx.sym.Variable('y') where_sym = mx.sym.where(condition, x, y) assert_exception(lambda: where_sym.eval(x=mx.nd.array([[2,3],[4,5],[6,7]]), y=mx.nd.array([[8,9],[10,11],[12,13]]), condition=mx.nd.array([1,0])), MXNetError) assert_exception(lambda: mx.nd.where(x=mx.nd.array([[2,3],[4,5],[6,7]]), y=mx.nd.array([[8,9],[10,11],[12,13]]), condition=mx.nd.array([1,0])), MXNetError) def test_1d_cond(): cond = mx.nd.array([1, 0, 1]) x = mx.nd.array([[2, 3], [4, 5], [6, 7]]) y = mx.nd.array([[7, 8], [9, 10], [10, 11]]) expect_out = np.array([[2, 3], [9, 10], [6, 7]]) out = mx.nd.where(cond, x, y).asnumpy() assert(expect_out.all() == out.all()) test_where_helper((5, 9), True) test_where_helper((5, 9), False) test_where_helper((5, 7, 9), True) test_where_helper((5, 7, 9), False) test_where_helper((10, 8, 15, 3), True) test_where_helper((10, 8, 15, 3), False) test_where_numeric_gradient((5, 9), True) test_where_numeric_gradient((5, 9), False) test_where_numeric_gradient((5, 7, 9), True) test_where_numeric_gradient((5, 7, 9), False) test_invalid_shape() test_1d_cond() @with_seed() def test_softmin(): for ndim in range(1, 5): for dtype in [np.float16, np.float32, np.float64]: rtol, atol = (1e-2, 5e-3) if dtype is np.float16 else (1e-3, 1e-3) shape = np.random.randint(1, 5, size=ndim) axis = np.random.randint(-ndim, ndim) data = np.random.uniform(-2, 2, size=shape).astype(dtype) data = data / 10 if dtype is np.float16 else data sym = mx.sym.softmin(axis=axis) expected_fwd = np_softmax(-data, axis=axis) expected_bwd = np.zeros(shape) check_symbolic_forward(sym, [data], [expected_fwd], atol=atol, dtype=dtype) for req in ['null', 'add', 'write']: check_symbolic_backward(sym, [data], [np.ones(expected_fwd.shape)], [expected_bwd], rtol=rtol, atol=atol, grad_req=req, dtype=dtype) if dtype is not np.float16: check_numeric_gradient(sym, [data], rtol=rtol, atol=atol, dtype=dtype) @with_seed() def test_new_softmax(): for ndim in range(1, 5): shape = np.random.randint(1, 5, size=ndim) axis = np.random.randint(-ndim, ndim) data = np.random.uniform(-2, 2, size=shape) sym = mx.sym.softmax(axis=axis) expected_fwd = np_softmax(data, axis=axis) expected_bwd = np.zeros(shape) check_symbolic_forward(sym, [data], [expected_fwd]) for req in ['null', 'add', 'write']: check_symbolic_backward(sym, [data], [np.ones(expected_fwd.shape)], [expected_bwd], rtol=1e-2, atol=1e-3, grad_req=req) check_numeric_gradient(sym, [data], rtol=1e-2, atol=1e-3) @with_seed() def test_softmax_with_temperature(): for ndim in range(1, 5): shape = np.random.randint(1, 5, size=ndim) data = np.random.uniform(-2, 2, size=shape) for temp in range(1, 11): sym = mx.sym.softmax(axis=0, temperature=temp) expected_fwd = np_softmax(data, axis=0, temperature=temp) expected_bwd = np.zeros(shape) check_symbolic_forward(sym, [data], [expected_fwd], rtol=0.05, atol=1e-3) check_symbolic_backward(sym, [data], [np.ones(shape)], [expected_bwd], rtol=0.05, atol=1e-3) check_numeric_gradient(sym, [data], rtol=0.05, atol=1e-3) @with_seed() def test_log_softmax(): for ndim in range(1, 5): for _ in range(5): shape = np.random.randint(1, 5, size=ndim) axis = np.random.randint(0, ndim) data = np.random.uniform(-2, 2, size=shape) sym = mx.sym.log_softmax(axis=axis-ndim) check_symbolic_forward(sym, [data], [np.log(np_softmax(data, axis=axis)+1e-20)]) check_numeric_gradient(sym, [data], rtol=0.05, atol=1e-3) @with_seed() def test_pick(): def test_pick_helper(index_type=np.int32): for _ in range(100): for mode in 
['clip', 'wrap']: ndim = np.random.randint(1, 5) bshape = np.random.randint(1, 10, size=ndim) axis = np.random.randint(0, ndim) sshape = bshape.copy() sshape[axis] = 1 data = np.random.uniform(-1, 1, size=bshape) if mode == 'wrap': index = np.random.randint(-2*bshape[axis], 2*bshape[axis], size=sshape) else: index = np.random.randint(0, bshape[axis], size=sshape) exp = [] for i in range(ndim): if i == axis: if mode == 'wrap': exp.append(index % bshape[axis]) else: exp.append(index) else: ishape = [1 for _ in range(ndim)] ishape[i] = bshape[i] exp.append(np.arange(bshape[i]).reshape(ishape)) expected = data[exp] data = mx.nd.array(data, dtype='float32') index = mx.nd.array(index, dtype=index_type) out = mx.nd.pick(data, index, axis=axis, keepdims=True, mode=mode) assert_almost_equal(out.asnumpy(), expected) data_holder = data index_holder = index data = mx.sym.Variable('data') index = mx.sym.Variable('index') sym = mx.sym.pick(data, index, axis=axis, keepdims=True, mode=mode) check_numeric_gradient(sym, [data_holder, index_holder], grad_nodes=['data']) test_pick_helper(np.int32) test_pick_helper(np.float32) def check_ctc_loss(acts, labels, loss_truth): in_var = mx.sym.Variable('input') labels_var = mx.sym.Variable('labels') ctc = mx.sym.ctc_loss(in_var, labels_var) acts_nd = mx.nd.array(acts, ctx=default_context()) labels_nd = mx.nd.array(labels, ctx=default_context()) exe = ctc.bind(ctx=default_context(), args=[acts_nd, labels_nd]) # test forward with grad calc exe.forward(is_train=True) outTest = exe.outputs[0] # test forward without grad calc exe.forward(is_train=False) outTrain = exe.outputs[0] # make sure losses calculated with both modes are the same assert_almost_equal(outTest.asnumpy(), outTrain.asnumpy()) # test against ground truth, if available if loss_truth is not None: assert_almost_equal(outTest.asnumpy(), loss_truth) # test grad check_numeric_gradient(ctc, [acts, labels], grad_nodes=['input'], rtol=0.05, atol=1e-3) # check contrib operator for backward compatibility def check_contrib_ctc_loss(acts, labels, loss_truth): in_var = mx.sym.Variable('input') labels_var = mx.sym.Variable('labels') ctc = mx.sym.contrib.ctc_loss(in_var, labels_var) acts_nd = mx.nd.array(acts, ctx=default_context()) labels_nd = mx.nd.array(labels, ctx=default_context()) exe = ctc.bind(ctx=default_context(), args=[acts_nd, labels_nd]) # test forward with grad calc exe.forward(is_train=True) outTest = exe.outputs[0] # test forward without grad calc exe.forward(is_train=False) outTrain = exe.outputs[0] # make sure losses calculated with both modes are the same assert_almost_equal(outTest.asnumpy(), outTrain.asnumpy()) # test against ground truth, if available if loss_truth is not None: assert_almost_equal(outTest.asnumpy(), loss_truth) # test grad check_numeric_gradient(ctc, [acts, labels], grad_nodes=['input'], rtol=0.05, atol=1e-3) @with_seed() def test_ctc_loss(): # Test 1: check that batches are same + check against Torch WarpCTC acts = np.array([ [[1.2, 3.4, 1.2, -0.1, -2.34], [1.2, 3.4, 1.2, -0.1, -2.34]], [[0.1, 0.2, 0.3, 0.22, 0.123], [0.1, 0.2, 0.3, 0.22, 0.123]], [[-15, -14, -13, -12, -11], [-15, -14, -13, -12, -11]]], dtype=np.float32) labels = np.array([[2, 3, 0], [2, 3, 0]]) true_loss = np.array([4.04789, 4.04789], dtype=np.float32) # from Torch check_ctc_loss(acts, labels, true_loss) check_contrib_ctc_loss(acts, labels, true_loss) # Test 2: acts2 = np.array([ [[-5, -4, -3, -2, -1], [1.2, 3.4, 1.2, -0.1, -2.34]], [[-10, -9, -8, -7, -6], [0.1, 0.2, 0.3, 0.22, 0.123]], [[-15, -14, -13, -12, 
-11], [-15, -14.2, -13.5, -12.2, -11.22]]], dtype=np.float32) labels2 = np.array([[2, 3, 1], [2, 0, 0]], dtype=np.float32) true_loss = np.array([7.3557, 5.4091], dtype=np.float32) # from Torch check_ctc_loss(acts2, labels2, true_loss) check_contrib_ctc_loss(acts2, labels2, true_loss) # Test 3: check use integer type as label labels3 = np.array([[2, 3, 1], [2, 0, 0]], dtype=np.int32) true_loss = np.array([7.3557, 5.4091], dtype=np.float32) # from Torch check_ctc_loss(acts2, labels3, true_loss) check_contrib_ctc_loss(acts2, labels3, true_loss) @with_seed() def test_ctc_loss_with_large_classes(): ctx = default_context() num_classes = 6000 seq_len = 8 batch_size = 2 data = np.empty((num_classes, 0)) for i in range(seq_len * batch_size) : row = np.roll(np.arange(num_classes, dtype=np.float32), i).reshape(num_classes, 1) data = np.append(data, row/13, axis=1) data = data.reshape(seq_len, batch_size, num_classes) label = np.array([ [100, 200, 300, 400, 500, 0, 0, 0], [1000, 2000, 3000, 4000, 0, 5000, 0, 0]], dtype=np.int32) nd_data = mx.nd.array(data) nd_label = mx.nd.array(label) loss = mx.nd.ctc_loss(data=nd_data, label=nd_label) expected_loss = np.array([688.02826, 145.34462]) assert_almost_equal(loss.asnumpy(), expected_loss) @with_seed() def test_ctc_loss_grad(): def check_ctc_loss_grad(blank_label): # from tf vocab_size = 5 max_label_len = 5 padding_mask = -1+ (blank_label=='first') targets_0 = [0, 1, 2, 1, 0] loss_log_prob_0 = -3.34211 input_prob_matrix_0 = np.asarray( [[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]], dtype=np.float32) gradient_log_prob_0 = np.asarray( [[-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], [0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436], [0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688], [0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533], [-0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]], dtype=np.float32) targets_1 = [0, 1, 1, 0] loss_log_prob_1 = -5.42262 input_prob_matrix_1 = np.asarray( [[0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], [0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549], [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456], [0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345], [0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], dtype=np.float32) gradient_log_prob_1 = np.asarray( [[-0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], [0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549], [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544], [0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345], [-0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], dtype=np.float32) inputs = [ np.vstack( [input_prob_matrix_0[t, :], input_prob_matrix_1[t, :]]) for t in range(5) ] + 2 * [np.nan * np.ones((2, vocab_size+1), np.float32)] inputs = np.log(np.asarray(inputs, dtype=np.float32)) grad_truth = np.array([ np.vstack( [gradient_log_prob_0[t, :], gradient_log_prob_1[t, :]]) for t in range(5) ] + 2 * [np.zeros((2, vocab_size+1), np.float32)]) if blank_label == 'first': inputs = np.roll(inputs, 1, axis=2) grad_truth = np.roll(grad_truth, 1, axis=2) labels = (np.asarray([x + 
[padding_mask]*(max_label_len-len(x)) for x in [targets_0, targets_1]])+(blank_label == 'first')) seq_lens = np.array([5, 5], dtype=np.int32) label_lens = np.array([5, 4], dtype=np.int32) loss_truth = np.array([-loss_log_prob_0, -loss_log_prob_1], np.float32) with default_context(): data = mx.nd.array(inputs) label = mx.nd.array(labels) data.attach_grad() with mx.autograd.record(): l = mx.ndarray.CTCLoss(data, label, use_data_lengths=True, use_label_lengths=True, data_lengths=mx.nd.array(seq_lens), label_lengths=mx.nd.array(label_lens), blank_label=blank_label) l.backward() assert_almost_equal(l.asnumpy(), loss_truth, atol=1e-5, rtol=1e-5) assert_almost_equal(data.grad.asnumpy(), grad_truth, atol=1e-5, rtol=1e-5) # check contrib operator for backward compatibility def check_contrib_ctc_loss_grad(blank_label): # from tf vocab_size = 5 max_label_len = 5 padding_mask = -1+ (blank_label=='first') targets_0 = [0, 1, 2, 1, 0] loss_log_prob_0 = -3.34211 input_prob_matrix_0 = np.asarray( [[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]], dtype=np.float32) gradient_log_prob_0 = np.asarray( [[-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], [0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436], [0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688], [0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533], [-0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]], dtype=np.float32) targets_1 = [0, 1, 1, 0] loss_log_prob_1 = -5.42262 input_prob_matrix_1 = np.asarray( [[0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], [0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549], [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456], [0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345], [0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], dtype=np.float32) gradient_log_prob_1 = np.asarray( [[-0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], [0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549], [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544], [0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345], [-0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], dtype=np.float32) inputs = [ np.vstack( [input_prob_matrix_0[t, :], input_prob_matrix_1[t, :]]) for t in range(5) ] + 2 * [np.nan * np.ones((2, vocab_size+1), np.float32)] inputs = np.log(np.asarray(inputs, dtype=np.float32)) grad_truth = np.array([ np.vstack( [gradient_log_prob_0[t, :], gradient_log_prob_1[t, :]]) for t in range(5) ] + 2 * [np.zeros((2, vocab_size+1), np.float32)]) if blank_label == 'first': inputs = np.roll(inputs, 1, axis=2) grad_truth = np.roll(grad_truth, 1, axis=2) labels = (np.asarray([x + [padding_mask]*(max_label_len-len(x)) for x in [targets_0, targets_1]])+(blank_label == 'first')) seq_lens = np.array([5, 5], dtype=np.int32) label_lens = np.array([5, 4], dtype=np.int32) loss_truth = np.array([-loss_log_prob_0, -loss_log_prob_1], np.float32) with default_context(): data = mx.nd.array(inputs) label = mx.nd.array(labels) data.attach_grad() with mx.autograd.record(): l = mx.contrib.ndarray.CTCLoss(data, label, use_data_lengths=True, use_label_lengths=True, 
data_lengths=mx.nd.array(seq_lens), label_lengths=mx.nd.array(label_lens), blank_label=blank_label) l.backward() assert_almost_equal(l.asnumpy(), loss_truth, atol=1e-5, rtol=1e-5) assert_almost_equal(data.grad.asnumpy(), grad_truth, atol=1e-5, rtol=1e-5) check_ctc_loss_grad('first') check_ctc_loss_grad('last') check_contrib_ctc_loss_grad('first') check_contrib_ctc_loss_grad('last') @with_seed() def test_quantization_op(): min0 = mx.nd.array([0.0]) max0 = mx.nd.array([1.0]) a = mx.nd.array([[0.1392, 0.5928], [0.6027, 0.8579]]) qa, min1, max1 = mx.nd.contrib.quantize(a, min0, max0, out_type='uint8') a_ = mx.nd.contrib.dequantize(qa, min1, max1, out_type='float32') qa_real = mx.nd.array([[35, 151], [154, 219]]) a_real = mx.nd.array([[0.13725491, 0.59215689], [0.60392159, 0.8588236]]) assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) @with_seed() def test_index_copy(): x = mx.nd.zeros((5,3)) t = mx.nd.array([[1,2,3],[4,5,6],[7,8,9]]) index = mx.nd.array([0,4,2], dtype=np.int64) x.attach_grad() t.attach_grad() index.attach_grad() with mx.autograd.record(): out = mx.nd.contrib.index_copy(x, index, t) out.backward() tensor = mx.nd.array([[1,2,3],[0,0,0],[7,8,9],[0,0,0],[4,5,6]]) x_grad = mx.nd.array([[0,0,0],[1,1,1],[0,0,0],[1,1,1],[0,0,0]]) t_grad = mx.nd.array([[1,1,1],[1,1,1],[1,1,1]]) index_grad = mx.nd.array([0,0,0]) assert same(out.asnumpy(), tensor.asnumpy()) assert same(x.grad.asnumpy(), x_grad.asnumpy()) assert same(t.grad.asnumpy(), t_grad.asnumpy()) assert same(index.grad.asnumpy(), index_grad.asnumpy()) @with_seed() def test_div_sqrt_dim(): data_tmp = np.random.normal(0, 1, (5, 10, 8)) data = mx.symbol.Variable('data') test = mx.sym.contrib.div_sqrt_dim(data) check_numeric_gradient(test, [data_tmp], numeric_eps=1E-2) check_symbolic_forward(test, [data_tmp], [data_tmp / np.sqrt(data_tmp.shape[-1])]) @with_seed() def test_reciprocal_op(): eps = 2**(-11) data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors and finite difference method inaccuracies. # Factor of 6 below set empirically, depends on eps. # Issue exposed by seed 879579887. # Replace problematic inputs with 1.0. data_tmp[abs(data_tmp) < 6*eps] = 1.0 data = mx.symbol.Variable('data') test = mx.sym.reciprocal(data) check_numeric_gradient(test, [data_tmp], numeric_eps = eps) check_symbolic_forward(test, [data_tmp], [np.reciprocal(data_tmp)]) @with_seed() def test_cbrt_op(): eps = 2**(-11) data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid finite difference method inaccuracies due to infinite gradient at the origin. # Factor of 4 below set empirically, depends on eps. # Issue exposed by seed 553872106. # Replace problematic inputs with 1.0. data_tmp[abs(data_tmp) < 4*eps] = 1.0 data = mx.symbol.Variable('data') test = mx.sym.cbrt(data) check_numeric_gradient(test, [data_tmp], numeric_eps=eps) check_symbolic_forward(test, [data_tmp], [np.cbrt(data_tmp)]) @with_seed() def test_rcbrt_op(): eps = 2**(-11) data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors and finite difference method inaccuracies. # Factor of 4 below set empirically, depends on eps. # Issue exposed by seed 788174893. # Replace problematic inputs with 1.0. 
data_tmp[abs(data_tmp) < 4*eps] = 1.0 data = mx.symbol.Variable('data') test = mx.sym.rcbrt(data) check_numeric_gradient(test, [data_tmp], numeric_eps = eps) check_symbolic_forward(test, [data_tmp], [1/np.cbrt(data_tmp)]) @with_seed() def test_custom_op(): class Sqr(mx.operator.CustomOp): def forward(self, is_train, req, in_data, out_data, aux): if in_data[0].stype == 'default': aux[0][:] = 1 self.assign(out_data[0], req[0], in_data[0]*in_data[0]) else: inp = in_data[0] csr_m = inp.data * inp.data out = mx.nd.sparse.csr_matrix((csr_m, inp.indices, inp.indptr), shape=inp.shape) self.assign(out_data[0], req[0], out) if (in_data[0].stype == 'csr'): assert(isinstance(out_data[0], mx.nd.sparse.CSRNDArray)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): self.assign(in_grad[0], req[0], 2 * mx.nd.sparse.elemwise_mul(in_data[0], out_grad[0])) if in_data[0].stype == 'default': assert (aux[0].asnumpy() == 1).all() @mx.operator.register("sqr") class SqrProp(mx.operator.CustomOpProp): def __init__(self): super(SqrProp, self).__init__(need_top_grad=True) def list_arguments(self): return ['data'] def list_outputs(self): return ['output'] def list_auxiliary_states(self): return ['aux'] def infer_shape(self, in_shape): return in_shape, [in_shape[0]], [in_shape[0]] def infer_type(self, in_type): return in_type, [in_type[0]], [in_type[0]] def infer_storage_type(self, in_stype): if in_stype[0] == 'default': return ['default'], ['default'], ['default'] return ['csr'], ['csr'], ['csr'] def infer_storage_type_backward(self, ograd_stype, in_stype, out_stype, igrad_stype, aux_stype): if in_stype[0] == 'default': return ['default'], ['default'], ['default'], ['default'], ['default'] return ['default'], ['csr'], ['csr'], ['csr'], ['csr'] def create_operator(self, ctx, shapes, dtypes): return Sqr() data = mx.symbol.Variable('data') aux = mx.symbol.Variable('aux') op = mx.symbol.Custom(data=data, aux=aux, name='sqr', op_type='sqr') x = mx.nd.array(np.random.uniform(-1, 1, size=(4, 10))) aux = mx.nd.zeros_like(x) check_numeric_gradient(op, [x], [aux]) data = mx.symbol.cast(data, dtype='float64') op = mx.symbol.cast(op, dtype='float32') check_numeric_gradient(op, [x], [aux]) data = mx.symbol.Variable('data', stype='csr') aux = mx.symbol.Variable('aux') op2 = mx.symbol.Custom(data=data, aux=aux, name='sqr', op_type='sqr') x = x.tostype('csr') aux = mx.nd.zeros_like(x) check_numeric_gradient(op2, [x], [aux], grad_stype_dict={"data": "csr"}) x2 = mx.nd.array(np.random.uniform(-1, 1, size=(4, 10))) x2 = x2.tostype('csr') aux2 = mx.nd.zeros_like(x2) x2.attach_grad() with mx.autograd.record(): output = mx.nd.Custom(x2, aux2, name='sqr', op_type='sqr') output.backward() expected_output = mx.nd.sparse.square(x2) expected_grad = 2 * x2 rtol = 1e-4 atol = 1e-6 assert_almost_equal(output.asnumpy(), expected_output.asnumpy(), rtol=rtol, atol=atol) assert_almost_equal(x2.grad.asnumpy(), expected_grad.asnumpy(), rtol=rtol, atol=atol) # test for backward compatibility, i.e. 
the correctness of default implementation of # infer storage in custom operator class Mult(mx.operator.CustomOp): def forward(self, is_train, req, in_data, out_data, aux): self.assign(out_data[0], req[0], in_data[0]*in_data[1]) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): self.assign(in_grad[0], req[0], in_data[1]) self.assign(in_grad[1], req[1], in_data[0]) @mx.operator.register("mult") class MultProp(mx.operator.CustomOpProp): def __init__(self): super(MultProp, self).__init__(need_top_grad=True) def list_arguments(self): return ['lhs', 'rhs'] def list_outputs(self): return ['output'] def infer_shape(self, in_shape): return in_shape, [in_shape[0]], [] def create_operator(self, ctx, shapes, dtypes): return Mult() lhs = mx.nd.array(np.random.uniform(-1, 1, size=(4, 10))) rhs = mx.nd.array(np.random.uniform(-1, 1, size=(4, 10))) lhs.attach_grad() rhs.attach_grad() with mx.autograd.record(): y = mx.nd.Custom(lhs, rhs, name='mult', op_type='mult') y.backward() assert_almost_equal(rhs.asnumpy(), lhs.grad.asnumpy(), rtol=rtol, atol=atol) assert_almost_equal(lhs.asnumpy(), rhs.grad.asnumpy(), rtol=rtol, atol=atol) class MultNoGrad(mx.operator.CustomOp): def forward(self, is_train, req, in_data, out_data, aux): self.assign(out_data[0], req[0], in_data[0]*in_data[1]) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): self.assign(in_grad[0], req[0], in_data[1]) self.assign(in_grad[1], req[1], in_data[0]) @mx.operator.register("mult_no_grad") class MultNoGradProp(mx.operator.CustomOpProp): def __init__(self): super(MultNoGradProp, self).__init__(need_top_grad=False) def list_arguments(self): return ['lhs', 'rhs'] def list_outputs(self): return ['output'] def infer_shape(self, in_shape): return in_shape, [in_shape[0]], [] def create_operator(self, ctx, shapes, dtypes): return MultNoGrad() def infer_storage_type_backward(self, ograd_stype, in_stype, out_stype, igrad_stype, aux_stype): return ograd_stype, in_stype, out_stype, igrad_stype, aux_stype with mx.autograd.record(): y2 = mx.nd.Custom(lhs, rhs, name="mult_no_grad", op_type="mult_no_grad") y2.backward() assert_almost_equal(rhs.asnumpy(), lhs.grad.asnumpy(), rtol=rtol, atol=atol) assert_almost_equal(lhs.asnumpy(), rhs.grad.asnumpy(), rtol=rtol, atol=atol) class NoInputOp(mx.operator.CustomOp): def __init__(self, length, depth): super(NoInputOp, self).__init__() self.output = np.ones(shape=(length, depth), dtype=np.float32) def forward(self, is_train, req, in_data, out_data, aux): self.assign(out_data[0], req[0], self.output) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): pass @mx.operator.register("no_input_op") class NoInputOpProp(mx.operator.CustomOpProp): def __init__(self, length, depth): super(NoInputOpProp, self).__init__() self.length = int(length) self.depth = int(depth) def list_arguments(self): return [] def list_outputs(self): return ['output'] def infer_shape(self, in_shape): return [], [(self.length, self.depth)], [] def infer_type(self, in_type): return [], [np.float32], [] def create_operator(self, ctx, shapes, dtypes): return NoInputOp(length=self.length, depth=self.depth) with mx.autograd.record(): x = mx.nd.Custom(length=10, depth=10, op_type="no_input_op") assert_almost_equal(x.asnumpy(), np.ones(shape=(10, 10), dtype=np.float32)) @with_seed() def test_psroipooling(): for num_rois in [1, 2]: for num_classes, num_group in itertools.product([2, 3], [2, 3]): for image_height, image_width in itertools.product([168, 224], [168, 224]): for grad_nodes in [['im_data']]: 
spatial_scale = 0.0625 feat_height = np.int(image_height * spatial_scale) feat_width = np.int(image_width * spatial_scale) im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) rois_data = np.zeros([num_rois, 5]) rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1)) im_data_var = mx.symbol.Variable(name="im_data") rois_data_var = mx.symbol.Variable(name="rois_data") op = mx.sym.contrib.PSROIPooling(data=im_data_var, rois=rois_data_var, spatial_scale=spatial_scale, group_size=num_group, pooled_size=num_group, output_dim=num_classes, name='test_op') rtol, atol = 1e-2, 1e-3 check_numeric_gradient(op, [im_data, rois_data], rtol=rtol, atol=atol, grad_nodes=grad_nodes) @with_seed() def test_psroipooling_with_type(): arg_params = { 'psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} # plain psroipooling sym = mx.sym.contrib.PSROIPooling(spatial_scale=0.0625, output_dim=2, pooled_size=3, name='psroipool') ctx_list = [{'ctx': mx.cpu(0), 'psroipool_data': (1, 18, 14, 14), 'psroipool_rois': (2, 5), 'type_dict': {'psroipool_data': np.float64, 'psroipool_rois': np.float64}}, {'ctx': mx.cpu(0), 'psroipool_data': (1, 18, 14, 14), 'psroipool_rois': (2, 5), 'type_dict': {'psroipool_data': np.float32, 'psroipool_rois': np.float32}}, {'ctx': mx.cpu(0), 'psroipool_data': (1, 18, 14, 14), 'psroipool_rois': (2, 5), 'type_dict': {'psroipool_data': np.float16, 'psroipool_rois': np.float16}}, ] check_consistency(sym, ctx_list, grad_req={'psroipool_data': 'write', 'psroipool_rois': 'null'}, arg_params=arg_params) @with_seed() def test_deformable_convolution(): for num_batch in [1, 2]: for num_channel_data, num_deformable_group in itertools.product([4, 8], [1, 2]): for input_height, input_width in itertools.product([5, 6], [5, 6]): for dilate in [(1, 1), (2, 2)]: for grad_nodes in [['im_data'], ['offset_data'], ['weight']]: output_height = input_height output_width = input_width im_data = np.random.rand(num_batch, num_channel_data, input_height, input_width) offset_data = \ np.random.rand(num_batch, num_deformable_group * 3 * 3 * 2, output_height, output_width)\ * 0.8 + 0.1 weight = np.random.normal(0, 0.001, (num_channel_data, num_channel_data, 3, 3)) bias = np.zeros(num_channel_data) im_data_var = mx.symbol.Variable(name="im_data") offset_data_var = mx.symbol.Variable(name="offset_data") weight_var = mx.symbol.Variable(name="weight") bias_var = mx.symbol.Variable(name="bias") op = mx.sym.contrib.DeformableConvolution(name='test_op', data=im_data_var, offset=offset_data_var, weight=weight_var, bias=bias_var, num_filter=num_channel_data, pad=dilate, kernel=(3, 3), stride=(1, 1), dilate=dilate, num_deformable_group=num_deformable_group) if grad_nodes[0] == 'offset_data': # wider tolerance needed for coordinate differential rtol, atol = 1.0, 1e-2 else: rtol, atol = 0.05, 1e-3 # By now we only have gpu implementation if default_context().device_type == 'gpu': check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, grad_nodes=grad_nodes, ctx=mx.gpu(0)) def _validate_sample_location(input_rois, input_offset, spatial_scale, pooled_w, pooled_h, sample_per_part, part_size, output_dim, num_classes, trans_std, feat_h, feat_w): num_rois = input_rois.shape[0] output_offset = input_offset.copy() # simulate deformable psroipooling forward function for roi_idx in range(num_rois): sub_rois = input_rois[roi_idx, :].astype(np.float32) img_idx, x0, y0, 
x1, y1 = int(sub_rois[0]), sub_rois[1], sub_rois[2], sub_rois[3], sub_rois[4] roi_start_w = round(x0) * spatial_scale - 0.5 roi_start_h = round(y0) * spatial_scale - 0.5 roi_end_w = round(x1 + 1) * spatial_scale - 0.5 roi_end_h = round(y1 + 1) * spatial_scale - 0.5 roi_w, roi_h = roi_end_w - roi_start_w, roi_end_h - roi_start_h bin_size_w, bin_size_h = roi_w / pooled_w, roi_h / pooled_h sub_bin_size_w, sub_bin_size_h = bin_size_w / sample_per_part, bin_size_h / sample_per_part for c_top in range(output_dim): channel_each_cls = output_dim / num_classes class_id = int(c_top / channel_each_cls) for ph in range(pooled_h): for pw in range(pooled_w): part_h = int(math.floor(float(ph) / pooled_h * part_size)) part_w = int(math.floor(float(pw) / pooled_w * part_size)) trans_x = input_offset[roi_idx, class_id * 2, part_h, part_w] * trans_std trans_y = input_offset[roi_idx, class_id * 2 + 1, part_h, part_w] * trans_std bin_h_start, bin_w_start = ph * bin_size_h + roi_start_h, pw * bin_size_w + roi_start_w need_check = True while need_check: pass_check = True for ih in range(sample_per_part): for iw in range(sample_per_part): h = bin_h_start + trans_y * roi_h + ih * sub_bin_size_h w = bin_w_start + trans_x * roi_w + iw * sub_bin_size_w if w < -0.5 or w > feat_w - 0.5 or h < -0.5 or h > feat_h - 0.5: continue w = min(max(w, 0.1), feat_w - 1.1) h = min(max(h, 0.1), feat_h - 1.1) # if the following condition holds, the sampling location is not differentiable # therefore we need to re-do the sampling process if h - math.floor(h) < 1e-3 or math.ceil(h) - h < 1e-3 or w - math.floor(w) < 1e-3 or math.ceil(w) - w < 1e-3: trans_x, trans_y = random.random() * trans_std, random.random() * trans_std pass_check = False break if not pass_check: break if pass_check: output_offset[roi_idx, class_id * 2 + 1, part_h, part_w] = trans_y / trans_std output_offset[roi_idx, class_id * 2, part_h, part_w] = trans_x / trans_std need_check = False return output_offset @unittest.skip("Flaky test, tracked at https://github.com/apache/incubator-mxnet/issues/11713") @with_seed() def test_deformable_psroipooling(): sample_per_part = 4 trans_std = 0.1 for num_rois in [1, 2]: for num_classes, num_group in itertools.product([2, 3], [2, 3]): for image_height, image_width in itertools.product([160, 224], [160, 224]): for grad_nodes in [['im_data'], ['offset_data']]: spatial_scale = 0.0625 stride = int(1 / spatial_scale) feat_height = np.int(image_height * spatial_scale) feat_width = np.int(image_width * spatial_scale) im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) rois_data = np.zeros([num_rois, 5]) rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1 - 2 * stride)) + stride rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1 - 2 * stride)) + stride offset_data = np.random.rand(num_rois, 2*num_classes, num_group, num_group) # at certain points, the bilinear interpolation function may be non-differentiable # to avoid this, we check whether the sampling locations lie on valid points offset_data = _validate_sample_location(rois_data, offset_data, spatial_scale, num_group, num_group, sample_per_part, num_group, num_classes, num_classes, trans_std, feat_height, feat_width) im_data_var = mx.symbol.Variable(name="im_data") rois_data_var = mx.symbol.Variable(name="rois_data") offset_data_var = mx.symbol.Variable(name="offset_data") op = mx.sym.contrib.DeformablePSROIPooling(data=im_data_var, rois=rois_data_var, trans=offset_data_var, spatial_scale=spatial_scale,
sample_per_part=4, group_size=num_group, pooled_size=num_group, output_dim=num_classes, trans_std=0.1, no_trans=False, name='test_op') rtol, atol = 1e-2, 1e-3 # By now we only have gpu implementation if default_context().device_type == 'gpu': check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, grad_nodes=grad_nodes, ctx=mx.gpu(0)) # Helper functions for test_laop def _make_symm_symbol(a, ndims): assert ndims >= 2 tr_shape = list(range(ndims)) tr_shape[-1] = ndims-2 tr_shape[-2] = ndims-1 tr_shape = tuple(tr_shape) return 0.5 * (a + mx.sym.transpose(a, axes=tr_shape)) def _make_lower_triangle_symm(a, ndims, m, dtype=np.float32): assert ndims >= 2 # The last two dimensions must both be m # Create mask for lower triangle and diagonal index = mx.sym.arange(start=0, stop=m, step=1, dtype=np.int32) lt_mask = mx.sym.one_hot(index, depth=m, dtype=dtype) for j in range(1, m): part1 = mx.sym.zeros(shape=(j, m), dtype=dtype) index = mx.sym.arange(start=0, stop=m-j, step=1, dtype=np.int32) part2 = mx.sym.one_hot(index, depth=m, dtype=dtype) lt_mask = lt_mask + mx.sym.concat(*[part1, part2], dim=0) shp = tuple([1]*(ndims-2) + [m, m]) lt_mask = mx.sym.reshape(lt_mask, shape=shp) return mx.sym.broadcast_mul(a, lt_mask) # @ankkhedia: Getting rid of fixed seed as flakiness could not be reproduced # tracked at https://github.com/apache/incubator-mxnet/issues/11718 @with_seed() def test_laop(): dtype = np.float64 rtol_fw = 1e-7 atol_fw = 1e-9 num_eps = 1e-6 rtol_bw = 1e-5 atol_bw = 1e-6 # enable numerical checking of gradients grad_check = 1 data1 = mx.symbol.Variable('data1') data2 = mx.symbol.Variable('data2') data3 = mx.symbol.Variable('data3') check_fw = lambda sym, location, expected :\ check_symbolic_forward(sym, location, expected, rtol=rtol_fw, atol=atol_fw, dtype=dtype) check_grad = lambda sym, location:\ check_numeric_gradient(sym, location, numeric_eps=num_eps, rtol=rtol_bw, atol=atol_bw, dtype=dtype) rep_3x = lambda a, m, n :\ np.reshape(np.tile(np.array(a).flatten(), 3), (3, 1, m, n)) # Test gemm separately from other la-operators. shape1 = (2, 3) shape2 = (3, 2) shape3 = (3, 3) shape4 = (2, 2) data_in1 = np.random.uniform(1, 10, shape1) data_in2 = np.random.uniform(1, 10, shape2) data_in3 = np.random.uniform(1, 10, shape3) data_in4 = np.random.uniform(1, 10, shape4) # Check all transpositions of gemm operator. data_in1_t = np.transpose(data_in1) data_in2_t = np.transpose(data_in2) res_gemm = 4. * np.dot(data_in1, data_in2) + 7. * data_in4 test_gemm = mx.sym.linalg.gemm(data1, data2, data3, alpha=4., beta=7.) check_fw(test_gemm, [data_in1, data_in2, data_in4], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in2, data_in4]) res_gemm = 4. * np.dot(data_in1_t, data_in2_t) + 7. * data_in3 test_gemm = mx.sym.linalg.gemm(data1, data2, data3, alpha=4., beta=7., transpose_a=True, transpose_b=True) check_fw(test_gemm, [data_in1, data_in2, data_in3], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in2, data_in3]) res_gemm = 4. * np.dot(data_in1_t, data_in1) + 7. * data_in3 test_gemm = mx.sym.linalg.gemm(data1, data2, data3, alpha=4., beta=7., transpose_a=True) check_fw(test_gemm, [data_in1, data_in1, data_in3], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in1, data_in3]) res_gemm = 4. * np.dot(data_in1, data_in1_t) + 7. 
* data_in4 test_gemm = mx.sym.linalg.gemm(data1, data2, data3, alpha=4., beta=7., transpose_b=True) check_fw(test_gemm, [data_in1, data_in1, data_in4], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in1, data_in4]) # Check batch of gemm. a = rep_3x(data_in1, 2, 3) b = rep_3x(data_in2, 3, 2) c = rep_3x(data_in4, 2, 2) r = 4. * np.dot(data_in1, data_in2) + 7. * data_in4 r = rep_3x(r, 2, 2) test_gemm = mx.sym.linalg.gemm(data1, data2, data3, alpha=4., beta=7.) check_fw(test_gemm, [a, b, c], [r]) if grad_check == 1: check_grad(test_gemm, [a, b, c]) # Check for different axis that describes matrix rows. a2 = np.copy(np.swapaxes(a, 0, 2)) b2 = np.copy(np.swapaxes(b, 0, 2)) c2 = np.copy(np.swapaxes(c, 0, 2)) r2 = np.copy(np.swapaxes(r, 0, 2)) test_gemm = mx.sym.linalg.gemm(data1, data2, data3, alpha=4., beta=7., axis = 0) check_fw(test_gemm, [a2, b2, c2], [r2]) if grad_check == 1: check_grad(test_gemm, [a2, b2, c2]) a2 = np.copy(np.swapaxes(a, 1, 2)) b2 = np.copy(np.swapaxes(b, 1, 2)) c2 = np.copy(np.swapaxes(c, 1, 2)) r2 = np.copy(np.swapaxes(r, 1, 2)) test_gemm = mx.sym.linalg.gemm(data1, data2, data3, alpha=4., beta=7., axis = -3) check_fw(test_gemm, [a2, b2, c2], [r2]) if grad_check == 1: check_grad(test_gemm, [a2, b2, c2]) # Check gemm2 operator same way as gemm. res_gemm = 4. * np.dot(data_in1, data_in2) test_gemm = mx.sym.linalg.gemm2(data1, data2, alpha=4.) check_fw(test_gemm, [data_in1, data_in2], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in2]) res_gemm = 4. * np.dot(data_in1_t, data_in2_t) test_gemm = mx.sym.linalg.gemm2(data1, data2, alpha=4., transpose_a=True, transpose_b=True) check_fw(test_gemm, [data_in1, data_in2], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in2]) res_gemm = 4. * np.dot(data_in1_t, data_in1) test_gemm = mx.sym.linalg.gemm2(data1, data2, alpha=4., transpose_a=True) check_fw(test_gemm, [data_in1, data_in1], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in1]) res_gemm = 4. * np.dot(data_in1, data_in1_t) test_gemm = mx.sym.linalg.gemm2(data1, data2, alpha=4., transpose_b=True) check_fw(test_gemm, [data_in1, data_in1], [res_gemm]) if grad_check == 1: check_grad(test_gemm, [data_in1, data_in1]) # Check batch of gemm2. a = rep_3x(data_in1, 2, 3) b = rep_3x(data_in2, 3, 2) r = rep_3x(4. * np.dot(data_in1, data_in2), 2, 2) test_gemm = mx.sym.linalg.gemm2(data1, data2, alpha=4.) check_fw(test_gemm, [a, b], [r]) if grad_check == 1: check_grad(test_gemm, [a, b]) a2 = np.copy(np.swapaxes(a, 0, 2)) b2 = np.copy(np.swapaxes(b, 0, 2)) r2 = np.copy(np.swapaxes(r, 0, 2)) test_gemm = mx.sym.linalg.gemm2(data1, data2, alpha=4., axis = 0) check_fw(test_gemm, [a2, b2], [r2]) if grad_check == 1: check_grad(test_gemm, [a2, b2]) a2 = np.copy(np.swapaxes(a, 1, 2)) b2 = np.copy(np.swapaxes(b, 1, 2)) r2 = np.copy(np.swapaxes(r, 1, 2)) test_gemm = mx.sym.linalg.gemm2(data1, data2, alpha=4., axis = -3) check_fw(test_gemm, [a2, b2], [r2]) if grad_check == 1: check_grad(test_gemm, [a2, b2]) # Now test all the other operators. # Tests with trivial 1x1 matrices. 
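# For 1x1 inputs each factorization collapses to a scalar identity (potrf -> sqrt(x), potri -> 1/x^2, sumlogdiag -> log(x)), so the expected outputs below can be written in closed form.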
shape = (4, 4, 1, 1) data_in = np.random.uniform(1, 10, shape) # test potrf # Note: Have to symmetrize input, for gradient test to work res_potrf = np.sqrt(data_in) test_potrf = mx.sym.linalg.potrf(data1) check_fw(test_potrf, [data_in], [res_potrf]) if grad_check == 1: check_grad(test_potrf, [data_in]) # test potri ones = mx.nd.ones(shape).asnumpy() res_potri = np.divide(ones, data_in * data_in) test_potri = mx.sym.linalg.potri(data1) check_fw(test_potri, [data_in], [res_potri]) if grad_check == 1: check_grad(test_potri, [data_in]) # test trsm trian_in = data_in * 7. test_trsm = mx.sym.linalg.trsm(data1, data2, alpha=7.) check_fw(test_trsm, [trian_in, data_in], [ones]) if grad_check == 1: check_grad(test_trsm, [trian_in,data_in]) # test trmm trian_in = np.divide(ones, trian_in) test_trmm = mx.sym.linalg.trmm(data1, data2, alpha=7., transpose=True, rightside=True) check_fw(test_trmm, [trian_in, data_in], [ones]) if grad_check == 1: check_grad(test_trmm, [trian_in, data_in]) # test sumlogdiag res_sumlogdiag = np.reshape(np.log(data_in), (4, 4)) test_sumlogdiag = mx.sym.linalg.sumlogdiag(data1) check_fw(test_sumlogdiag, [data_in], [res_sumlogdiag]) if grad_check == 1: check_grad(test_sumlogdiag, [data_in]) # more elaborate example of Cholesky factorization matrix = np.array([[9., 3., -6., 12.], [3., 26., -7., -11.], [-6., -7., 9., 7.], [12., -11., 7., 65.]]) trian = np.array([[3., 0., 0., 0.], [1., 5., 0., 0.], [-2., -1., 2., 0.], [4., -3., 6., 2.]]) pow = np.array([[2., 1., 1., 1.], [1., 4., 1., 1.], [1., 1., 8., 1.], [1., 1., 1., 16.]]) inv = np.array([[8.95/3., 0.05/3., 2.65, -2.5/3.], [0.05/3., 0.05, 0.05, 0.], [2.65, 0.05, 2.5, -0.75], [-2.5/3., 0., -0.75, 0.25]]) ident = np.eye(4) # test potrf test_potrf = mx.sym.linalg.potrf(_make_symm_symbol(data1, ndims=4)) a = rep_3x(matrix, 4, 4) r = rep_3x(trian, 4, 4) check_fw(test_potrf, [a], [r]) if grad_check == 1: check_grad(test_potrf, [a]) #test potri data1_ltri = _make_lower_triangle_symm( data1, ndims=4, m=4, dtype=dtype) test_potri = mx.sym.linalg.potri(data1_ltri) a = rep_3x(trian, 4, 4) r = rep_3x(inv, 4, 4) check_fw(test_potri, [a], [r]) if grad_check == 1: check_grad(test_potri, [a]) # test trsm test_trsm = mx.sym.linalg.trsm(data1_ltri, data2, alpha=7.) a = rep_3x(trian, 4, 4) b = rep_3x(matrix, 4, 4) r = rep_3x(7. * np.transpose(trian), 4, 4) check_fw(test_trsm, [a, b], [r]) if grad_check == 1: check_grad(test_trsm, [a, b]) test_trsm2 = mx.sym.linalg.trsm( data1_ltri, data2, alpha=-2., rightside=True, transpose=True) r = rep_3x(-2. * trian, 4, 4) check_fw(test_trsm2, [a, b], [r]) if grad_check == 1: check_grad(test_trsm2, [a, b]) test_trsm3 = mx.sym.linalg.trsm( data1_ltri, data2, alpha=0.5, transpose=True) b = rep_3x(np.transpose(trian), 4, 4) r = rep_3x(0.5 * ident, 4, 4) check_fw(test_trsm3, [a, b], [r]) if grad_check == 1: check_grad(test_trsm3, [a, b]) test_trsm4 = mx.sym.linalg.trsm( data1_ltri, data2, alpha=-0.5, rightside=True) b = rep_3x(trian, 4, 4) r = rep_3x(-0.5 * ident, 4, 4) check_fw(test_trsm4, [a, b], [r]) if grad_check == 1: check_grad(test_trsm4, [a, b]) # test trmm test_trmm = mx.sym.linalg.trmm( data1_ltri, data2, alpha=7., transpose=True, rightside=True) a = rep_3x(trian, 4, 4) b = rep_3x(matrix, 4, 4) r = rep_3x(7. * np.dot(matrix, trian.T), 4, 4) check_fw(test_trmm, [a, b], [r]) if grad_check == 1: check_grad(test_trmm, [a, b]) test_trmm2 = mx.sym.linalg.trmm(data1_ltri, data2, alpha=-2.) r = rep_3x(-2. 
* np.dot(trian, matrix), 4, 4) check_fw(test_trmm2, [a, b], [r]) if grad_check == 1: check_grad(test_trmm2, [a, b]) test_trmm3 = mx.sym.linalg.trmm(data1_ltri, data2, rightside=True) r = rep_3x(np.dot(matrix, trian), 4, 4) check_fw(test_trmm3, [a, b], [r]) if grad_check == 1: check_grad(test_trmm3, [a, b]) test_trmm4 = mx.sym.linalg.trmm( data1_ltri, data2, alpha=1.2, transpose=True) r = rep_3x(1.2 * np.dot(trian.T, matrix), 4, 4) check_fw(test_trmm4, [a, b], [r]) if grad_check == 1: check_grad(test_trmm4, [a, b]) # test sumlogdiag a = rep_3x(pow, 4, 4) r = np.reshape(np.tile(10. * np.log(np.array([2.])), 3), (3,)) check_fw(test_sumlogdiag, [a], [r]) if grad_check == 1: check_grad(test_sumlogdiag, [a]) # Tests for operators linalg.syrk, linalg.gelqf def _gelqf_combined_symbol(a): q, l = mx.sym.linalg.gelqf(a) q_qt = mx.sym.linalg.syrk(q, transpose=False, alpha=1., name='Q_times_Qt') l_q = mx.sym.linalg.trmm(l, q, alpha=1., name='L_times_Q') return mx.sym.Group([q_qt, l_q]) # NOTE: If we leave the unused output dangling, things break if dtype=np.float64. Namely, the # backward gradient for the unused output is of dtype np.float32 then. # ==> Very annoying! def _gelqf_first_output(a): q, l = mx.sym.linalg.gelqf(a) bogus_scal = mx.sym.sum(mx.sym.BlockGrad(l), axis=(), keepdims=True) * 0.0 return mx.sym.broadcast_add(q, bogus_scal) def _gelqf_second_output(a): q, l = mx.sym.linalg.gelqf(a) bogus_scal = mx.sym.sum(mx.sym.BlockGrad(q), axis=(), keepdims=True) * 0.0 return mx.sym.broadcast_add(l, bogus_scal) def _syevd_combined_symbol(a): u, lam = mx.sym.linalg.syevd(a) u_ut = mx.sym.linalg.syrk(u, transpose=False, alpha=1., name='U_times_Ut') lam_u = mx.sym.broadcast_mul(mx.sym.reshape(lam, shape=(-2, 1)), u) ut_lam_u = mx.sym.linalg.gemm2(u, lam_u, alpha=1., transpose_a=True, transpose_b=False, name='Ut_L_U') return mx.sym.Group([u_ut, ut_lam_u]) @with_seed() def test_laop_2(): dtype = np.float64 rtol_fw = 1e-7 atol_fw = 1e-9 num_eps = 1e-6 rtol_bw = 1e-5 atol_bw = 1e-6 # enable numerical checking of gradients grad_check = 1 data1 = mx.symbol.Variable('data1') check_fw = lambda sym, location, expected :\ check_symbolic_forward(sym, location, expected, rtol=rtol_fw, atol=atol_fw, dtype=dtype) check_grad = lambda sym, location:\ check_numeric_gradient(sym, location, numeric_eps=num_eps, rtol=rtol_bw, atol=atol_bw, dtype=dtype) rep_3x = lambda a, m, n :\ np.reshape(np.tile(np.array(a).flatten(), 3), (3, 1, m, n)) # Tests for linalg.syrk mnalpha_lst = [(2, 3, 1.), (5, 3, -2.), (1, 6, 5.), (3, 3, 0.5), (4, 1, 10.), (1, 1, 1.)] for m, n, alpha in mnalpha_lst: #print('syrk: m={}, n={}, alpha={}'.format(m, n, alpha)) data_in1 = np.random.uniform(1, 10, (m, n)) res_syrk1 = alpha * np.dot(data_in1, data_in1.T) test_syrk1 = mx.sym.linalg.syrk(data1, transpose=False, alpha=alpha) check_fw(test_syrk1, [data_in1], [res_syrk1]) if grad_check == 1: check_grad(test_syrk1, [data_in1]) res_syrk2 = alpha * np.dot(data_in1.T, data_in1) test_syrk2 = mx.sym.linalg.syrk(data1, transpose=True, alpha=alpha) check_fw(test_syrk2, [data_in1], [res_syrk2]) if grad_check == 1: check_grad(test_syrk2, [data_in1]) # Batch mode (3x the same thing) a_batch = rep_3x(data_in1, m, n) r1_batch = rep_3x(res_syrk1, m, m) check_fw(test_syrk1, [a_batch], [r1_batch]) if grad_check == 1: check_grad(test_syrk1, [a_batch]) r2_batch = rep_3x(res_syrk2, n, n) check_fw(test_syrk2, [a_batch], [r2_batch]) if grad_check == 1: check_grad(test_syrk2, [a_batch]) # Tests for linalg.gelqf # Currently disabled on GPU as they need cuda8 # and MxNet 
builds use cuda 7.5 if not (default_context() == mx.cpu()): return test_gelqf2 = _gelqf_combined_symbol(data1) # Outputs (dot(Q, Q.T), dot(L, Q)) test_gelqf_q = _gelqf_first_output(data1) # Output Q (L is not dangling) test_gelqf_l = _gelqf_second_output(data1) # Output L (Q is not dangling) mn_lst = [(4, 4), (1, 1), (5, 20), (1, 10), (15, 50)] for m, n in mn_lst: #print('gelqf: m={}, n={}'.format(m, n)) data_in1 = np.random.normal(0., 10., (m, n)) res_eye = np.eye(m) res_a = data_in1 check_fw(test_gelqf2, [data_in1], [res_eye, res_a]) if grad_check == 1: # A => Q check_grad(test_gelqf_q, [data_in1]) # A => L check_grad(test_gelqf_l, [data_in1]) # Batch mode (3x the same thing) a_batch = rep_3x(data_in1, m, n) reye_batch = rep_3x(res_eye, m, m) ra_batch = a_batch check_fw(test_gelqf2, [a_batch], [reye_batch, ra_batch]) if grad_check == 1: # A => Q check_grad(test_gelqf_q, [a_batch]) # A => L check_grad(test_gelqf_l, [a_batch]) # Tests for operator linalg.syevd def _syevd_first_output(a): u, lam = mx.sym.linalg.syevd(a) bogus_scal = mx.sym.sum(mx.sym.BlockGrad(lam), axis=(), keepdims=True) * 0.0 return mx.sym.broadcast_add(u, bogus_scal) def _syevd_second_output(a): u, lam = mx.sym.linalg.syevd(a) bogus_scal = mx.sym.sum(mx.sym.BlockGrad(u), axis=(), keepdims=True) * 0.0 return mx.sym.broadcast_add(lam, bogus_scal) def _syevd_forward(a): lam, ut = np.linalg.eig(a) ind = np.argsort(lam) lam = lam[ind] u = ut[:, ind].T for i in range(0, a.shape[0]): _syevd_forw_eigvec_sign(u[i]) return u, lam def _syevd_forw_eigvec_sign(v): ind = np.argmax(np.abs(v)) if v[ind] < 0.: v[:] = -v def _syevd_backward(grad_u, grad_l, u, l): n = l.size assert grad_l.size == n assert grad_u.shape == (n, n) assert u.shape == (n, n) temp = np.dot(grad_u, u.T) temp2 = np.diag(grad_l) for i in range(1, n): for j in range(0, i): denom = 2. 
* (l[i] - l[j]) elem = (temp[i, j] - temp[j, i])/denom temp2[i, j] = elem temp2[j, i] = elem temp3 = np.dot(u.T, temp2) return np.dot(temp3, u) # Seed set because the test is not robust enough to operate on random data @with_seed(1896893923) def test_laop_3(): # Currently disabled on GPU as syevd needs cuda8 # and MxNet builds use cuda 7.5 if not (default_context() == mx.cpu()): return dtype = np.float64 rtol_fw = 1e-6 atol_fw = 1e-6 num_eps = 1e-4 rtol_bw = 1e-2 atol_bw = 1e-2 # enable numerical checking of gradients grad_check = 1 data1 = mx.symbol.Variable('data1') check_fw = lambda sym, location, expected :\ check_symbolic_forward(sym, location, expected, rtol=rtol_fw, atol=atol_fw, dtype=dtype) check_grad = lambda sym, location:\ check_numeric_gradient(sym, location, numeric_eps=num_eps, rtol=rtol_bw, atol=atol_bw, dtype=dtype) rep_3x = lambda a, m, n :\ np.reshape(np.tile(np.array(a).flatten(), 3), (3, 1, m, n)) check_bw = lambda sym, location, out_grads, expected :\ check_symbolic_backward(sym, location, out_grads, expected, rtol=rtol_fw, atol=atol_fw, dtype=dtype) # Tests for linalg.syevd test_syevd2 = _syevd_combined_symbol(data1) # Outputs (U U^T, U^T (diag L) U) data1_s2 = _make_symm_symbol(data1, ndims=2) test_syevd_u_2 = _syevd_first_output(data1_s2) test_syevd_l_2 = _syevd_second_output(data1_s2) data1_s4 = _make_symm_symbol(data1, ndims=4) test_syevd_u_4 = _syevd_first_output(data1_s4) test_syevd_l_4 = _syevd_second_output(data1_s4) n_lst = [4, 1, 2, 10, 14] for n in n_lst: #print('\n** syevd: n={}'.format(n)) data_in1 = np.random.normal(0., 10., (n, n)) data_in1 = 0.5 * (data_in1 + data_in1.T) res_eye = np.eye(n) res_a = data_in1 check_fw(test_syevd2, [data_in1], [res_eye, res_a]) # Check backward grad_u = np.random.normal(0., 2., (n, n)) grad_l = np.random.normal(0., 2., (n,)) bw_u, bw_l = _syevd_forward(data_in1) grad_a = _syevd_backward(grad_u, grad_l, bw_u, bw_l) check_bw(mx.sym.linalg.syevd(data1), [data_in1], [grad_u, grad_l], [grad_a]) if grad_check == 1: # A => U check_grad(test_syevd_u_2, [data_in1]) # A => L check_grad(test_syevd_l_2, [data_in1]) # Batch mode (3x the same thing) a_batch = rep_3x(data_in1, n, n) reye_batch = rep_3x(res_eye, n, n) ra_batch = a_batch check_fw(test_syevd2, [a_batch], [reye_batch, ra_batch]) if grad_check == 1: # A => U check_grad(test_syevd_u_4, [a_batch]) # A => L check_grad(test_syevd_l_4, [a_batch]) # @piyushghai - Removing the fixed seed for this test. 
# Issue for flakiness is tracked at - https://github.com/apache/incubator-mxnet/issues/11721 @with_seed() def test_laop_4(): # Currently disabled on GPU as syevd needs cuda8 # and MxNet builds use cuda 7.5 if not (default_context() == mx.cpu()): return rtol_fw = 1e-6 atol_fw = 1e-6 data1 = mx.symbol.Variable('data1') check_fw = lambda sym, location, expected, dtype :\ check_symbolic_forward(sym, location, expected, rtol=rtol_fw, atol=atol_fw, dtype=dtype) a_np = np.array([[1., 2.], [2., 4.]]) u_np = np.array([[0.89442718, -0.44721359], [0.44721359, 0.89442718]]) l_np = np.array([0., 5.]) test_syevd = mx.sym.linalg.syevd(data1) # float64 #print('float64') check_fw(test_syevd, [a_np], [u_np, l_np], np.float64) # float32 #print('float32') check_fw(test_syevd, [a_np], [u_np, l_np], np.float32) @with_seed() def test_stack(): for _ in range(100): ndim = random.randint(1, 5) axis = random.randint(0, ndim) if random.randint(0, 1): axis = axis - ndim - 1 nin = random.randint(1, 3) dshape = [random.randint(1, 5) for _ in range(ndim)] inputs = [np.random.uniform(size=dshape) for _ in range(nin)] output = np.stack(inputs, axis=axis) sym_ins = [mx.sym.var('x%d'%i) for i in range(nin)] out = mx.sym.stack(*sym_ins, axis=axis) check_symbolic_forward(out, inputs, [output]) check_numeric_gradient(out, inputs) @with_seed() @unittest.skip("Flaky test, tracked at https://github.com/apache/incubator-mxnet/issues/12314") def test_dropout(): def zero_count(array, ratio): zeros = 0 for i in array: if i == 0: zeros += 1 elif math.isnan(i): assert ratio == 1 # Only valid for ratio = 1 zeros += 1 return zeros def check_correctness(executor, input, ratio): input = input.ravel() output = executor.outputs[0].asnumpy().ravel() input_sum = np.sum(input) output_sum = np.sum(output) # Make sure input zeroes are none (test data setup check) assert zero_count(input, ratio) == 0 # count number of zeroes in output output_zeroes = zero_count(output, ratio) # Hopefully should be within ratio/2 % error = abs(output_sum - input_sum) / input_sum if ratio == 1.0: assert output_zeroes == len(input) elif ratio > 0.2: assert output_zeroes > 0 assert error < (ratio/2) elif ratio == 0: assert output_zeroes == 0 def check_dropout_ratio(ratio, shape): # test dropout x = mx.sym.var('data') y = mx.sym.Dropout(x, p=ratio) exe = y.simple_bind(ctx=default_context(), data=shape) if ratio == 1: max_value = float('nan') else: max_value = 1 if ratio == 0 else 1/ratio if ratio == 1: min_value = float('nan') else: min_value = 1 if ratio == 0 else 0 exe.arg_arrays[0][:] = 1 exe.forward(is_train=True) if not math.isnan(max_value): assert exe.outputs[0].asnumpy().max() > 0 else: assert math.isnan(exe.outputs[0].asnumpy().max()) if not math.isnan(min_value): assert exe.outputs[0].asnumpy().min() == min_value else: assert math.isnan(exe.outputs[0].asnumpy().min()) check_correctness(exe, exe.arg_arrays[0].asnumpy(), ratio) if ratio == 0.5: exe.backward([mx.nd.ones(shape)]) assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all() exe.forward(is_train=False) assert (exe.outputs[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all() exe.backward([mx.nd.ones(shape)], is_train=False) assert (exe.grad_arrays[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all() # test permanent dropout x = mx.sym.var('data') y = mx.sym.Dropout(x, p=ratio, mode='always') exe = y.simple_bind(ctx=default_context(), data=shape) exe.arg_arrays[0][:] = 1 exe.forward(is_train=True) assert exe.outputs[0].asnumpy().max() == max_value assert exe.outputs[0].asnumpy().min() == 
min_value exe.backward([mx.nd.ones(shape)]) assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all() exe.forward(is_train=False) assert exe.outputs[0].asnumpy().max() == max_value assert exe.outputs[0].asnumpy().min() == min_value exe.backward([mx.nd.ones(shape)], is_train=False) assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all() def get_slice(x, axis, idx): ix = () for i in range(x.ndim): if i == axis: ix += (idx,) else: ix += (slice(None, None, None),) return x[ix] def check_dropout_axes(ratio, shape, axes): compactshape = list(shape) for axis in axes: compactshape[axis] = 1 compactx = mx.random.uniform(shape=tuple(compactshape)) broadcastx = compactx.broadcast_to(shape) dropouty = mx.nd.Dropout(broadcastx, p=ratio, axes=axes) for axis in axes: target = get_slice(dropouty, axis, 0).asnumpy() for i in range(1, shape[axis]): assert(get_slice(dropouty, axis, i).asnumpy() == target).all() shape = (100, 100) check_dropout_ratio(0.5, shape) check_dropout_ratio(0.0, shape) check_dropout_ratio(1.0, shape) check_dropout_ratio(0.75, shape) check_dropout_ratio(0.25, shape) nshape = (10, 10, 10, 10) with mx.autograd.train_mode(): check_dropout_axes(0.25, nshape, axes = (0,)) check_dropout_axes(0.25, nshape, axes = (1,)) check_dropout_axes(0.25, nshape, axes = (2,)) check_dropout_axes(0.25, nshape, axes = (3,)) check_dropout_axes(0.25, nshape, axes = (0, 1)) check_dropout_axes(0.25, nshape, axes = (0, 2)) check_dropout_axes(0.25, nshape, axes = (0, 3)) check_dropout_axes(0.25, nshape, axes = (1, 2)) check_dropout_axes(0.25, nshape, axes = (1, 3)) check_dropout_axes(0.25, nshape, axes = (2, 3)) check_dropout_axes(0.25, nshape, axes = (0, 1, 2)) check_dropout_axes(0.25, nshape, axes = (0, 2, 3)) check_dropout_axes(0.25, nshape, axes = (1, 2, 3)) @unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. 
tracked at https://github.com/apache/incubator-mxnet/issues/11290") @with_seed() def test_scatter_gather_nd(): def check(data, idx): data.attach_grad() with mx.autograd.record(): y = mx.nd.gather_nd(data, idx) y.backward(y) npidx = tuple(i.asnumpy() for i in idx) assert (data.asnumpy()[npidx] == y.asnumpy()).all() npdata = np.zeros_like(data.asnumpy()) npdata[npidx] = y.asnumpy() assert (npdata == data.grad.asnumpy()).all() assert (mx.nd._internal._backward_gather_nd(y, idx, shape=data.shape).asnumpy() == data.grad.asnumpy()).all() for dtype in ['int32', 'int64', 'float16', 'float32', 'float64']: data = mx.nd.arange(360, dtype=dtype).reshape((3,4,5,6)) idx = mx.nd.array([[1,1,2], [3, 3, 0], [3,2,1]], dtype='int32') check(data, idx) idx = mx.nd.array([[1,1,2], [3,3,0], [3,2,1], [5,2,4]], dtype='int32') check(data, idx) data = mx.nd.array([2, 3, 0], dtype=dtype) idx = mx.nd.array([[1, 1, 0], [0, 1, 0]], dtype='int32') assert (mx.nd.scatter_nd(data, idx, shape=(2, 2)).asnumpy() == [[0, 0], [2, 3]]).all() data = mx.nd.array([2, 3, 0], dtype=dtype) idx = mx.nd.array([[1, 1, 0], [1, 1, 0]], dtype='int32') assert (mx.nd._internal._backward_gather_nd(data, idx, shape=(2, 2)).asnumpy() == [[0, 0], [0, 5]]).all() data_npy = np.random.randint(0, 10, (100,)) data = mx.nd.array(data_npy, dtype=dtype) idx = mx.nd.zeros(shape=(1, 100), dtype='int32') assert (mx.nd._internal._backward_gather_nd(data, idx, shape=(1,)).asscalar() == data_npy.sum()) if dtype == 'int64': data = mx.nd.array([2123162361283621, -31231236374787, -112372937128970, -1378278798172378], dtype=dtype) idx = mx.nd.array([[0, 0, 0, 0]], dtype='int32') assert (mx.nd._internal._backward_gather_nd(data, idx, shape=(1,)).asscalar() == data.asnumpy().sum()) def compare_forw_backw_unary_op( name, forward_mxnet_call, forward_numpy_call, backward_numpy_call, shape, input_low, input_high, rtol, atol, dtype=np.float32): check_fw = lambda sym, location, expected :\ check_symbolic_forward(sym, location, expected, rtol=rtol, atol=atol, dtype=dtype) check_bw = lambda sym, location, out_grads, expected :\ check_symbolic_backward(sym, location, out_grads, expected, rtol=rtol, atol=atol, dtype=dtype) op_name = 'unary_op={}, dtype={}'.format(name, dtype) data = mx.symbol.Variable(op_name + '_data', dtype=dtype) # Comparison: Forward expression data_np = np.random.uniform(input_low, input_high, shape).astype(dtype) res_np = forward_numpy_call(data_np) op_ex = mx.sym.broadcast_add( forward_mxnet_call(data), mx.sym.zeros_like(data), name=op_name) check_fw(op_ex, [data_np], [res_np]) # Comparison: Backward expression res_grad = np.random.uniform(-2.0, 2.0, shape).astype(dtype) data_grad = backward_numpy_call(data_np) * res_grad check_bw(op_ex, [data_np], [res_grad], [data_grad]) def finite_diff_unary_op( name, forward_mxnet_call, shape, input_low, input_high, rtol, atol, num_eps): # Finite difference tests are done in float64 dtype = np.float64 check_grad = lambda sym, location:\ check_numeric_gradient(sym, location, numeric_eps=num_eps, rtol=rtol, atol=atol, dtype=dtype) data_np = np.random.uniform(input_low, input_high, shape).astype(dtype) data = mx.symbol.Variable('data', dtype=dtype) op_name = 'unary_op={}, dtype={}'.format(name, dtype) op_ex = mx.sym.broadcast_add( forward_mxnet_call(data), mx.sym.zeros_like(data), name=op_name) check_grad(op_ex, [data_np]) def np_smooth_l1(x, sigma): issq = 1. 
/ sigma / sigma absx = np.abs(x) temp = x * sigma return np.where(absx < issq, 0.5 * (temp ** 2), absx - 0.5 * issq) def np_smooth_l1_grad(x, sigma): ssq = sigma * sigma return np.where(np.abs(x) < 1. / ssq, x * ssq, np.sign(x)) # Tests for unary operators (basic mathematical functions): # - Forward: Comparison to NumPy (several dtype) # - Backward: Comparison to NumPy (several dtype) # - Finite difference tests (only dtype = float64) # Seed set because the test is not robust enough to operate on random data @with_seed(192837465) def test_unary_math_operators(): have_scipy = True try: from scipy import special as scipy_special except: print("Could not import scipy. Skipping unit tests for special functions") have_scipy = False shape=(9, 10) dtype_l = [np.float64, np.float32, np.float16] rtol_l = [1e-7, 1e-6, 1e-2] rtol_less_l = [1e-6, 1e-5, 1e-2] atol_l = [1e-7, 1e-6, 1e-2] atol_less_l = [1e-6, 1e-5, 1e-2] rtol_fd = 1e-5 atol_fd = 1e-6 num_eps = 1e-6 unary_ops = { 'arccos' : [lambda x: mx.sym.arccos(x), lambda x: np.arccos(x), lambda x: -1. / np.sqrt(1. - x ** 2.), -0.95, 0.95], 'arccosh': [lambda x: mx.sym.arccosh(x), lambda x: np.arccosh(x), lambda x: 1. / np.sqrt(x ** 2 - 1.), 1.05, 10.0], 'arcsin': [lambda x: mx.sym.arcsin(x), lambda x: np.arcsin(x), lambda x: 1. / np.sqrt(1. - x ** 2), -0.95, 0.95], 'arcsinh': [lambda x: mx.sym.arcsinh(x), lambda x: np.arcsinh(x), lambda x: 1. / np.sqrt(x**2 + 1.), -5.0, 5.0], 'arctan': [lambda x: mx.sym.arctan(x), lambda x: np.arctan(x), lambda x: 1. / (x ** 2. + 1.), -5.0, 5.0], 'arctanh': [lambda x: mx.sym.arctanh(x), lambda x: np.arctanh(x), lambda x: 1. / (1. - x ** 2), -0.95, 0.95], 'cbrt': [lambda x: mx.sym.cbrt(x), lambda x: np.cbrt(x), lambda x: 1. / (3. * np.cbrt(x) ** 2), -10.0, 10.0], 'cos': [lambda x: mx.sym.cos(x), lambda x: np.cos(x), lambda x: -np.sin(x), -5.0, 5.0], 'cosh': [lambda x: mx.sym.cosh(x), lambda x: np.cosh(x), lambda x: np.sinh(x), -2.0, 2.0], 'exp': [lambda x: mx.sym.exp(x), lambda x: np.exp(x), lambda x: np.exp(x), -4.0, 4.0], 'expm1': [lambda x: mx.sym.expm1(x), lambda x: np.expm1(x), lambda x: np.exp(x), -0.1, 0.1], 'log': [lambda x: mx.sym.log(x), lambda x: np.log(x), lambda x: 1. / x, 0.01, 100.0], 'log10': [lambda x: mx.sym.log10(x), lambda x: np.log10(x), lambda x: 1. / (x * np.log(10.)), 0.01, 100.0], 'log2': [lambda x: mx.sym.log2(x), lambda x: np.log2(x), lambda x: 1. / (x * np.log(2.)), 0.01, 100.0], 'log1p': [lambda x: mx.sym.log1p(x), lambda x: np.log1p(x), lambda x: 1. / (1. + x), -0.1, 0.1], 'rcbrt': [lambda x: mx.sym.rcbrt(x), lambda x: 1. / np.cbrt(x), lambda x: -1. / (3. * x * np.cbrt(x)), 0.01, 100.0], 'reciprocal': [lambda x: mx.sym.reciprocal(x), lambda x: 1. / x, lambda x: -1. / (x ** 2), 0.01, 100.0], 'relu': [lambda x: mx.sym.relu(x), lambda x: np.maximum(x, 0.), lambda x: 1. * (x > 0.), -5.0, 5.0], 'rsqrt': [lambda x: mx.sym.rsqrt(x), lambda x: 1. / np.sqrt(x), lambda x: -0.5 / (x * np.sqrt(x)), 0.01, 100.0], 'sigmoid': [lambda x: mx.sym.sigmoid(x), lambda x: 1. / (np.exp(-x) + 1.), lambda x: 1. / (np.exp(-x) + 1.) / (np.exp(x) + 1.), -3.0, 3.0], 'softsign': [lambda x: mx.sym.softsign(x), lambda x: x / (1. + np.abs(x)), lambda x: 1. / np.square(1. 
+ np.abs(x)), -3.0, 3.0], 'sin': [lambda x: mx.sym.sin(x), lambda x: np.sin(x), lambda x: np.cos(x), -5.0, 5.0], 'sinh': [lambda x: mx.sym.sinh(x), lambda x: np.sinh(x), lambda x: np.cosh(x), -2.0, 2.0], 'sqrt': [lambda x: mx.sym.sqrt(x), lambda x: np.sqrt(x), lambda x: 0.5 / np.sqrt(x), 0.01, 100.0], 'tan': [lambda x: mx.sym.tan(x), lambda x: np.tan(x), lambda x: np.tan(x) ** 2 + 1., -1.5, 1.5], 'tanh': [lambda x: mx.sym.tanh(x), lambda x: np.tanh(x), lambda x: 1. - np.tanh(x) ** 2, -4.0, 4.0], 'smooth_l1_sig1': [lambda x: mx.sym.smooth_l1(x, scalar=1.), lambda x: np_smooth_l1(x, 1.), lambda x: np_smooth_l1_grad(x, 1.), -2.0, 2.0], 'smooth_l1_sig_default': [lambda x: mx.sym.smooth_l1(x), lambda x: np_smooth_l1(x, 1.), lambda x: np_smooth_l1_grad(x, 1.), -2.0, 2.0], 'smooth_l1_sig2': [lambda x: mx.sym.smooth_l1(x, scalar=2.), lambda x: np_smooth_l1(x, 2.), lambda x: np_smooth_l1_grad(x, 2.), -1.0, 1.0] } if have_scipy: unary_ops['gamma'] = [lambda x: mx.sym.gamma(x), lambda x: scipy_special.gamma(x), lambda x: scipy_special.gamma(x) * scipy_special.psi(x), 0.01, 5.0] unary_ops['gammaln'] = [lambda x: mx.sym.gammaln(x), lambda x: scipy_special.gammaln(x), lambda x: scipy_special.psi(x), 0.01, 20.0] # Loop over operators for name, op in unary_ops.items(): # Loop over dtype's for ind in range(len(dtype_l)): dtype = dtype_l[ind] if name == 'gammaln' or name == 'gamma': rtol = rtol_less_l[ind] atol = atol_less_l[ind] else: rtol = rtol_l[ind] atol = atol_l[ind] compare_forw_backw_unary_op( name, op[0], op[1], op[2], shape, op[3], op[4], rtol, atol, dtype) # Finite difference testing finite_diff_unary_op( name, op[0], shape, op[3], op[4], rtol_fd, atol_fd, num_eps) def compare_forw_backw_binary_op( name, forward_mxnet_call, forward_numpy_call, backward1_numpy_call, backward2_numpy_call, shape, input1_low, input1_high, input2_low, input2_high, rtol, atol, dtype=np.float32): check_fw = lambda sym, location, expected :\ check_symbolic_forward(sym, location, expected, rtol=rtol, atol=atol, dtype=dtype) check_bw = lambda sym, location, out_grads, expected :\ check_symbolic_backward(sym, location, out_grads, expected, rtol=rtol, atol=atol, dtype=dtype) op_name = 'binary_op={}, dtype={}'.format(name, dtype) data1 = mx.symbol.Variable(op_name + '_data1', dtype=dtype) data2 = mx.symbol.Variable(op_name + '_data2', dtype=dtype) # Comparison: Forward expression data1_np = np.random.uniform(input1_low, input1_high, shape).astype(dtype) data2_np = np.random.uniform(input2_low, input2_high, shape).astype(dtype) res_np = forward_numpy_call(data1_np, data2_np) op_ex = mx.sym.broadcast_add( forward_mxnet_call(data1, data2), mx.sym.zeros_like(data1), name=op_name) check_fw(op_ex, [data1_np, data2_np], [res_np]) # Comparison: Backward expression res_grad = np.random.uniform(-2.0, 2.0, shape).astype(dtype) data1_grad = backward1_numpy_call(data1_np, data2_np) * res_grad data2_grad = backward2_numpy_call(data1_np, data2_np) * res_grad check_bw(op_ex, [data1_np, data2_np], [res_grad], [data1_grad, data2_grad]) def finite_diff_binary_op( name, forward_mxnet_call, shape, input1_low, input1_high, input2_low, input2_high, rtol, atol, num_eps): # Finite difference tests are done in float64 dtype = np.float64 check_grad = lambda sym, location:\ check_numeric_gradient(sym, location, numeric_eps=num_eps, rtol=rtol, atol=atol, dtype=dtype) data1_np = np.random.uniform(input1_low, input1_high, shape).astype(dtype) data2_np = np.random.uniform(input2_low, input2_high, shape).astype(dtype) data1 = mx.symbol.Variable('data1', 
dtype=dtype) data2 = mx.symbol.Variable('data2', dtype=dtype) op_name = 'binary_op={}, dtype={}'.format(name, dtype) op_ex = mx.sym.broadcast_add( forward_mxnet_call(data1, data2), mx.sym.zeros_like(data1), name=op_name) check_grad(op_ex, [data1_np, data2_np]) # Tests for unary operators (basic mathematical functions): # - Forward: Comparison to NumPy (several dtype) # - Backward: Comparison to NumPy (several dtype) # - Finite difference tests (only dtype = float64) @with_seed() def test_binary_math_operators(): shape=(9, 10) dtype_l = [np.float64, np.float32, np.float16] rtol_l = [1e-7, 1e-6, 1e-2] atol_l = [1e-7, 1e-6, 1e-2] rtol_fd = 1e-5 atol_fd = 1e-6 num_eps = 1e-6 binary_ops = { 'hypot' : [lambda x, y: mx.sym.hypot(x, y), lambda x, y: np.hypot(x, y), lambda x, y: x / np.hypot(x, y), lambda x, y: y / np.hypot(x, y), -5.0, 5.0, -5.0, 5.0], 'pow': [lambda x, y: mx.sym.pow(x, y), lambda x, y: np.power(x, y), lambda x, y: np.power(x, y - 1.) * y, lambda x, y: np.power(x, y) * np.log(x), 0.2, 5.0, -4.0, 4.0] } # Loop over operators for name, op in binary_ops.items(): # Loop over dtype's for ind in range(len(dtype_l)): dtype = dtype_l[ind] compare_forw_backw_binary_op( name, op[0], op[1], op[2], op[3], shape, op[4], op[5], op[6], op[7], rtol_l[ind], atol_l[ind], dtype) # Finite difference testing finite_diff_binary_op( name, op[0], shape, op[4], op[5], op[6], op[7], rtol_fd, atol_fd, num_eps) @with_seed() def test_softmax(): check_softmax_with_shape((3, 4), default_context(), preserve_shape=False) check_softmax_with_shape((3, 4), default_context(), preserve_shape=True) check_softmax_with_shape((3, 4, 2), default_context(), preserve_shape=True) check_softmax_grad(default_context()) check_smoothed_softmax_grad(default_context()) @with_seed() def test_slice(): def test_slice_forward_backward(a, index): a_np = a.asnumpy() begin = [] end = [] step = [] for slice_i in index: begin.append(slice_i.start) end.append(slice_i.stop) step.append(slice_i.step) b = mx.nd.slice(a, begin=begin, end=end, step=step) b_np = a_np[index] assert same(b.asnumpy(), b_np) data = mx.sym.Variable('data') slice_sym = mx.sym.slice(data, begin=begin, end=end, step=step) expected_in_grad = np.zeros_like(a_np) expected_in_grad[index] = b_np check_symbolic_backward(slice_sym, [a_np], [b_np], [expected_in_grad]) shape = (16, 14, 17, 20) arr = mx.nd.arange(np.prod(shape)).reshape(shape=shape) index_list = [(slice(None),), (slice(None), slice(None)), (slice(1, 10),), (slice(1, 10), slice(3, 9)), (slice(1, 10), slice(2, 5), slice(3, 6), slice(7, 10)), (slice(1, 10, 2), slice(2, 9, 3), slice(3, 6, 5), slice(7, 10, 2)), (slice(None, None, -1), slice(None, None, -1), slice(None, None, -1)), (slice(10, 0, -2), slice(5, 2, -1), slice(7, None, 3), slice(None, 12, 4))] for index in index_list: test_slice_forward_backward(arr, index) # check numeric gradient in_data = np.arange(36).reshape(2, 2, 3, 3) data = mx.sym.Variable('data') slice_sym = mx.sym.slice(data, begin=[0, None], end=[1, None], step=[2, -1]) check_numeric_gradient(slice_sym, [in_data]) def test_slice_partial_infer(): def check_slice_partial_infer(data, begin, end, step, expected_out_shape): out = mx.sym.slice(data, begin=begin, end=end, step=step) assert (out.infer_shape_partial()[1][0] == expected_out_shape), out.infer_shape_partial()[1] def check_slice_axis_partial_infer(data, axis, begin, end, expected_out_shape): out = mx.sym.slice_axis(data, axis=axis, begin=begin, end=end) assert (out.infer_shape_partial()[1][0] == expected_out_shape), 
out.infer_shape_partial()[1] var1 = mx.sym.var(name="data", shape=(0, 20)) check_slice_partial_infer(var1, (None, None), (None, 10), [], (0, 10)) check_slice_partial_infer(var1, (None, None), (None, 10), (None, 2), (0, 5)) check_slice_partial_infer(var1, (None, 3), (None, 10), [], (0, 7)) check_slice_partial_infer(var1, (None, 3), (5, 10), [], (0, 7)) check_slice_partial_infer(var1, (2, 3), (None, 10), [], (0, 7)) check_slice_partial_infer(var1, (2, 3), (None, 10), (None, 1), (0, 7)) check_slice_partial_infer(var1, (2, 3), (None, 10), (3, 3), (0, 3)) var1 = mx.sym.var(name="data", shape=(10, 0)) check_slice_axis_partial_infer(var1, 0, 0, 5, (5, 0)) check_slice_axis_partial_infer(var1, 1, 0, 5, (10, 0)) @with_seed() def test_float16_min_max(): """Test for issue: https://github.com/apache/incubator-mxnet/issues/9007""" a = mx.nd.array([np.finfo('float16').min, np.finfo('float16').max], dtype='float16') assert a.dtype == np.float16 assert np.finfo('float16').min == mx.nd.min(a).asscalar() assert np.finfo('float16').max == mx.nd.max(a).asscalar() @with_seed() def test_squeeze_op(): def check_squeeze_op(shape, axis=None): data = mx.nd.random.uniform(low=-10.0, high=10.0, shape=shape) if axis is None: out = mx.nd.squeeze(data).asnumpy() out_expected = np.squeeze(data.asnumpy()) else: out = mx.nd.squeeze(data, axis=axis).asnumpy() out_expected = np.squeeze(data.asnumpy(), axis=axis) if out.shape == (1,): # as an exception (1, 1, 1) will be squeezed to (1,) out_expected = np.squeeze(data.asnumpy(), axis=tuple([i for i in range(1, len(shape))])) assert same(out, out_expected) # check forward check_squeeze_op((1, 5, 1, 3, 1), 0) check_squeeze_op((1, 5, 1, 3, 1), 2) check_squeeze_op((1, 5, 1, 3, 1), 4) check_squeeze_op((1, 5, 1, 3, 1), (0, 4)) check_squeeze_op((1, 5, 1, 3, 1), (0, 2, 4)) check_squeeze_op((1, 5, 1, 3, 1)) check_squeeze_op((1, 1, 1, 1)) # check gradient data = mx.symbol.Variable('data') shape = (1, 2, 1, 3, 1) data_tmp = np.ones(shape) test = mx.sym.squeeze(data) check_numeric_gradient(test, [data_tmp]) test = mx.sym.squeeze(data, axis=2) check_numeric_gradient(test, [data_tmp]) test = mx.sym.squeeze(data, axis=(2, 4)) check_numeric_gradient(test, [data_tmp]) @with_seed() def test_adaptive_avg_pool_op(): def py_adaptive_avg_pool(x, height, width): # 2D per frame adaptive avg pool def adaptive_avg_pool_frame(x, y): isizeH, isizeW = x.shape osizeH, osizeW = y.shape for oh in range(osizeH): istartH = int(np.floor(1.0 * (oh * isizeH) / osizeH)) iendH = int(np.ceil(1.0 * (oh + 1) * isizeH / osizeH)) kH = iendH - istartH for ow in range(osizeW): istartW = int(np.floor(1.0 * (ow * isizeW) / osizeW)) iendW = int(np.ceil(1.0 * (ow + 1) * isizeW / osizeW)) kW = iendW - istartW xsum = 0 for ih in range(kH): for iw in range(kW): xsum += x[istartH+ih][istartW+iw] y[oh][ow] = xsum / kH / kW B,C,_,_ = x.shape y = np.empty([B,C,height, width], dtype=x.dtype) for b in range(B): for c in range(C): adaptive_avg_pool_frame(x[b][c], y[b][c]) return y def check_adaptive_avg_pool_op(shape, output_height, output_width=None): x = mx.nd.random.uniform(shape=shape) if output_width is None: y = mx.nd.contrib.AdaptiveAvgPooling2D(x, output_size=output_height) npy = py_adaptive_avg_pool(x.asnumpy(), output_height, output_height) else: y = mx.nd.contrib.AdaptiveAvgPooling2D(x, output_size=(output_height, output_width)) npy = py_adaptive_avg_pool(x.asnumpy(), output_height, output_width) assert_almost_equal(y.asnumpy(), npy) shape = (2, 2, 10, 10) for i in range(1, 11): check_adaptive_avg_pool_op(shape, i) for j in 
range(1, 11): check_adaptive_avg_pool_op(shape, i, j) @with_seed() def test_bilinear_resize_op(): def py_bilinear_resize(x, outputHeight, outputWidth): batch, channel, inputHeight, inputWidth = x.shape if outputHeight == inputHeight and outputWidth == inputWidth: return x y = np.empty([batch, channel, outputHeight, outputWidth]) rheight = 1.0 * (inputHeight - 1) / (outputHeight - 1) if outputHeight > 1 else 0.0 rwidth = 1.0 * (inputWidth - 1) / (outputWidth - 1) if outputWidth > 1 else 0.0 for h2 in range(outputHeight): h1r = 1.0 * h2 * rheight h1 = int(np.floor(h1r)) h1lambda = h1r - h1 h1p = 1 if h1 < (inputHeight - 1) else 0 for w2 in range(outputWidth): w1r = 1.0 * w2 * rwidth w1 = int(np.floor(w1r)) w1lambda = w1r - w1 w1p = 1 if w1 < (inputWidth - 1) else 0 for b in range(batch): for c in range(channel): y[b][c][h2][w2] = (1-h1lambda)*((1-w1lambda)*x[b][c][h1][w1] + \ w1lambda*x[b][c][h1][w1+w1p]) + \ h1lambda*((1-w1lambda)*x[b][c][h1+h1p][w1] + \ w1lambda*x[b][c][h1+h1p][w1+w1p]) return y def check_bilinear_resize_op(shape, height, width): x = mx.nd.random.uniform(shape=shape) y = mx.nd.contrib.BilinearResize2D(x, height=height, width=width) assert_almost_equal(y.asnumpy(), py_bilinear_resize(x.asnumpy(), height, width)) shape = (2, 2, 10, 10) check_bilinear_resize_op(shape, 5, 5) check_bilinear_resize_op(shape, 10, 10) check_bilinear_resize_op(shape, 15, 15) check_bilinear_resize_op(shape, 3, 7) check_bilinear_resize_op(shape, 13, 17) def test_multi_proposal_op(): # parameters feature_stride = 16 scales = (8, 16, 32) ratios = (0.5, 1, 2) rpn_pre_nms_top_n = 12000 rpn_post_nms_top_n = 2000 threshold = 0.7 rpn_min_size = 16 batch_size = 20 feat_len = (1000 + 15) // 16 H, W = feat_len, feat_len num_anchors = len(scales) * len(ratios) count_anchors = H * W * num_anchors ''' cls_prob: (batch_size, 2 * num_anchors, H, W) bbox_pred: (batch_size, 4 * num_anchors, H, W) im_info: (batch_size, 3) ''' cls_prob = mx.nd.empty((batch_size, 2 * num_anchors, H, W), dtype = np.float32) bbox_pred = mx.nd.empty((batch_size, 4 * num_anchors, H, W), dtype = np.float32) im_info = mx.nd.empty((batch_size, 3), dtype = np.float32) cls_prob = mx.nd.array(np.random.random(cls_prob.shape)) bbox_pred = mx.nd.array(np.random.random(bbox_pred.shape)) for i in range(batch_size): im_size = np.random.randint(100, feat_len * feature_stride, size = (2,)) im_scale = np.random.randint(70, 100) / 100.0 im_info[i, :] = [im_size[0], im_size[1], im_scale] def get_sub(arr, i): new_shape = list(arr.shape) new_shape[0] = 1 res = arr[i].reshape(new_shape) return res def check_forward(rpn_pre_nms_top_n, rpn_post_nms_top_n): single_proposal = [] single_score = [] for i in range(batch_size): rois, score = mx.nd.contrib.Proposal( cls_prob = get_sub(cls_prob, i), bbox_pred = get_sub(bbox_pred, i), im_info = get_sub(im_info, i), feature_stride = feature_stride, scales = scales, ratios = ratios, rpn_pre_nms_top_n = rpn_pre_nms_top_n, rpn_post_nms_top_n = rpn_post_nms_top_n, threshold = threshold, rpn_min_size = rpn_min_size, output_score = True) single_proposal.append(rois) single_score.append(score) multi_proposal, multi_score = mx.nd.contrib.MultiProposal( cls_prob = cls_prob, bbox_pred = bbox_pred, im_info = im_info, feature_stride = feature_stride, scales = scales, ratios = ratios, rpn_pre_nms_top_n = rpn_pre_nms_top_n, rpn_post_nms_top_n = rpn_post_nms_top_n, threshold = threshold, rpn_min_size = rpn_min_size, output_score = True) single_proposal = mx.nd.stack(*single_proposal).reshape(multi_proposal.shape) single_score =
mx.nd.stack(*single_score).reshape(multi_score.shape) single_proposal_np = single_proposal.asnumpy() multi_proposal_np = multi_proposal.asnumpy() single_score_np = single_score.asnumpy() multi_score_np = multi_score.asnumpy() # check rois x1,y1,x2,y2 assert np.allclose(single_proposal_np[:, 1:], multi_proposal_np[:, 1:]) # check rois batch_idx for i in range(batch_size): start = i * rpn_post_nms_top_n end = start + rpn_post_nms_top_n assert (multi_proposal_np[start:end, 0] == i).all() # check score assert np.allclose(single_score_np, multi_score_np) def check_backward(rpn_pre_nms_top_n, rpn_post_nms_top_n): im_info_sym = mx.sym.Variable('im_info') cls_prob_sym = mx.sym.Variable('cls_prob') bbox_pred_sym = mx.sym.Variable('bbox_pred') sym = mx.sym.contrib.MultiProposal( cls_prob = cls_prob_sym, bbox_pred = bbox_pred_sym, im_info = im_info_sym, feature_stride = feature_stride, scales = scales, ratios = ratios, rpn_pre_nms_top_n = rpn_pre_nms_top_n, rpn_post_nms_top_n = rpn_post_nms_top_n, threshold = threshold, rpn_min_size = rpn_min_size, output_score = False) location = [cls_prob.asnumpy(), bbox_pred.asnumpy(), im_info.asnumpy()] expected = [np.zeros_like(e) for e in location] out_grads = [np.ones((rpn_post_nms_top_n, 5))] check_symbolic_backward(sym, location, out_grads, expected) check_forward(rpn_pre_nms_top_n, rpn_post_nms_top_n) check_forward(rpn_pre_nms_top_n, 1500) check_forward(1000, 500) check_backward(rpn_pre_nms_top_n, rpn_post_nms_top_n) @with_seed() def test_quadratic_function(): def f(x, a, b, c): return a * x**2 + b * x + c a = np.random.random_sample() b = np.random.random_sample() c = np.random.random_sample() data = mx.symbol.Variable('data') quad_sym = mx.sym.contrib.quadratic(data=data, a=a, b=b, c=c) for dtype in [np.float16, np.float32, np.float64]: for ndim in range(1, 6): shape = rand_shape_nd(ndim, 5) data_np = np.random.randn(*shape).astype(dtype) expected = f(data_np, a, b, c) backward_expected = 2 * a * data_np + b # check imperative forward output = mx.nd.contrib.quadratic(mx.nd.array(data_np), a=a, b=b, c=c) assert_almost_equal(output.asnumpy(),expected, rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5) # check forward check_symbolic_forward(quad_sym, [data_np], [expected], rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5) # check backward check_symbolic_backward(quad_sym, [data_np], [np.ones(expected.shape)], [backward_expected], rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5) # check backward using finite difference check_numeric_gradient(quad_sym, [data_np], atol=0.001) @with_seed() def test_histogram(): def f(x, bins=10, range=None): return np.histogram(x, bins, range=range) for ndim in range(1, 6): shape = rand_shape_nd(ndim) x = rand_ndarray(shape, stype='default', dtype=np.float64) mx_bins = mx.nd.array([-1.0, 0.5, 2.0, 4.5, 50.0], dtype=np.float64) np_bins = mx_bins.asnumpy() bin_cnt = random.randint(2, 10) bin_range = (-2.5, 2.5) mx_histo1, mx_bins1 = mx.nd.histogram(x, bins=bin_cnt, range=bin_range) np_histo1, np_bins1 = f(x.asnumpy(), bins=bin_cnt, range=bin_range) assert_almost_equal(mx_bins1.asnumpy(), np_bins1) assert_almost_equal(mx_histo1.asnumpy(), np_histo1, rtol=1e-3, atol=1e-5) mx_histo2, mx_bins2 = mx.nd.histogram(x, bins=mx_bins) np_histo2, np_bins2 = f(x.asnumpy(), bins=np_bins) assert_almost_equal(mx_histo2.asnumpy(), np_histo2, rtol=1e-3, atol=1e-5) assert_almost_equal(mx_bins2.asnumpy(), np_bins2, rtol=1e-3, atol=1e-5) 
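# Repeat both histogram cases through the symbolic API and compare the bound executors' outputs against the same NumPy references.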
        data = mx.sym.Variable("data")
        bins = mx.sym.Variable("bins")
        histo1 = mx.sym.histogram(a=data, bins=bin_cnt, range=bin_range)
        histo2 = mx.sym.histogram(a=data, bins=bins)
        executor1 = histo1.bind(ctx=default_context(), args={"data" : x})
        executor1.forward(is_train=False)
        assert_almost_equal(np_histo1, executor1.outputs[0].asnumpy(), 0, 0,
                            ("EXPECTED_histo1", "FORWARD_histo1"), equal_nan=False)
        executor2 = histo2.bind(ctx=default_context(), args={"data" : x, "bins" : mx_bins})
        executor2.forward(is_train=False)
        assert_almost_equal(np_histo2, executor2.outputs[0].asnumpy(), 0, 0,
                            ("EXPECTED_histo2", "FORWARD_histo2"), equal_nan=False)


def test_op_output_names_monitor():
    def check_name(op_sym, expected_names):
        output_names = []

        def get_output_names_callback(name, arr):
            output_names.append(py_str(name))

        op_exe = op_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
        op_exe.set_monitor_callback(get_output_names_callback)
        op_exe.forward()
        for output_name, expected_name in zip(output_names, expected_names):
            assert output_name == expected_name

    data = mx.sym.Variable('data', shape=(10, 3, 10, 10))
    conv_sym = mx.sym.Convolution(data, kernel=(2, 2), num_filter=1, name='conv')
    check_name(conv_sym, ['conv_output'])

    deconv_sym = mx.sym.Deconvolution(data, kernel=(2, 2), num_filter=1, name='deconv')
    check_name(deconv_sym, ['deconv_output'])

    fc_sym = mx.sym.FullyConnected(data, num_hidden=10, name='fc')
    check_name(fc_sym, ['fc_output'])

    lrn_sym = mx.sym.LRN(data, nsize=1, name='lrn')
    check_name(lrn_sym, ['lrn_output', 'lrn_tmp_norm'])

    act_sym = mx.sym.Activation(data, act_type='relu', name='act')
    check_name(act_sym, ['act_output'])

    cc_sym = mx.sym.concat(data, data, dim=0, name='concat')
    check_name(cc_sym, ['concat_output'])

    sm_sym = mx.sym.softmax(data, name='softmax')
    check_name(sm_sym, ['softmax_output'])

    sa_sym = mx.sym.SoftmaxActivation(data, name='softmax')
    check_name(sa_sym, ['softmax_output'])

    us_sym = mx.sym.UpSampling(data, scale=2, sample_type='nearest', name='upsampling')
    check_name(us_sym, ['upsampling_output'])

    us_sym = mx.sym.Pooling(data, kernel=(2, 2), pool_type='avg', name='pooling')
    check_name(us_sym, ['pooling_output'])

@with_seed()
def test_activation():
    shape = (9, 10)
    dtype_l = [np.float64, np.float32, np.float16]
    rtol_l = [1e-7, 1e-6, 1e-2]
    atol_l = [1e-7, 1e-6, 1e-2]
    rtol_fd = 1e-5
    atol_fd = 1e-6
    num_eps = 1e-6
    unary_ops = {
        'relu': [lambda x: mx.sym.Activation(x, act_type='relu'),
                 lambda x: np.maximum(x, 0.),
                 lambda x: 1. * (x > 0.),
                 -5.0, 5.0],
        'sigmoid': [lambda x: mx.sym.Activation(x, act_type='sigmoid'),
                    lambda x: 1. / (np.exp(-x) + 1.),
                    lambda x: 1. / (np.exp(-x) + 1.) / (np.exp(x) + 1.),
                    -3.0, 3.0],
        'tanh': [lambda x: mx.sym.Activation(x, act_type='tanh'),
                 lambda x: np.tanh(x),
                 lambda x: 1. - np.tanh(x) ** 2,
                 -4.0, 4.0],
        'softrelu': [lambda x: mx.sym.Activation(x, act_type='softrelu'),
                     lambda x: np.log(1. + np.exp(x)),
                     lambda x: 1. - 1 / (1 + np.exp(x)),
                     -3.0, 3.0],
        'softsign': [lambda x: mx.sym.Activation(x, act_type='softsign'),
                     lambda x: x / (1. + np.abs(x)),
                     lambda x: 1. / np.square(1. + np.abs(x)),
                     -3.0, 3.0],
    }
    # Loop over operators
    for name, op in unary_ops.items():
        # Loop over dtype's
        for ind in range(len(dtype_l)):
            dtype = dtype_l[ind]
            rtol = rtol_l[ind]
            atol = atol_l[ind]
            compare_forw_backw_unary_op(
                name, op[0], op[1], op[2], shape, op[3], op[4], rtol, atol,
                dtype)
        # Finite difference testing
        finite_diff_unary_op(
            name, op[0], shape, op[3], op[4], rtol_fd, atol_fd, num_eps)


@with_seed()
def test_ravel():
    # be aware that check_symbolic_forward will use float type internally
    # for the arrays and that limits the representable flat index range.
    # Taking dim==4 and a range of [0,..,100] for the data can already
    # cause precision issues and break this test.
    for dim in [1, 2, 3, 4]:
        data = np.random.randint(50, size=(dim, 500))
        shape = tuple(np.add(np.amax(data, axis=1), [1]))
        a = mx.sym.Variable('a')
        ravel_npy = np.ravel_multi_index(data, shape)
        b = mx.sym.ravel_multi_index(a, shape=shape)
        check_symbolic_forward(b, location={'a': data}, expected=[ravel_npy])
        c = mx.sym.unravel_index(a, shape=shape)
        check_symbolic_forward(c, location={'a': ravel_npy}, expected=[data])
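
# Illustrative sketch (not part of the upstream test suite): a tiny concrete
# instance of the row-major flattening exercised by test_ravel. For shape
# (3, 4), the multi-index (1, 2) maps to flat index 1*4 + 2 = 6, and
# unravel_index inverts the mapping. The helper name below is made up for
# illustration and is not called by the test runner.
def _example_ravel_roundtrip():
    shape = (3, 4)
    multi_idx = mx.nd.array([[1], [2]])                     # one index per dimension
    flat = mx.nd.ravel_multi_index(multi_idx, shape=shape)  # expected: [6.]
    back = mx.nd.unravel_index(flat, shape=shape)           # expected: [[1.], [2.]]
    assert flat.asnumpy()[0] == 6
    assert_array_equal(back.asnumpy(), multi_idx.asnumpy())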

def test_context_num_gpus():
    try:
        # Note: the test is run both on GPU and CPU hosts, so that we can not assert
        # on a specific number here.
        assert mx.context.num_gpus() >= 0
    except mx.MXNetError as e:
        # Note: On a CPU only host CUDA sometimes is not able to determine the number
        # of GPUs
        if str(e).find("CUDA") == -1:
            raise e


@with_seed()
def test_op_roi_align():
    # Adapted from https://github.com/wkcn/MobulaOP/blob/master/tests/test_roi_align_op.py
    def bilinear_interpolate(bottom, height, width, y, x):
        if y < -1.0 or y > height or x < -1.0 or x > width:
            return 0.0, []
        x = max(0.0, x)
        y = max(0.0, y)
        x_low = int(x)
        y_low = int(y)
        if x_low >= width - 1:
            x_low = x_high = width - 1
            x = x_low
        else:
            x_high = x_low + 1
        if y_low >= height - 1:
            y_low = y_high = height - 1
            y = y_low
        else:
            y_high = y_low + 1
        ly = y - y_low
        lx = x - x_low
        hy = 1.0 - ly
        hx = 1.0 - lx
        v1 = bottom[y_low, x_low]
        v2 = bottom[y_low, x_high]
        v3 = bottom[y_high, x_low]
        v4 = bottom[y_high, x_high]
        '''
        ----------->x
        |hx hy | lx hy
        |------+------
        |hx ly | lx ly
        V
        y
        v1|v2
        --+--
        v3|v4
        '''
        w1 = hy * hx
        w2 = hy * lx
        w3 = ly * hx
        w4 = ly * lx
        val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
        grad = [(y_low, x_low, w1), (y_low, x_high, w2),
                (y_high, x_low, w3), (y_high, x_high, w4)]
        return val, grad

    def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_ratio, dy):
        N, C, H, W = data.shape
        R = rois.shape[0]
        PH, PW = pooled_size
        assert len(rois.shape) == 2
        assert rois.shape[1] == 5

        out = np.zeros((R, C, PH, PW))
        dx = np.zeros_like(data)
        drois = np.zeros_like(rois)

        for r in range(R):
            batch_ind = int(rois[r, 0])
            sw, sh, ew, eh = rois[r, 1:5] * spatial_scale
            roi_w = max(ew - sw, 1.0)
            roi_h = max(eh - sh, 1.0)
            bin_h = roi_h * 1.0 / PH
            bin_w = roi_w * 1.0 / PW
            bdata = data[batch_ind]
            if sampling_ratio > 0:
                roi_bin_grid_h = roi_bin_grid_w = sampling_ratio
            else:
                roi_bin_grid_h = int(np.ceil(roi_h * 1.0 / PH))
                roi_bin_grid_w = int(np.ceil(roi_w * 1.0 / PW))
            count = roi_bin_grid_h * roi_bin_grid_w
            for c in range(C):
                for ph in range(PH):
                    for pw in range(PW):
                        val = 0.0
                        for iy in range(roi_bin_grid_h):
                            y = sh + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h
                            for ix in range(roi_bin_grid_w):
                                x = sw + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w
                                v, g = bilinear_interpolate(bdata[c], H, W, y, x)
                                val += v
                                # compute grad
                                for qy, qx, qw in g:
                                    dx[batch_ind, c, qy, qx] += dy[r, c, ph, pw] * qw * 1.0 / count
                        out[r, c, ph, pw] = val * 1.0 / count
        return out, [dx, drois]

    def test_roi_align_value(sampling_ratio=0):
        ctx = default_context()
        dtype = np.float32

        dlen = 224
        N, C, H, W = 5, 3, 16, 16
        assert H == W
        R = 7
        pooled_size = (3, 4)

        spatial_scale = H * 1.0 / dlen
        data = mx.nd.array(np.arange(N*C*W*H).reshape((N,C,H,W)), ctx=ctx, dtype = dtype)
        # data = mx.nd.random.uniform(0, 1, (N, C, H, W), dtype = dtype)
        center_xy = mx.nd.random.uniform(0, dlen, (R, 2), ctx=ctx, dtype = dtype)
        wh = mx.nd.random.uniform(0, dlen, (R, 2), ctx=ctx, dtype = dtype)
        batch_ind = mx.nd.array(np.random.randint(0, N, size = (R,1)), ctx=ctx)
        pos = mx.nd.concat(center_xy - wh / 2, center_xy + wh / 2, dim = 1)
        rois = mx.nd.concat(batch_ind, pos, dim = 1)

        data.attach_grad()
        rois.attach_grad()
        with mx.autograd.record():
            output = mx.nd.contrib.ROIAlign(data, rois, pooled_size=pooled_size,
                                            spatial_scale=spatial_scale, sample_ratio=sampling_ratio)
        dy = mx.nd.random.uniform(-1, 1, (R, C) + pooled_size, ctx=ctx, dtype = dtype)
        output.backward(dy)
        real_output, [dx, drois] = roialign_forward_backward(data.asnumpy(), rois.asnumpy(), pooled_size,
                                                             spatial_scale, sampling_ratio, dy.asnumpy())
        assert np.allclose(output.asnumpy(), real_output)
        # It seems that the precision between Cfloat and Pyfloat is different.
        assert np.allclose(data.grad.asnumpy(), dx, atol = 1e-5), np.abs(data.grad.asnumpy() - dx).max()
        assert np.allclose(rois.grad.asnumpy(), drois)

    # modified from test_roipooling()
    def test_roi_align_autograd(sampling_ratio=0):
        ctx = default_context()
        data = mx.symbol.Variable(name='data')
        rois = mx.symbol.Variable(name='rois')
        test = mx.symbol.contrib.ROIAlign(data=data, rois=rois, pooled_size=(4, 4), spatial_scale=1,
                                          sample_ratio=sampling_ratio)

        x1 = np.random.rand(4, 1, 12, 12).astype('float64')
        x2 = np.array([[0, 1.1, 1.1, 6.2, 6.2], [2, 6.1, 2.1, 8.2, 11.2],
                       [1, 3.1, 1.1, 5.2, 10.2]], dtype='float64')

        check_numeric_gradient(sym=test, location=[x1, x2],
                               grad_nodes={'data':'write', 'rois':'null'},
                               numeric_eps=1e-4, rtol=1e-1, atol=1e-4, ctx=ctx)
        check_numeric_gradient(sym=test, location=[x1, x2],
                               grad_nodes={'data':'add', 'rois':'null'},
                               numeric_eps=1e-4, rtol=1e-1, atol=1e-4, ctx=ctx)

    test_roi_align_value()
    test_roi_align_value(2)
    test_roi_align_autograd()

@with_seed()
def test_diag():
    # Test 2d input
    h = np.random.randint(2,9)
    w = np.random.randint(2,9)
    a_np = np.random.random((h, w)).astype(np.float32)
    a = mx.nd.array(a_np).astype('float32')

    # k == 0
    r = mx.nd.diag(a)
    assert_almost_equal(r.asnumpy(), np.diag(a_np))

    # k == 1
    k = 1
    r = mx.nd.diag(a, k=k)
    assert_almost_equal(r.asnumpy(), np.diag(a_np, k=k))

    # k == -1
    k = -1
    r = mx.nd.diag(a, k=k)
    assert_almost_equal(r.asnumpy(), np.diag(a_np, k=k))

    # random k
    k = np.random.randint(-min(h,w) + 1, min(h,w))
    r = mx.nd.diag(a, k=k)
    assert_almost_equal(r.asnumpy(), np.diag(a_np, k=k))

    # invalid k
    k = max(h,w) + 1
    assertRaises(MXNetError, mx.nd.diag, a, k=k)

    # Test 2d backward, k=0
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 2d backward, k=1
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=1)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 2d backward, k=-1
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=-1)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 1d input
    d = np.random.randint(2,9)
    a_np = np.random.random((d))
    a = mx.nd.array(a_np)

    # k is random
    k = np.random.randint(-d,d)
    r = mx.nd.diag(a, k=k)
    assert_almost_equal(r.asnumpy(), np.diag(a_np, k=k))

    # Test 1d backward, k=0
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 1d backward, k=1
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=1)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 1d backward, k=-1
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=-1)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 4d input
    x1 = np.random.randint(3,9)
    x2 = np.random.randint(3,9)
    x3 = np.random.randint(3,9)
    x4 = np.random.randint(3,9)
    a_np = np.random.random((x1, x2, x3, x4)).astype(np.float32)
    a = mx.nd.array(a_np).astype('float32')

    # k = 0, axis1=0, axis2=1
    r = mx.nd.diag(data=a, k=0, axis1=0, axis2=1)
    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=0, axis1=0, axis2=1))

    # k = 1, axis1=1, axis2=0
    r = mx.nd.diag(data=a, k=1, axis1=1, axis2=0)
    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=1, axis1=1, axis2=0))

    # k = -1, axis1=1, axis2=3
    r = mx.nd.diag(data=a, k=-1, axis1=1, axis2=3)
    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=-1, axis1=1, axis2=3))

    # k = 2, axis1=-2, axis2=0
    r = mx.nd.diag(data=a, k=2, axis1=-2, axis2=0)
    assert_almost_equal(r.asnumpy(), np.diagonal(a_np, offset=2, axis1=-2, axis2=0))

    # Test 4d backward, k=0, axis1=3, axis2=0
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=0, axis1=3, axis2=0)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 4d backward, k=1, axis1=1, axis2=2
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=1, axis1=1, axis2=2)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 4d backward, k=-1, axis1=2, axis2=0
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=-1, axis1=2, axis2=0)
    check_numeric_gradient(diag_sym, [a_np])

    # Test 4d backward, k=-2, axis1=1, axis2=-1
    data = mx.sym.Variable('data')
    diag_sym = mx.sym.diag(data=data, k=-2, axis1=1, axis2=-1)
    check_numeric_gradient(diag_sym, [a_np])


@with_seed()
def test_depthtospace():
    def f(x, blocksize):
        b, c, h, w = x.shape[0], x.shape[1], x.shape[2], x.shape[3]
        tmp = np.reshape(x, [b, blocksize, blocksize, c // (blocksize**2), h, w])
        tmp = np.transpose(tmp, [0, 3, 4, 1, 5, 2])
        y = np.reshape(tmp, [b, c // (blocksize**2), h * blocksize, w * blocksize])
        return y

    block = random.randint(2, 4)
    rand_mul1 = random.randint(1, 4)
    n = random.randint(1, 5)
    c = block * block * rand_mul1
    h = random.randint(1, 5)
    w = random.randint(1, 5)
    shape_inp = (n, c, h, w)
    data = rand_ndarray(shape_inp, 'default')
    data_np = data.asnumpy()
    expected = f(data_np, block)
    output = mx.nd.depth_to_space(data, block)
    assert_almost_equal(output.asnumpy(), expected, atol=1e-3, rtol=1e-3)

    shape_out = (n, c // (block ** 2), h * block, w * block)
    data = mx.sym.Variable('data')
    dts_sym = mx.sym.depth_to_space(data, block)
    check_numeric_gradient(dts_sym, [np.ones(shape_inp)])

    check_symbolic_forward(dts_sym, [data_np], [expected])
    check_symbolic_backward(dts_sym, [data_np], [np.ones(shape_out)], [np.ones(shape_inp)])

    def test_invalid_depth_dim():
        invalid_shape_inp = (n, block - 1, h, w)
        data = rand_ndarray(invalid_shape_inp, 'default')
        assertRaises(MXNetError, mx.nd.depth_to_space, data, block)

    def test_invalid_space_dim():
        invalid_shape_inp = (n, block ** 2, 0, block + 1)
        data = rand_ndarray(invalid_shape_inp, 'default')
        assertRaises(MXNetError, mx.nd.depth_to_space, data, block)

    def test_invalid_block_size():
        block = 0
        invalid_shape_inp = (n, c, h, w)
        data = rand_ndarray(invalid_shape_inp, 'default')
        assertRaises(MXNetError, mx.nd.depth_to_space, data, block)

    test_invalid_depth_dim()
    test_invalid_space_dim()
    test_invalid_block_size()
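
# Illustrative sketch (not part of the upstream test suite): depth_to_space with
# block=2 moves each group of block*block channels into a block x block spatial
# tile, and space_to_depth inverts it. The tiny tensor and the helper name below
# are made up for illustration and are not called by the test runner.
def _example_depth_space_roundtrip():
    x = mx.nd.arange(4).reshape((1, 4, 1, 1))   # 4 channels, 1x1 spatial
    y = mx.nd.depth_to_space(x, 2)              # shape becomes (1, 1, 2, 2)
    assert y.shape == (1, 1, 2, 2)
    z = mx.nd.space_to_depth(y, 2)              # back to (1, 4, 1, 1)
    assert_array_equal(z.asnumpy(), x.asnumpy())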

@with_seed()
def test_spacetodepth():
    def f(x, blocksize):
        b, c, h, w = x.shape[0], x.shape[1], x.shape[2], x.shape[3]
        tmp = np.reshape(x, [b, c, h // blocksize, blocksize, w // blocksize, blocksize])
        tmp = np.transpose(tmp, [0, 3, 5, 1, 2, 4])
        y = np.reshape(tmp, [b, c * (blocksize**2), h // blocksize, w // blocksize])
        return y

    block = random.randint(2, 4)
    rand_mul1 = random.randint(1, 4)
    rand_mul2 = random.randint(1, 4)
    n = random.randint(1, 5)
    c = random.randint(1, 5)
    h = block * rand_mul1
    w = block * rand_mul2
    shape_inp = (n, c, h, w)
    data = rand_ndarray(shape_inp, 'default')
    data_np = data.asnumpy()
    expected = f(data_np, block)
    output = mx.nd.space_to_depth(data, block)
    assert_almost_equal(output.asnumpy(), expected, atol=1e-3, rtol=1e-3)

    shape_out = (n, c * (block ** 2), h // block, w // block)
    data = mx.sym.Variable('data')
    dts_sym = mx.sym.space_to_depth(data, block)
    check_numeric_gradient(dts_sym, [np.ones(shape_inp)])

    check_symbolic_forward(dts_sym, [data_np], [expected])
    check_symbolic_backward(dts_sym, [data_np], [np.ones(shape_out)], [np.ones(shape_inp)])

    def test_invalid_space_dim():
        invalid_shape_inp = (n, c, block - 1, w)
        data = rand_ndarray(invalid_shape_inp, 'default')
        assertRaises(MXNetError, mx.nd.space_to_depth, data, block)

    def test_invalid_block_size():
        block = 0
        invalid_shape_inp = (n, c, h, w)
        data = rand_ndarray(invalid_shape_inp, 'default')
        assertRaises(MXNetError, mx.nd.space_to_depth, data, block)

    def test_invalid_depth_dim():
        invalid_shape_inp = (n, 0, h, w)
        data = rand_ndarray(invalid_shape_inp, 'default')
        assertRaises(MXNetError, mx.nd.space_to_depth, data, block)

    test_invalid_space_dim()
    test_invalid_block_size()
    test_invalid_depth_dim()


@with_seed()
def test_softmax_cross_entropy():
    def f_sm_ce(data, label):
        return np.sum(-np.log(data) * label)

    data = mx.sym.Variable('data')
    label = mx.sym.Variable('label')
    sym = mx.sym.softmax_cross_entropy(data=data, label=label)
    num_labels = random.randint(100, 200)
    batch_size = random.randint(100, 200)
    np_data = rand_ndarray((batch_size, num_labels), stype='default').asnumpy()
    np_sm = np_softmax(np_data)
    np_label = np.random.randint(0, num_labels, (batch_size, ))
    np_one_hot_label = np.zeros((batch_size, num_labels))
    np_one_hot_label[np.arange(batch_size), np_label] = 1.
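    # The operator consumes integer class indices; the reference f_sm_ce uses the
    # equivalent one-hot form and sums -log(softmax(data)) * one_hot(label) over
    # the whole batch, so the expected output is a single scalar.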
    check_symbolic_forward(sym, {'data' : np_data, 'label' : np_label},
                           [np.array([f_sm_ce(np_sm, np_one_hot_label)])], rtol=1e-3, atol=1e-5)


@with_seed()
def test_invalid_kernel_size():
    invalid_kernel_size = 28
    assert_exception(
        mx.nd.Correlation,
        MXNetError,
        mx.nd.array(np.random.rand(1, 1, 28, 28)),
        mx.nd.array(np.random.rand(1, 1, 28, 28)),
        kernel_size=invalid_kernel_size)


@with_seed()
def test_valid_kernel_size():
    valid_kernel_size = 9
    mx.nd.Correlation(
        mx.nd.array(np.random.rand(1, 1, 28, 28)),
        mx.nd.array(np.random.rand(1, 1, 28, 28)),
        kernel_size=valid_kernel_size)


@with_seed()
def test_valid_max_pooling_pad_type_same():
    import math
    input_data = mx.nd.array(np.random.rand(1,1,10))
    stride = 2
    kernel = 2
    output_data = mx.nd.Pooling(
        input_data,
        kernel=kernel,
        stride=stride,
        pad=(0,0,0),
        pool_type='max',
        name='pooling',
        pooling_convention="same")
    assert(math.ceil(input_data.shape[2]/stride) == output_data.shape[2])


@with_seed()
def test_invalid_max_pooling_pad_type_same():
    import math
    input_data = mx.nd.array(np.random.rand(1,1,10))
    stride = 2
    kernel = 2
    pad = 2
    assert_exception(
        mx.nd.Pooling,
        MXNetError,
        input_data,
        stride=stride,
        kernel=kernel,
        pad=pad,
        pool_type='max',
        name='pooling',
        pooling_convention="same")


if __name__ == '__main__':
    import nose
    nose.runmodule()
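
# Illustrative note (not part of the upstream test suite): the two pooling tests
# above rely on the "same" pooling convention producing an output length of
# ceil(input_length / stride) along each pooled axis; with the made-up numbers
# used there, input length 10 and stride 2 give ceil(10 / 2) = 5, and supplying
# an explicit pad together with pooling_convention="same" is rejected.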