Customized loss function
This tutorial provides guidelines for using a customized loss function in network construction.
Model Training Example
Let’s begin with a small regression example. We can build and train a regression model with the following code:
data(BostonHousing, package = "mlbench")
BostonHousing[, sapply(BostonHousing, is.factor)] <-
  as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)]))
BostonHousing <- data.frame(scale(BostonHousing))
test.ind = seq(1, 506, 5) # 1 pt in 5 used for testing
train.x = data.matrix(BostonHousing[-test.ind,-14])
train.y = BostonHousing[-test.ind, 14]
test.x = data.matrix(BostonHousing[test.ind, -14])
test.y = BostonHousing[test.ind, 14]
require(mxnet)
## Loading required package: mxnet
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro")
mx.set.seed(0)
model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y,
                                     ctx = mx.cpu(),
                                     num.round = 5,
                                     array.batch.size = 60,
                                     optimizer = "rmsprop",
                                     verbose = TRUE,
                                     array.layout = "rowmajor",
                                     batch.end.callback = NULL,
                                     epoch.end.callback = NULL)
## Start training with 1 devices
pred <- predict(model, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum((test.y - pred[1,])^2) / length(test.y)
## [1] 0.2485236
Besides LinearRegressionOutput, we also provide LogisticRegressionOutput and MAERegressionOutput. However, these built-in loss functions might not be enough for real-world models. You can provide your own loss function by using mx.symbol.MakeLoss when constructing the network.
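For example, LogisticRegressionOutput is meant for binary targets: it applies a sigmoid to its input and trains against a 0/1 label. The snippet below is only an illustrative sketch of how it could be attached to a network (the Boston Housing target is continuous, so we do not train this one):
# Illustrative sketch only: a one-unit output layer for a binary (0/1) label.
data <- mx.symbol.Variable("data")
fc <- mx.symbol.FullyConnected(data, num_hidden = 1, name = "fc")
lro_logistic <- mx.symbol.LogisticRegressionOutput(fc, name = "lro_logistic")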
How to Use Your Own Loss Function
We still use our previous example, but this time we use mx.symbol.MakeLoss to minimize (pred - label)^2 ourselves.
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
# Reshape(shape = 0) flattens the output of fc2 so it matches the shape of the label.
lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name = "lro2")
Then we can train the network just as usual.
mx.set.seed(0)
model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y,
                                      ctx = mx.cpu(),
                                      num.round = 5,
                                      array.batch.size = 60,
                                      optimizer = "rmsprop",
                                      verbose = TRUE,
                                      array.layout = "rowmajor",
                                      batch.end.callback = NULL,
                                      epoch.end.callback = NULL)
## Start training with 1 devices
Because we are minimizing the same loss function, we should get very similar results. However, the result is quite different.
pred2 <- predict(model2, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum((test.y - pred2)^2) / length(test.y)
## [1] 1.234584
This is because the output of mx.symbol.MakeLoss is the gradient of the loss with respect to the input data. We can get the real prediction as shown below.
internals <- internals(model2$symbol)
fc_symbol <- internals[[match("fc2_output", outputs(internals))]]
model3 <- list(symbol = fc_symbol,
               arg.params = model2$arg.params,
               aux.params = model2$aux.params)
class(model3) <- "MXFeedForwardModel"
pred3 <- predict(model3, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum((test.y - pred3[1,])^2) / length(test.y)
## [1] 0.248294
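Because the same extraction pattern is used again below, it can be wrapped in a small helper. The function get.output.model is our own (hypothetical) name, not part of the mxnet API; it is a minimal sketch assuming a model created by mx.model.FeedForward.create:
# Hypothetical helper (not part of mxnet): build a prediction model whose
# output is a named internal layer of a trained feed-forward model.
get.output.model <- function(model, output.name) {
  internals <- internals(model$symbol)
  out_symbol <- internals[[match(output.name, outputs(internals))]]
  new.model <- list(symbol = out_symbol,
                    arg.params = model$arg.params,
                    aux.params = model$aux.params)
  class(new.model) <- "MXFeedForwardModel"
  new.model
}
With this helper, model3 above is simply get.output.model(model2, "fc2_output").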
We have provided many operations on the symbols. An example of |pred - label| can be found below.
lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label))
mx.set.seed(0)
model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y,
                                      ctx = mx.cpu(),
                                      num.round = 20,
                                      array.batch.size = 60,
                                      optimizer = "sgd",
                                      learning.rate = 0.001,
                                      verbose = TRUE,
                                      array.layout = "rowmajor",
                                      batch.end.callback = NULL,
                                      epoch.end.callback = NULL)
## Start training with 1 devices
internals <- internals(model4$symbol)
fc_symbol <- internals[[match("fc2_output", outputs(internals))]]
model5 <- list(symbol = fc_symbol,
               arg.params = model4$arg.params,
               aux.params = model4$aux.params)
class(model5) <- "MXFeedForwardModel"
pred5 <- predict(model5, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum(abs(test.y - pred5[1,])) / length(test.y)
## [1] 0.7056902
For comparison, the built-in mx.symbol.MAERegressionOutput trained with the same settings gives the same result.
lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro")
mx.set.seed(0)
model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y,
                                      ctx = mx.cpu(),
                                      num.round = 20,
                                      array.batch.size = 60,
                                      optimizer = "sgd",
                                      learning.rate = 0.001,
                                      verbose = TRUE,
                                      array.layout = "rowmajor",
                                      batch.end.callback = NULL,
                                      epoch.end.callback = NULL)
## Start training with 1 devices
pred6 <- predict(model6, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum(abs(test.y - pred6[1,])) / length(test.y)
## [1] 0.7056902
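The same symbol operations compose into more elaborate losses. As a final illustration, below is a sketch of a pseudo-Huber loss with delta = 1, which behaves like squared error for small residuals and like absolute error for large ones. The name lro_huber is ours, and the sketch assumes the scalar arithmetic that mxnet overloads for symbols; as with the other MakeLoss networks, real predictions would again come from the fc2_output internal symbol.
# Sketch: pseudo-Huber loss with delta = 1, i.e. sqrt(1 + (pred - label)^2) - 1.
# Assumes mxnet's overloaded scalar arithmetic on symbols.
res <- mx.symbol.Reshape(fc2, shape = 0) - label
lro_huber <- mx.symbol.MakeLoss(mx.symbol.sqrt(1 + mx.symbol.square(res)) - 1,
                                name = "lro_huber")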