Customized loss function
This tutorial provides guidelines for using a customized loss function in network construction.
Model Training Example
Let’s begin with a small regression example. We can build and train a regression model with the following code:
data(BostonHousing, package = "mlbench")
BostonHousing[, sapply(BostonHousing, is.factor)] <-
  as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)]))
BostonHousing <- data.frame(scale(BostonHousing))
test.ind = seq(1, 506, 5) # 1 pt in 5 used for testing
train.x = data.matrix(BostonHousing[-test.ind,-14])
train.y = BostonHousing[-test.ind, 14]
test.x = data.matrix(BostonHousing[test.ind, -14])
test.y = BostonHousing[test.ind, 14]
require(mxnet)
## Loading required package: mxnet
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro")
mx.set.seed(0)
model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y,
                                     ctx = mx.cpu(),
                                     num.round = 5,
                                     array.batch.size = 60,
                                     optimizer = "rmsprop",
                                     verbose = TRUE,
                                     array.layout = "rowmajor",
                                     batch.end.callback = NULL,
                                     epoch.end.callback = NULL)
## Start training with 1 devices
pred <- predict(model, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum((test.y - pred[1,])^2) / length(test.y)
## [1] 0.2485236
Besides LinearRegressionOutput, we also provide LogisticRegressionOutput and MAERegressionOutput. However, these built-in loss functions might not be enough for real-world models. You can provide your own loss function by using mx.symbol.MakeLoss when constructing the network.
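For example, LogisticRegressionOutput is meant for binary targets: it applies a sigmoid to its input and trains against a 0/1 label. The snippet below is only an illustrative sketch of how it could be attached to a network (the Boston Housing target is continuous, so we do not train this one):
# Illustrative sketch only: a one-unit output layer for a binary (0/1) label.
data <- mx.symbol.Variable("data")
fc <- mx.symbol.FullyConnected(data, num_hidden = 1, name = "fc")
lro_logistic <- mx.symbol.LogisticRegressionOutput(fc, name = "lro_logistic")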
How to Use Your Own Loss Function
We still use our previous example, but this time we use mx.symbol.MakeLoss to minimize (pred - label)^2 ourselves.
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
# Reshape(shape = 0) flattens the output of fc2 so it matches the shape of the label.
lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name = "lro2")
Then we can train the network just as usual.
mx.set.seed(0)
model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y,
                                      ctx = mx.cpu(),
                                      num.round = 5,
                                      array.batch.size = 60,
                                      optimizer = "rmsprop",
                                      verbose = TRUE,
                                      array.layout = "rowmajor",
                                      batch.end.callback = NULL,
                                      epoch.end.callback = NULL)
## Start training with 1 devices
Because we are minimizing the same loss function, we should get very similar results. However, the result is quite different.
pred2 <- predict(model2, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum((test.y - pred2)^2) / length(test.y)
## [1] 1.234584
This is because the output of mx.symbol.MakeLoss is the gradient of the loss with respect to the input data. We can get the real prediction as shown below.
internals <- internals(model2$symbol)
fc_symbol <- internals[[match("fc2_output", outputs(internals))]]
model3 <- list(symbol = fc_symbol,
               arg.params = model2$arg.params,
               aux.params = model2$aux.params)
class(model3) <- "MXFeedForwardModel"
pred3 <- predict(model3, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum((test.y - pred3[1,])^2) / length(test.y)
## [1] 0.248294
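Because the same extraction pattern is used again below, it can be wrapped in a small helper. The function get.output.model is our own (hypothetical) name, not part of the mxnet API; it is a minimal sketch assuming a model created by mx.model.FeedForward.create:
# Hypothetical helper (not part of mxnet): build a prediction model whose
# output is a named internal layer of a trained feed-forward model.
get.output.model <- function(model, output.name) {
  internals <- internals(model$symbol)
  out_symbol <- internals[[match(output.name, outputs(internals))]]
  new.model <- list(symbol = out_symbol,
                    arg.params = model$arg.params,
                    aux.params = model$aux.params)
  class(new.model) <- "MXFeedForwardModel"
  new.model
}
With this helper, model3 above is simply get.output.model(model2, "fc2_output").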
We have provided many operations on the symbols. An example of |pred - label| can be found below.
lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label))
mx.set.seed(0)
model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y,
                                      ctx = mx.cpu(),
                                      num.round = 20,
                                      array.batch.size = 60,
                                      optimizer = "sgd",
                                      learning.rate = 0.001,
                                      verbose = TRUE,
                                      array.layout = "rowmajor",
                                      batch.end.callback = NULL,
                                      epoch.end.callback = NULL)
## Start training with 1 devices
internals <- internals(model4$symbol)
fc_symbol <- internals[[match("fc2_output", outputs(internals))]]
model5 <- list(symbol = fc_symbol,
               arg.params = model4$arg.params,
               aux.params = model4$aux.params)
class(model5) <- "MXFeedForwardModel"
pred5 <- predict(model5, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum(abs(test.y - pred5[1,])) / length(test.y)
## [1] 0.7056902
For comparison, the built-in mx.symbol.MAERegressionOutput trained with the same settings gives the same result.
lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro")
mx.set.seed(0)
model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y,
                                      ctx = mx.cpu(),
                                      num.round = 20,
                                      array.batch.size = 60,
                                      optimizer = "sgd",
                                      learning.rate = 0.001,
                                      verbose = TRUE,
                                      array.layout = "rowmajor",
                                      batch.end.callback = NULL,
                                      epoch.end.callback = NULL)
## Start training with 1 devices
pred6 <- predict(model6, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum(abs(test.y - pred6[1,])) / length(test.y)
## [1] 0.7056902
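The same symbol operations compose into more elaborate losses. As a final illustration, below is a sketch of a pseudo-Huber loss with delta = 1, which behaves like squared error for small residuals and like absolute error for large ones. The name lro_huber is ours, and the sketch assumes the scalar arithmetic that mxnet overloads for symbols; as with the other MakeLoss networks, real predictions would again come from the fc2_output internal symbol.
# Sketch: pseudo-Huber loss with delta = 1, i.e. sqrt(1 + (pred - label)^2) - 1.
# Assumes mxnet's overloaded scalar arithmetic on symbols.
res <- mx.symbol.Reshape(fc2, shape = 0) - label
lro_huber <- mx.symbol.MakeLoss(mx.symbol.sqrt(1 + mx.symbol.square(res)) - 1,
                                name = "lro_huber")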