Source code for mxnet.optimizer.adamax
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=W0223
"""Adamax optimizer."""
from __future__ import absolute_import
from ..ndarray import (zeros, clip, maximum, abs as NDabs)
from .optimizer import Optimizer, register
__all__ = ['Adamax']
# pylint: enable=line-too-long
[docs]@register
class Adamax(Optimizer):
"""The AdaMax optimizer.
It is a variant of Adam based on the infinity norm
available at http://arxiv.org/abs/1412.6980 Section 7.
The optimizer updates the weight by::
grad = clip(grad * rescale_grad, clip_gradient) + wd * weight
m = beta1 * m_t + (1 - beta1) * grad
u = maximum(beta2 * u, abs(grad))
weight -= lr / (1 - beta1**t) * m / (u + epsilon)
This optimizer accepts the following parameters in addition to those accepted
by :class:`.Optimizer`.
Parameters
----------
learning_rate : float, default 0.002
The initial learning rate. If None, the optimization will use the
learning rate from ``lr_scheduler``. If not None, it will overwrite
the learning rate in ``lr_scheduler``. If None and ``lr_scheduler``
is also None, then it will be set to 0.01 by default.
beta1 : float, default 0.9
Exponential decay rate for the first moment estimates.
beta2 : float, default 0.999
Exponential decay rate for the second moment estimates.
use_fused_step : bool, default False
Whether or not to use fused kernels for optimizer.
When use_fused_step=False, step is called,
otherwise, fused_step is called.
"""
def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8,
use_fused_step=False, **kwargs):
super(Adamax, self).__init__(learning_rate=learning_rate,
use_fused_step=use_fused_step,
**kwargs)
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
[docs] def create_state(self, index, weight):
return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean
zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance
[docs] def step(self, indices, weights, grads, states):
"""Perform an optimization step using gradients and states.
Parameters
----------
indices : list of int
List of unique indices of the parameters into the individual learning rates
and weight decays. Learning rates and weight decay may be set via `set_lr_mult()`
and `set_wd_mult()`, respectively.
weights : list of NDArray
List of parameters to be updated.
grads : list of NDArray
List of gradients of the objective with respect to this parameter.
states : List of any obj
List of state returned by `create_state()`.
"""
for index, weight, grad, state in zip(indices, weights, grads, states):
self._update_count(index)
lr = self._get_lr(index)
wd = self._get_wd(index)
t = self._index_update_count[index]
lr /= (1. - self.beta1**t)
# preprocess grad
grad *= self.rescale_grad
if self.clip_gradient is not None:
grad = clip(grad, -self.clip_gradient, self.clip_gradient)
grad += wd * weight
# update mean and var
mean, var = state
mean[:] *= self.beta1
mean[:] += (1. - self.beta1) * grad
var[:] = maximum(self.beta2 * var, NDabs(grad))
# update weight
d = mean / (var + self.epsilon)
weight[:] -= lr * d
Did this page help you?
Yes
No
Thanks for your feedback!