Character-level Language Model using RNN¶
This tutorial will demonstrate creating a language model using a character level RNN model using MXNet-R package. You will need the following R packages to run this tutorial -
- readr
- stringr
- stringi
- mxnet
We will use the tinyshakespeare dataset to build this model.
library("readr")
library("stringr")
library("stringi")
library("mxnet")
Preprocess and prepare the data¶
Download the data:
download.data <- function(data_dir) {
dir.create(data_dir, showWarnings = FALSE)
if (!file.exists(paste0(data_dir,'input.txt'))) {
download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
destfile=paste0(data_dir,'input.txt'), method='wget')
}
}
Next we transform the test into feature vectors that is fed into the RNN model. The make_data
function reads the dataset, cleans it of any non-alphanumeric characters, splits it into individual characters and groups it into sequences of length seq.len
.
make_data <- function(path, seq.len = 32, dic=NULL) {
text_vec <- read_file(file = path)
text_vec <- stri_enc_toascii(str = text_vec)
text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
text_vec <- strsplit(text_vec, '') %>% unlist
if (is.null(dic)) {
char_keep <- sort(unique(text_vec))
} else char_keep <- names(dic)[!dic == 0]
# Remove terms not part of dictionary
text_vec <- text_vec[text_vec %in% char_keep]
# Build dictionary
dic <- 1:length(char_keep)
names(dic) <- char_keep
# reverse dictionary
rev_dic <- names(dic)
names(rev_dic) <- dic
# Adjust by -1 to have a 1-lag for labels
num.seq <- (length(text_vec) - 1) %/% seq.len
features <- dic[text_vec[1:(seq.len * num.seq)]]
labels <- dic[text_vec[1:(seq.len*num.seq) + 1]]
features_array <- array(features, dim = c(seq.len, num.seq))
labels_array <- array(labels, dim = c(seq.len, num.seq))
return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
}
seq.len <- 100
data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic=NULL)
Fetch the features and labels for training the model, and split the data into training and evaluation in 9:1 ratio.
X <- data_prep$features_array
Y <- data_prep$labels_array
dic <- data_prep$dic
rev_dic <- data_prep$rev_dic
vocab <- length(dic)
samples <- tail(dim(X), 1)
train.val.fraction <- 0.9
X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]
X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]
train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))
train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
Create iterators for training and evaluation datasets.
vocab <- length(eval_buckets$dic)
batch.size <- 32
train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size,
data.mask.element = 0, shuffle = TRUE)
eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
data.mask.element = 0, shuffle = FALSE)
Train the Model¶
This model is a multi-layer RNN for sampling from character-level language models. It has a one-to-one model configuration since for each character, we want to predict the next one. For a sequence of length 100, there are also 100 labels, corresponding the same sequence of characters but offset by a position of +1. The parameters output_last_state is set to TRUE in order to access the state of the RNN cells when performing inference.
rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3,
num_hidden = 96,
input_size = vocab,
num_embed = 64,
num_decode = vocab,
dropout = 0.2,
ignore_label = 0,
cell_type = "lstm",
masking = F,
output_last_state = T,
loss_output = "softmax",
config = "one-to-one")
graph.viz(rnn_graph_one_one, type = "graph", direction = "LR",
graph.height.px = 180, shape=c(100, 64))
devices <- mx.cpu()
initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
clip_gradient = 5, rescale.grad = 1/batch.size)
logger <- mx.metric.logger()
epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
batch.end.callback <- mx.callback.log.train.metric(period = 50)
mx.metric.custom_nd <- function(name, feval) {
init <- function() {
c(0, 0)
}
update <- function(label, pred, state) {
m <- feval(label, pred)
state <- c(state[[1]] + 1, state[[2]] + m)
return(state)
}
get <- function(state) {
list(name=name, value = (state[[2]] / state[[1]]))
}
ret <- (list(init = init, update = update, get = get))
class(ret) <- "mx.metric"
return(ret)
}
mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
label <- mx.nd.reshape(label, shape = -1)
label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
batch <- length(label_probs)
NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
Perplexity <- exp(NLL)
return(Perplexity)
})
model <- mx.model.buckets(symbol = rnn_graph_one_one,
train.data = train.data, eval.data = eval.data,
num.round = 20, ctx = devices, verbose = TRUE,
metric = mx.metric.Perplexity,
initializer = initializer, optimizer = optimizer,
batch.end.callback = NULL,
epoch.end.callback = epoch.end.callback)
mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
Start training with 1 devices
[1] Train-Perplexity=13.7040474322178
[1] Validation-Perplexity=7.94617194460922
[2] Train-Perplexity=6.57039815554525
[2] Validation-Perplexity=6.60806110658011
[3] Train-Perplexity=5.65360504501481
[3] Validation-Perplexity=6.18932770630876
[4] Train-Perplexity=5.32547285727298
[4] Validation-Perplexity=6.02198756798859
[5] Train-Perplexity=5.14373631472579
[5] Validation-Perplexity=5.8095658243407
[6] Train-Perplexity=5.03077673487379
[6] Validation-Perplexity=5.72582993567431
[7] Train-Perplexity=4.94453383291536
[7] Validation-Perplexity=5.6445258528126
[8] Train-Perplexity=4.88635290100261
[8] Validation-Perplexity=5.6730024536433
[9] Train-Perplexity=4.84205646230548
[9] Validation-Perplexity=5.50960780230982
[10] Train-Perplexity=4.80441673535513
[10] Validation-Perplexity=5.57002263750006
[11] Train-Perplexity=4.77763413242626
[11] Validation-Perplexity=5.55152143269169
[12] Train-Perplexity=4.74937775290777
[12] Validation-Perplexity=5.44968305351486
[13] Train-Perplexity=4.72824849541467
[13] Validation-Perplexity=5.50889348298234
[14] Train-Perplexity=4.70980846981694
[14] Validation-Perplexity=5.51473225859859
[15] Train-Perplexity=4.69685776886122
[15] Validation-Perplexity=5.45391985233811
[16] Train-Perplexity=4.67837107034824
[16] Validation-Perplexity=5.46636764997829
[17] Train-Perplexity=4.66866961934873
[17] Validation-Perplexity=5.44267086113492
[18] Train-Perplexity=4.65611469144194
[18] Validation-Perplexity=5.4290169469462
[19] Train-Perplexity=4.64614689879405
[19] Validation-Perplexity=5.44221549833917
[20] Train-Perplexity=4.63764001963654
[20] Validation-Perplexity=5.42114250842862
Inference on the Model¶
We now use the saved model to do inference and sample text character by character that will look like the original training data.
set.seed(0)
model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)
internals <- model$symbol$get.internals()
sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)
infer_raw <- c("Thou ")
infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
infer_length <- length(infer_split)
infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),
batch.size = 1, shuffle = FALSE)
infer <- mx.infer.rnn.one(infer.data = infer.data,
symbol = symbol,
arg.params = model$arg.params,
aux.params = model$aux.params,
input.params = NULL,
ctx = devices)
pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
infer$loss_output, axis = 0, begin = infer_length-1, end = infer_length)))
pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
predict <- c(predict, pred)
for (i in 1:200) {
infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),
batch.size = 1, shuffle = FALSE)
infer <- mx.infer.rnn.one(infer.data = infer.data,
symbol = symbol,
arg.params = model$arg.params,
aux.params = model$aux.params,
input.params = list(rnn.state = infer[[2]],
rnn.state.cell = infer[[3]]),
ctx = devices)
pred_prob <- as.numeric(as.array(infer$loss_output))
pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
predict <- c(predict, pred)
}
predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
print(predict_txt_tot)
[1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "