Char-RNN¶
In this tutorial, we will build a char-rnn model for natural language generation. The training text is tokenized as a sequence of characters. After training, the model outputs a probability distribution over the alphabet given the characters read so far, thereby "predicting" the next character. Iterating this process generates text snippets.
Char-RNN processes text sequences of arbitrary length, and its loss function makes use of ordinary Scala control-flow features during the training phase. It is therefore an instance of a dynamic neural network.
This implementation of Char-RNN is inspired by Andrej Karpathy's excellent blog post The Unreasonable Effectiveness of Recurrent Neural Networks and the accompanying Python/numpy implementation.
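More precisely, the network models the conditional distribution of each character given the prefix before it, so the probability of a whole string factorizes by the chain rule:

$$p(x_1, \dots, x_T) = \prod_{t=1}^{T} p(x_t \mid x_1, \dots, x_{t-1})$$

Training minimizes the negative log of these conditionals, and generation repeatedly feeds the predicted character back in as the next input.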
Importing dependencies¶
In [1]:
import $ivy.`org.nd4j:nd4j-native-platform:0.8.0`
import $ivy.`com.thoughtworks.deeplearning::plugins-builtins:2.0.0`
import $ivy.`org.plotly-scala::plotly-jupyter-scala:0.3.2`
import scala.math
import collection.immutable.IndexedSeq
import scala.io.Source
import scala.concurrent.ExecutionContext.Implicits.global
import scalaz.concurrent.Task
import scalaz.std.iterable._
import scalaz.syntax.all._
import com.thoughtworks.future._
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.ops.transforms.Transforms
import org.nd4j.linalg.api.ops.impl.indexaccum.IMax
import com.thoughtworks.deeplearning.plugins.DoubleLiterals
import com.thoughtworks.deeplearning.plugins.INDArrayLiterals
import com.thoughtworks.deeplearning.plugins.CumulativeDoubleLayers
import com.thoughtworks.deeplearning.plugins.DoubleTraining
import com.thoughtworks.deeplearning.plugins.CumulativeINDArrayLayers
import com.thoughtworks.deeplearning.plugins.INDArrayWeights
import com.thoughtworks.deeplearning.plugins.Operators
import com.thoughtworks.deeplearning.plugins.Logging
import com.thoughtworks.deeplearning.plugins.Builtins
import com.thoughtworks.feature.Factory
import plotly._
import plotly.element._
import plotly.layout._
import plotly.JupyterScala._
Out[1]:
Preparing the corpus, setting up plugins & parameters¶
In [2]:
// The training corpus: a tiny string keeps training fast for the tutorial.
val data = "DeepLearning.scala"
val dataSize = data.size
// Bidirectional mappings between characters and vocabulary indices.
val ixToChar = data.toSet.toArray
val charToIx = (for (i <- ixToChar.indices) yield (ixToChar(i), i)).toMap
val vocabSize = ixToChar.size
// Encode a character as a one-hot column vector of shape (vocabSize, 1).
def oneOfK(c: Char) = Nd4j.zeros(vocabSize, 1).putScalar(charToIx(c), 1)
Out[2]:
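As a quick sanity check (a hypothetical snippet, not part of the original notebook): "DeepLearning.scala" contains 13 distinct characters, so oneOfK('D') is a 13 × 1 column vector with a single 1 at row charToIx('D'):

// Hypothetical usage example for oneOfK.
val v = oneOfK('D')
assert(v.getDouble(charToIx('D'), 0) == 1.0) // the hot entry
assert(v.sumNumber.doubleValue == 1.0)       // all other entries are zero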
In [3]:
// Plugin that scales every raw gradient by a constant learning rate.
trait LearningRate extends INDArrayWeights {
  val learningRate: Double
  trait INDArrayOptimizerApi extends super.INDArrayOptimizerApi { this: INDArrayOptimizer =>
    override def delta: INDArray = super.delta mul learningRate
  }
  override type INDArrayOptimizer <: INDArrayOptimizerApi with Optimizer
}

// Plugin implementing the Adagrad rule: keep a per-weight cache of squared
// gradients and divide each new gradient by the square root of that cache.
trait Adagrad extends INDArrayWeights {
  val eps: Double
  trait INDArrayWeightApi extends super.INDArrayWeightApi { this: INDArrayWeight =>
    var cache: Option[INDArray] = None
  }
  override type INDArrayWeight <: INDArrayWeightApi with Weight

  trait INDArrayOptimizerApi extends super.INDArrayOptimizerApi { this: INDArrayOptimizer =>
    private lazy val deltaLazy: INDArray = {
      import org.nd4s.Implicits._
      import weight._
      val delta0 = super.delta
      cache = Some(cache.getOrElse(Nd4j.zeros(delta0.shape: _*)) + delta0 * delta0)
      delta0 / (Transforms.sqrt(cache.get) + eps)
    }
    override def delta = deltaLazy
  }
  override type INDArrayOptimizer <: INDArrayOptimizerApi with Optimizer
}
Out[3]:
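For reference, the update these two plugins compose is the standard Adagrad rule with a constant learning rate $\eta$ (learningRate) and stabilizer $\epsilon$ (eps):

$$c_t = c_{t-1} + g_t^2, \qquad \Delta_t = \eta \, \frac{g_t}{\sqrt{c_t} + \epsilon}$$

where $g_t$ is the raw gradient of a weight and $c_t$ is the cache accumulated in the code above. Dividing by $\sqrt{c_t} + \epsilon$ shrinks the step for weights that have already received large gradients.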
In [5]:
interp.load("""
  val hyperparameters = Factory[Adagrad with LearningRate with Builtins].newInstance(learningRate = 0.05, eps = 1e-8)
""")
In [6]:
import hyperparameters.INDArrayWeight
import hyperparameters.DoubleLayer
import hyperparameters.INDArrayLayer
import hyperparameters.implicits._
Out[6]:
In [7]:
val hiddenSize = 100
val seqLength = 25
// Weights start as small random normal values; biases start at zero.
val wxh = {
  import org.nd4s.Implicits._
  INDArrayWeight(Nd4j.randn(hiddenSize, vocabSize) * 0.01)
}
val whh = {
  import org.nd4s.Implicits._
  INDArrayWeight(Nd4j.randn(hiddenSize, hiddenSize) * 0.01)
}
val why = {
  import org.nd4s.Implicits._
  INDArrayWeight(Nd4j.randn(vocabSize, hiddenSize) * 0.01)
}
val bh = INDArrayWeight(Nd4j.zeros(hiddenSize, 1))
val by = INDArrayWeight(Nd4j.zeros(vocabSize, 1))
Out[7]:
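A quick shape check: wxh is hiddenSize × vocabSize, so wxh.dot(x) maps a vocabSize × 1 one-hot input into the hiddenSize × 1 hidden space; whh is hiddenSize × hiddenSize and preserves that shape; why maps the hidden state back to vocabSize × 1 output scores. The 0.01 factor keeps the initial pre-activations small, inside the near-linear region of tanh.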
Implementing the neural network¶
In [8]:
// tanh built from the differentiable exp provided by the plugins.
def tanh(x: INDArrayLayer): INDArrayLayer = {
  val exp_x = hyperparameters.exp(x)
  val exp_nx = hyperparameters.exp(-x)
  (exp_x - exp_nx) / (exp_x + exp_nx)
}
Out[8]:
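This builds $\tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$ out of the differentiable exp supplied by the plugins, so gradients flow through it automatically. The naive formula can overflow for inputs of large magnitude; that is harmless here because the weights are initialized small.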
In [9]:
// One time step of the RNN: given a one-hot input x, a one-hot target y and
// the previous hidden state, return the cross-entropy loss, the softmax
// output distribution and the next hidden state.
def charRNN(x: INDArray, y: INDArray, hprev: INDArrayLayer): (DoubleLayer, INDArrayLayer, INDArrayLayer) = {
  val hnext = tanh(wxh.dot(x) + whh.dot(hprev) + bh)
  val yraw = why.dot(hnext) + by
  val yraw_exp = hyperparameters.exp(yraw)
  val prob = yraw_exp / yraw_exp.sum
  val loss = -hyperparameters.log((prob * y).sum)
  (loss, prob, hnext)
}
Out[9]:
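In equations, a single step of charRNN computes

$$h_t = \tanh(W_{xh} x_t + W_{hh} h_{t-1} + b_h)$$

$$p_t = \operatorname{softmax}(W_{hy} h_t + b_y), \qquad L_t = -\log p_t[y_t]$$

where $x_t$ and $y_t$ are the one-hot encodings of the current and next character. Because $y_t$ is one-hot, the inner product (prob * y).sum in the code picks out exactly $p_t[y_t]$.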
In [11]:
// Pair every character with its successor, then split into training batches.
val batches = data.zip(data.tail).grouped(seqLength).toVector
type WithHiddenLayer[A] = (A, INDArrayLayer)
type Batch = IndexedSeq[(Char, Char)]
type Losses = Vector[Double]

// Fold charRNN over one batch, summing the per-character losses and
// threading the hidden state from step to step.
def singleBatch(batch: WithHiddenLayer[Batch]): WithHiddenLayer[DoubleLayer] =
  batch match {
    case (batchseq, hprev) =>
      batchseq.foldLeft((DoubleLayer(0.0.forward), hprev)) {
        (bstate: WithHiddenLayer[DoubleLayer], xy: (Char, Char)) =>
          (bstate, xy) match {
            case ((tot, localhprev), (x, y)) =>
              charRNN(oneOfK(x), oneOfK(y), localhprev) match {
                case (localloss, _, localhnext) =>
                  (tot + localloss, localhnext)
              }
          }
      }
  }

def initH = INDArrayLayer(Nd4j.zeros(hiddenSize, 1).forward)

// One pass over all batches: train on each accumulated batch loss, and
// append an exponentially smoothed loss value for plotting.
def singleRound(initprevloss: Losses): Future[Losses] =
  (batches.foldLeftM((initprevloss, initH)) {
    (bstate: WithHiddenLayer[Losses], batch: Batch) =>
      bstate match {
        case (prevloss, hprev) =>
          singleBatch((batch, hprev)) match {
            case (bloss, hnext) =>
              bloss.train.map { (blossval: Double) =>
                val nloss = prevloss.last * 0.999 + blossval * 0.001
                (prevloss :+ nloss, hnext)
              }
          }
      }
  }).map {
    (fstate: WithHiddenLayer[Losses]) =>
      fstate match {
        case (floss, _) => floss
      }
  }

// Repeat for 2048 rounds, seeding the smoothed loss with the loss of a
// uniform prediction over the vocabulary.
def allRounds: Future[Losses] =
  (0 until 2048).foldLeftM(Vector(-math.log(1.0 / vocabSize) * seqLength)) {
    (ploss: Losses, round: Int) => singleRound(ploss)
  }
Out[11]:
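Note that the values accumulated here are not raw batch losses but an exponential moving average,

$$\text{smooth}_k = 0.999 \cdot \text{smooth}_{k-1} + 0.001 \cdot \text{loss}_k,$$

seeded with the expected loss of a uniform predictor over one batch, $-\log(1/V) \cdot \text{seqLength}$. With $V = 13$ distinct characters and seqLength = 25, the curve therefore starts near $25 \ln 13 \approx 64.1$.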
Training the model and using it to generate text¶
In [12]:
// Block the notebook until the Future completes.
def unsafePerformFuture[A](f: Future[A]): A = Await.result(f.toScalaFuture, Duration.Inf)
val losses = unsafePerformFuture(allRounds)
plotly.JupyterScala.init()
Scatter(losses.indices, losses).plot(title = "Smooth loss by time")
Out[12]:
In [13]:
// Index of the most probable character (greedy argmax decoding).
def genIdx(v: INDArray): Int = Nd4j.getExecutioner().execAndReturn(new IMax(v)).getFinalResult()

// Generate n characters after the seed, feeding each prediction back in as
// the next input. During generation the loss is discarded, so the target
// argument of charRNN is unused and we simply pass x for it.
def generate(seed: Char, n: Int): Future[String] =
  ((0 until n).foldLeftM((seed.toString, initH)) {
    (st: (String, INDArrayLayer), i: Int) =>
      st match {
        case (tot, hprev) =>
          val x = oneOfK(tot.last)
          charRNN(x, x, hprev) match {
            case (_, prob, hnext) =>
              prob.predict.flatMap { (probv: INDArray) =>
                val nidx = genIdx(probv)
                val nc = ixToChar(nidx)
                Future.now((tot + nc.toString, hnext))
              }
          }
      }
  }).map { (st: (String, INDArrayLayer)) =>
    st match {
      case (r, _) => r
    }
  }
Out[13]:
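Because genIdx always picks the most probable character, generate is deterministic: the same seed always yields the same string. Karpathy's original implementation instead samples the next character from prob. A minimal sketch of that alternative (sampleIdx is a hypothetical helper, not part of the original notebook):

// Hypothetical alternative to genIdx: draw an index from the categorical
// distribution held in a (vocabSize, 1) column of probabilities.
def sampleIdx(v: INDArray): Int = {
  val r = scala.util.Random.nextDouble()
  var acc = 0.0
  var i = 0
  // Walk the cumulative distribution until it exceeds the random draw.
  while (i < vocabSize - 1 && acc + v.getDouble(i, 0) < r) {
    acc += v.getDouble(i, 0)
    i += 1
  }
  i
}

Swapping sampleIdx in for genIdx inside generate would produce different text on each run.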
In [14]:
unsafePerformFuture(generate('D', 128))
Out[14]: