github.com/soumith/cudnn.torch.git
author    Natalia Gimelshein <ngimelshein@nvidia.com>    2017-04-26 02:29:24 +0300
committer Natalia Gimelshein <ngimelshein@nvidia.com>    2017-04-26 02:29:24 +0300
commit    39d6c28b1eddb9632c90c18d1e2fc104ebdf259f (patch)
tree      29eaa0dce7a89d74086f297a581c13fe6a7ee72c
parent    4a50d76a79cf432e69e066f0348d085bc2d469c6 (diff)
parent    a418f1e530cee811ab10e97a5388b62a6865aa20 (diff)
Merge branch 'master' into HEAD
 RNN.lua           | 336
 test/test_rnn.lua | 216
 2 files changed, 486 insertions(+), 66 deletions(-)
diff --git a/RNN.lua b/RNN.lua
index 0145da0..f75f505 100644
--- a/RNN.lua
+++ b/RNN.lua
@@ -28,6 +28,7 @@ function RNN:__init(inputSize, hiddenSize, numLayers, batchFirst, dropout, remem
self.batchFirst = batchFirst or false -- Set to true for batch x time x inputdim.
self.rememberStates = rememberStates or false
self.sync = true
+ self.inputPacked = false
self.gradInput = torch.CudaTensor()
self.output = torch.CudaTensor()
self.weight = torch.CudaTensor()
@@ -65,7 +66,8 @@ function RNN:reset(stdv)
self:resetDropoutDescriptor()
self:resetRNNDescriptor()
- self:resetIODescriptors()
+ self:resetInputDescriptor()
+ self:resetOutputDescriptor()
local weightSizePtr = ffi.new("size_t[1]")
errcheck('cudnnGetRNNParamsSize',
@@ -186,28 +188,60 @@ function RNN:resetWeightDescriptor()
)
end
-function RNN:resetIODescriptors()
+function RNN:resetInputDescriptor(input, batchSizes)
self.xDescs = self:createTensorDescriptors(self.seqLength)
- self.yDescs = self:createTensorDescriptors(self.seqLength)
- for i = 0, self.seqLength - 1 do
- local dim = torch.IntTensor({ self.miniBatch,self.inputSize, 1})
- local stride = torch.IntTensor({dim[3] * dim[2], dim[3],1})
- errcheck('cudnnSetTensorNdDescriptor',
- self.xDescs[i],
- self.datatype,
- 3,
- dim:data(),
- stride:data())
+ if self.inputPacked and input ~= nil and batchSizes ~= nil then
+ assert(#batchSizes == self.seqLength)
+ for i = 0, self.seqLength - 1 do
+ -- tensor shape is (# of sequences in the batch at the timestep, inputSize, 1 (for cudnn))
+ local dim = torch.IntTensor({batchSizes[i+1], input:size(2), 1})
+ local stride = torch.IntTensor({dim[3] * dim[2], dim[3],1})
+ errcheck('cudnnSetTensorNdDescriptor',
+ self.xDescs[i],
+ self.datatype,
+ 3,
+ dim:data(),
+ stride:data())
+ end
+ else
+ for i = 0, self.seqLength - 1 do
+ local dim = torch.IntTensor({ self.miniBatch,self.inputSize, 1})
+ local stride = torch.IntTensor({dim[3] * dim[2], dim[3],1})
+ errcheck('cudnnSetTensorNdDescriptor',
+ self.xDescs[i],
+ self.datatype,
+ 3,
+ dim:data(),
+ stride:data())
+ end
+ end
+end
- local dim = torch.IntTensor({self.miniBatch, self.hiddenSize * self.numDirections, 1})
- local stride = torch.IntTensor({dim[3] * dim[2], dim[3],1})
- errcheck('cudnnSetTensorNdDescriptor',
- self.yDescs[i],
- self.datatype,
- 3,
- dim:data(),
- stride:data())
+function RNN:resetOutputDescriptor(output, batchSizes)
+ self.yDescs = self:createTensorDescriptors(self.seqLength)
+ if self.inputPacked and output ~= nil and batchSizes ~= nil then
+ for i = 0, self.seqLength - 1 do
+ local dim = torch.IntTensor({batchSizes[i+1], self.hiddenSize * self.numDirections, 1})
+ local stride = torch.IntTensor({dim[3] * dim[2], dim[3],1})
+ errcheck('cudnnSetTensorNdDescriptor',
+ self.yDescs[i],
+ self.datatype,
+ 3,
+ dim:data(),
+ stride:data())
+ end
+ else
+ for i = 0, self.seqLength - 1 do
+ local dim = torch.IntTensor({self.miniBatch, self.hiddenSize * self.numDirections, 1})
+ local stride = torch.IntTensor({dim[3] * dim[2], dim[3],1})
+ errcheck('cudnnSetTensorNdDescriptor',
+ self.yDescs[i],
+ self.datatype,
+ 3,
+ dim:data(),
+ stride:data())
+ end
end
end
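
For reference, with packed input each per-timestep cuDNN descriptor covers only the sequences that are still active at that timestep. Below is a minimal sketch of the dims and strides the loop above hands to cudnnSetTensorNdDescriptor, assuming illustrative values of inputSize = 3 and batchSizes = {5, 3, 2, 1}; only torch is required to run it.

require 'torch'

local batchSizes = {5, 3, 2, 1}   -- number of active sequences at each timestep
local inputSize  = 3

for i = 0, #batchSizes - 1 do
   -- same layout as the descriptor loop above: (active batch, inputSize, 1)
   local dim    = torch.IntTensor({batchSizes[i + 1], inputSize, 1})
   local stride = torch.IntTensor({dim[3] * dim[2], dim[3], 1})
   print(string.format("timestep %d: dim = %d x %d x %d, stride = %d, %d, %d",
                       i, dim[1], dim[2], dim[3], stride[1], stride[2], stride[3]))
end
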
@@ -265,10 +299,6 @@ function RNN:makeContiguous(input, gradOutput)
return input, gradOutput
end
-function RNN:resizeOutput(tensor)
- return tensor:resize(self.seqLength, self.miniBatch, self.hiddenSize * self.numDirections)
-end
-
function RNN:resizeHidden(tensor)
return tensor:resize(self.numLayers * self.numDirections, self.miniBatch, self.hiddenSize)
end
@@ -292,14 +322,145 @@ function RNN:resetStates()
end
end
+-- input: a TxBx* tensor (or BxTx* if batchFirst) where T is the length
+-- of the longest sequence, B is the batch size, and * is any number of
+-- dimensions.
+--
+-- lengths is a table of sequence lengths, which should be sorted in
+-- decreasing order.
+--
+-- returns a table containing a packed tensor of size (sum of lengths x *)
+-- and a list of batch sizes per timestep, i.e. the number of sequences
+-- with at least timestep elements.
+function RNN:packPaddedSequence(input, lengths, batchFirst)
+ if batchFirst then
+ input = input:transpose(1, 2)
+ end
+ local batches = {}
+ local bszpts = {}
+ local lengthsIdx = #lengths
+ local currentLength = lengths[lengthsIdx]
+ local steps = input:size(1)
+ local bsz = input:size(2)
+ if bsz ~= #lengths then
+      error("lengths array has incorrect size (expected: " .. bsz .. " but found: " .. #lengths .. ")")
+ end
-function RNN:updateOutput(input)
- if (self.batchFirst) then
- input = input:transpose(1, 2)
+ for ts = 1, steps do
+ table.insert(batches, input[ts]:narrow(1, 1, bsz))
+ table.insert(bszpts, bsz)
+
+ while ts == currentLength do
+ if lengthsIdx == 0 then
+ currentLength = nil
+ break
+ else
+ lengthsIdx = lengthsIdx - 1
+ bsz = bsz - 1
+ local nextLength = lengths[lengthsIdx]
+ if currentLength ~= nil and nextLength ~= nil and currentLength > nextLength then
+ error("lengths array has to be sorted in decreasing order")
+ end
+ currentLength = lengths[lengthsIdx]
+ end
+ end
+
+ if currentLength == nil then
+ break
+ end
+ end
+
+ return {torch.cat(batches, 1), bszpts}
+end
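
A usage sketch for packPaddedSequence with made-up sizes (it assumes cudnn.torch and its dependencies are installed; the names and values are illustrative only): three sequences of lengths 4, 2 and 1 are zero-padded into a 4 x 3 x inputSize tensor and packed into a 7 x inputSize tensor plus a batch-sizes-per-timestep list.

require 'cudnn'

local inputSize = 5
-- zero-padded batch: T = 4 timesteps, B = 3 sequences, lengths sorted in decreasing order
local padded = torch.Tensor(4, 3, inputSize):zero()
padded:narrow(1, 1, 4):select(2, 1):uniform()   -- sequence 1: length 4
padded:narrow(1, 1, 2):select(2, 2):uniform()   -- sequence 2: length 2
padded:narrow(1, 1, 1):select(2, 3):uniform()   -- sequence 3: length 1

local packed = cudnn.RNN:packPaddedSequence(padded, {4, 2, 1})
-- packed[1] is a 7 x inputSize tensor (7 = 4 + 2 + 1)
-- packed[2] is {3, 2, 1, 1}: the number of active sequences at each timestep
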
+
+-- The inverse operation to packPaddedSequence(...) above. Takes a sequence (i.e. a
+-- {Tensor, bszpts} table in the format returned by packPaddedSequence) and converts it
+-- back into the TxBx* (or BxTx* if batchFirst) tensor and lengths table.
+function RNN:padPackedSequence(seq, batchFirst)
+ local data, bszpts = unpack(seq)
+ local maxBatchSize = bszpts[1]
+ local outputSize = torch.LongStorage(2 + data[1]:nDimension())
+ outputSize[1] = #bszpts
+ outputSize[2] = maxBatchSize
+ for i = 1, data[1]:nDimension() do
+ outputSize[i + 2] = data[1]:size(i)
+ end
+ local output = torch.Tensor():typeAs(data):resize(outputSize):zero()
+
+ local lengths = {}
+ local offset = 1
+ local pbsz = bszpts[1]
+ local bsz = nil
+
+ local i = 1
+ while i <= #bszpts do
+ bsz = bszpts[i]
+ output[i]:narrow(1, 1, bsz):copy(data:narrow(1, offset, bsz))
+ offset = offset + bsz
+
+ local dec = pbsz - bsz
+ for j = 1, dec do
+ table.insert(lengths, i - 1)
+ end
+ pbsz = bsz
+ i = i + 1
+ end
+ for j = 1, bsz do
+ table.insert(lengths, i - 1)
end
- assert(input:dim() == 3, 'input must have 3 dimensions: seqLength, miniBatch, inputSize')
+
+ -- reverse lengths list
+ local reversed = {}
+ for i = #lengths, 1, -1 do
+ table.insert(reversed, lengths[i])
+ end
+
+ if batchFirst then
+ output = output:transpose(1, 2)
+ end
+ return output, reversed
+end
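
And the inverse direction, a minimal round-trip sketch under the same assumptions: packing and then unpacking returns the original zero-padded tensor together with the sorted lengths.

require 'cudnn'

local padded = torch.Tensor(3, 2, 4):zero()      -- T = 3, B = 2, inputSize = 4
padded:select(2, 1):uniform()                    -- sequence 1: length 3
padded:narrow(1, 1, 1):select(2, 2):uniform()    -- sequence 2: length 1

local seq = cudnn.RNN:packPaddedSequence(padded, {3, 1})
local unpacked, lengths = cudnn.RNN:padPackedSequence(seq)
-- unpacked:equal(padded) is true and lengths is {3, 1}
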
+
+-- it feels a little dirty setting this function on the class as opposed
+-- to having it be functional, but because we need to access class state,
+-- here we are...
+function RNN:deriveOutputSize(input)
+ if self.inputPacked then
+ return torch.LongStorage({input:size(1), self.hiddenSize * self.numDirections})
+ else
+ return torch.LongStorage({self.seqLength, self.miniBatch, self.hiddenSize * self.numDirections})
+ end
+end
+
+-- updateOutput takes either of the following as inputs:
+--
+-- 1. A seqLength x miniBatch x inputSize Tensor, where seqLength is the
+-- length of the sequence for every input in the batch, miniBatch is the
+-- number of elements in the batch, and inputSize is the size of the input vectors
+-- at each time step
+--
+-- OR
+--
+-- 2. A table containing a packed tensor and a list of batch sizes per timestep. In this
+-- case we are supporting variable length sequences for the forward pass. This table
+-- is the output from packPaddedSequence(...) above
+function RNN:updateOutput(input)
+ local inputPacked = (type(input) == 'table')
+ local switched = self.inputPacked ~= inputPacked
+ self.inputPacked = inputPacked
+
+ if self.batchFirst and not self.inputPacked then
+ input = input:transpose(1, 2)
+ end
+
+ if self.inputPacked then
+ assert(input[1]:dim() == 2, 'packed input must have two dimensions: sum(sequence lengths), inputSize')
+ else
+ assert(input:dim() == 3, 'input must have 3 dimensions: seqLength, miniBatch, inputSize')
+ end
+
assert(self.dropout == 0 or cudnn.version >= 5103, 'dropout supported only in cudnn v5.1 and above')
-- Decide which descriptors/tensors need to be updated.
local resetRNN = not self.dropoutDesc or not self.rnnDesc
@@ -307,19 +468,58 @@ function RNN:updateOutput(input)
local resetHC = not self.hxDesc or not self.hyDesc or not self.cxDesc or not self.cyDesc
local resetWeight = not self.wDesc
- if input:size(1) ~= self.seqLength then
- self.seqLength = input:size(1)
- resetIO = true
- end
+ if self.inputPacked then
+ -- Handle resets for packed input
- if input:size(2) ~= self.miniBatch then
- self.miniBatch = input:size(2)
- resetIO = true
- resetHC = true
- end
+      -- In the case of packed inputs, the sequence length is the length of the
+      -- batch-sizes-per-timestep list. We need to reset the IO descriptors if this has changed.
+ if #input[2] ~= self.seqLength then
+ self.seqLength = #input[2]
+ resetIO = true
+ end
- assert(input:size(3) == self.inputSize, 'Incorrect input size!')
+ -- Similarly, the miniBatch "size" is the batch size at the first timestep (when all
+ -- sequences are in the batch, regardless of length). If this has changed then we need
+ -- to reset both the IO descriptors and the hidden/cell descriptors
+ if input[2][1] ~= self.miniBatch then
+ self.miniBatch = input[2][1]
+ resetIO = true
+ resetHC = true
+ end
+ assert(input[1]:size(2) == self.inputSize, 'Incorrect input size!')
+ else
+ -- Handle resets for standard (i.e. not packed) input
+
+      -- If the length of the sequences in this input batch differs from the previous batch,
+      -- we need to reset the IO descriptors to describe the new size of the input and
+      -- output Tensors in the seqLength dimension
+ if input:size(1) ~= self.seqLength then
+ self.seqLength = input:size(1)
+ resetIO = true
+ end
+ -- If the batch size has changed we need to:
+      -- 1. Update the IO descriptors to describe the new size of the input and output Tensors in the
+ -- batchSize dimension
+ -- 2. Reset the size of the hidden/cell descriptors so they can store batchSize states
+ if input:size(2) ~= self.miniBatch then
+ self.miniBatch = input:size(2)
+ resetIO = true
+ resetHC = true
+ end
+ assert(input:size(3) == self.inputSize, 'Incorrect input size!')
+ end
+
+ -- Make sure input is contiguous
+ local x = self:makeContiguous(self.inputPacked and input[1] or input)
+ local oSize = self:deriveOutputSize(x)
+ local oStride = self.inputPacked and
+ torch.LongStorage({oSize[2], 1}) or
+ torch.LongStorage({oSize[2] * oSize[3], oSize[3], 1})
+ self.output:resize(oSize, oStride)
+ local y = self.output
+ local w = self.weight
+ local bszpts = self.inputPacked and input[2]
-- Update descriptors/tensors
if resetRNN then
@@ -327,7 +527,8 @@ function RNN:updateOutput(input)
self:resetRNNDescriptor()
end
if resetIO then
- self:resetIODescriptors(input)
+ self:resetInputDescriptor(x, bszpts)
+ self:resetOutputDescriptor(y, bszpts)
end
if resetHC then
self:resetHiddenDescriptors()
@@ -337,13 +538,6 @@ function RNN:updateOutput(input)
self:resetWeightDescriptor()
end
- local x = self:makeContiguous(input)
- local oSize = torch.LongStorage({self.seqLength, self.miniBatch, self.hiddenSize * self.numDirections})
- local oStride = torch.LongStorage({self.miniBatch * self.hiddenSize * self.numDirections, self.hiddenSize * self.numDirections, 1})
- self.output:resize(oSize, oStride)
- local y = self.output
- local w = self.weight
-
-- Optionally use hiddenInput/cellInput parameters
if self.rememberStates then
if self.hiddenOutput:nDimension() == 3 and self.hiddenOutput:size(1) == self.numLayers * self.numDirections and
@@ -400,6 +594,7 @@ function RNN:updateOutput(input)
local elemSize = self.reserve:elementSize()
reserveSize = math.floor((reserveSize + elemSize - 1) / elemSize)
self.reserve:resize(reserveSize)
+
errcheck('cudnnRNNForwardTraining',
cudnn.getHandle(),
self.rnnDesc[0],
@@ -430,31 +625,36 @@ function RNN:updateOutput(input)
wsSize)
end
if self.sync then cutorch.synchronize() end
- if (self.batchFirst) then
+ if self.batchFirst and not self.inputPacked then
self.output = self.output:transpose(1, 2)
end
return self.output
end
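
A forward-pass sketch tying the two input forms together (illustrative sizes; assumes cudnn.torch and a CUDA device): the table produced by packPaddedSequence is passed straight to updateOutput, and the packed output can be re-padded by pairing it with the same batch-sizes list.

require 'cudnn'

local inputSize, hiddenSize, numLayers = 4, 10, 1
local lstm = cudnn.LSTM(inputSize, hiddenSize, numLayers)

-- padded input: T = 4 timesteps, B = 3 sequences with lengths {4, 2, 1}
local padded = torch.CudaTensor(4, 3, inputSize):uniform()
padded:narrow(1, 3, 2):select(2, 2):zero()   -- pad sequence 2 beyond its length of 2
padded:narrow(1, 2, 3):select(2, 3):zero()   -- pad sequence 3 beyond its length of 1

local packed = cudnn.RNN:packPaddedSequence(padded, {4, 2, 1})
local output = lstm:updateOutput(packed)
-- output is sum(lengths) x hiddenSize = 7 x 10, in the same row order as packed[1]

-- to recover a T x B x hiddenSize view, reuse the batch-sizes list from packing
local paddedOutput, lengths = cudnn.RNN:padPackedSequence({output, packed[2]})
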
function RNN:updateGradInput(input, gradOutput)
- if (self.batchFirst) then
- input = input:transpose(1, 2)
- gradOutput = gradOutput:transpose(1, 2)
- self.output = self.output:transpose(1, 2)
- end
+ if self.batchFirst and not self.inputPacked then
+ input = input:transpose(1, 2)
+ gradOutput = gradOutput:transpose(1, 2)
+ self.output = self.output:transpose(1, 2)
+ end
assert(self.dropout == 0 or cudnn.version >= 5103, 'dropout supported only in cudnn v 5.1 and above')
- assert(input:dim() == 3, 'input should have 3 dimensions: seqLength, miniBatch, inputSize')
- assert(input:size(1) == self.seqLength, 'input has incorrect sequence length!')
- assert(input:size(2) == self.miniBatch, 'input has incorrect minibatch size!')
- assert(input:size(3) == self.inputSize, 'input has incorrect size!')
+
+ if self.inputPacked then
+ assert(input[1]:dim() == 2, 'packed input must have two dimensions: sum(sequence lengths), inputSize')
+ else
+ assert(input:dim() == 3, 'input should have 3 dimensions: seqLength, miniBatch, inputSize')
+ assert(input:size(1) == self.seqLength, 'input has incorrect sequence length!')
+ assert(input:size(2) == self.miniBatch, 'input has incorrect minibatch size!')
+ assert(input:size(3) == self.inputSize, 'input has incorrect size!')
+ end
assert(gradOutput:isSameSizeAs(self.output), 'gradOutput has incorrect size!')
assert(self.train, 'updateGradInput can only be called when training!')
- local x, dy = self:makeContiguous(input, gradOutput)
+ local x, dy = self:makeContiguous(self.inputPacked and input[1] or input, gradOutput)
local y = self.output
local w = self.weight
- local dx = self.gradInput:resizeAs(input)
+ local dx = self.gradInput:resizeAs(self.inputPacked and input[1] or input)
local hx = self.hiddenInput
local cx = self.cellInput
local dhy = self.gradHiddenOutput
@@ -524,7 +724,7 @@ function RNN:updateGradInput(input, gradOutput)
wsPtr, wsSize,
self.reserve:data(), self.reserve:size(1) * self.reserve:elementSize())
if self.sync then cutorch.synchronize() end
- if (self.batchFirst) then
+ if self.batchFirst and not self.inputPacked then
self.gradInput = self.gradInput:transpose(1, 2)
self.output = self.output:transpose(1, 2)
end
@@ -532,7 +732,7 @@ function RNN:updateGradInput(input, gradOutput)
end
function RNN:accGradParameters(input, gradOutput, scale)
- if (self.batchFirst) then
+ if self.batchFirst and not self.inputPacked then
input = input:transpose(1, 2)
gradOutput = gradOutput:transpose(1, 2)
self.output = self.output:transpose(1, 2)
@@ -540,15 +740,19 @@ function RNN:accGradParameters(input, gradOutput, scale)
scale = scale or 1
if scale == 0 then return end
assert(self.dropout == 0 or cudnn.version >= 5103, 'dropout supported only in cudnn 5.1 and above')
- assert(input:dim() == 3, 'input should have 3 dimensions: seqLength, miniBatch, inputSize')
- assert(input:size(1) == self.seqLength, 'input has incorrect sequence length!')
- assert(input:size(2) == self.miniBatch, 'input has incorrect minibatch size!')
- assert(input:size(3) == self.inputSize, 'input has incorrect size!')
+ if self.inputPacked then
+ assert(input[1]:dim() == 2, 'packed input must have two dimensions: sum(sequence lengths), inputSize')
+ else
+ assert(input:dim() == 3, 'input should have 3 dimensions: seqLength, miniBatch, inputSize')
+ assert(input:size(1) == self.seqLength, 'input has incorrect sequence length!')
+ assert(input:size(2) == self.miniBatch, 'input has incorrect minibatch size!')
+ assert(input:size(3) == self.inputSize, 'input has incorrect size!')
+ end
assert(gradOutput:isSameSizeAs(self.output), 'gradOutput has incorrect size!')
assert(self.train, 'accGradParameters can only be called when training!')
- local x, dy = self:makeContiguous(input, gradOutput)
+ local x, dy = self:makeContiguous(self.inputPacked and input[1] or input, gradOutput)
local hx = self.hiddenInput
local y = self.output
local dw = self.gradWeight
@@ -604,7 +808,7 @@ function RNN:accGradParameters(input, gradOutput, scale)
scaleTensor:data())
end
- if (self.batchFirst) then
+ if self.batchFirst and not self.inputPacked then
gradOutput = gradOutput:transpose(1, 2)
self.output = self.output:transpose(1, 2)
end
diff --git a/test/test_rnn.lua b/test/test_rnn.lua
index 63520b6..0372983 100644
--- a/test/test_rnn.lua
+++ b/test/test_rnn.lua
@@ -261,6 +261,222 @@ function getRNNCheckSums(miniBatch, seqLength, hiddenSize, numberOfLayers, numbe
return checkSums
end
+function cudnntest.testPackPadSequences()
+ -- T is 4, B = 5, vector size = 3
+ local input = torch.CudaIntTensor({
+ {{101, 102, 103},
+ {201, 202, 203},
+ {301, 302, 303},
+ {401, 402, 403},
+ {501, 502, 503}},
+ {{104, 105, 106},
+ {204, 205, 206},
+ {304, 305, 306},
+ { 0, 0, 0},
+ { 0, 0, 0}},
+ {{107, 108, 109},
+ {207, 208, 209},
+ { 0, 0, 0},
+ { 0, 0, 0},
+ { 0, 0, 0}},
+ {{110, 111, 112},
+ { 0, 0, 0},
+ { 0, 0, 0},
+ { 0, 0, 0},
+ { 0, 0, 0}},
+ })
+ local lengths = {4, 3, 2, 1, 1}
+
+ local expectedPacked = torch.CudaIntTensor({
+ {101, 102, 103}, {201, 202, 203}, {301, 302, 303}, {401, 402, 403}, {501, 502, 503},
+ {104, 105, 106}, {204, 205, 206}, {304, 305, 306},
+ {107, 108, 109}, {207, 208, 209},
+ {110, 111, 112}
+ })
+ local expectedBSPT = {5, 3, 2, 1}
+
+ local result = cudnn.RNN:packPaddedSequence(input, lengths)
+ local actualPacked, actualBSPT = unpack(result)
+ mytester:assertTensorEq(expectedPacked, actualPacked)
+ mytester:assertTableEq(expectedBSPT, actualBSPT)
+
+ local actualUnpacked, actualLengths = cudnn.RNN:padPackedSequence(result)
+ mytester:assertTensorEq(input, actualUnpacked)
+ mytester:assertTableEq(lengths, actualLengths)
+
+ -- test again with batchFirst
+ input = input:transpose(1, 2)
+
+ local result = cudnn.RNN:packPaddedSequence(input, lengths, true)
+ local actualPacked, actualBSPT = unpack(result)
+ mytester:assertTensorEq(expectedPacked, actualPacked)
+ mytester:assertTableEq(expectedBSPT, actualBSPT)
+
+ local actualUnpacked, actualLengths = cudnn.RNN:padPackedSequence(result, true)
+ mytester:assertTensorEq(input, actualUnpacked)
+ mytester:assertTableEq(lengths, actualLengths)
+end
+
+-- clone the parameters of src into dest, assumes both RNNs were created with
+-- the same options (e.g. same input size, hidden size, layers, etc.)
+local function deepcopyRNN(dest, src)
+ dest.weight = src.weight:clone() -- encompasses W_hh, W_xh etc.
+ dest.gradWeight = src.gradWeight:clone()
+end
+
+function cudnntest.testVariableLengthSequences()
+ local input = torch.CudaTensor({
+ {{1, 2, 2, 1},
+ {2, 1, 2, 2},
+ {1, 1, 1, 2},
+ {2, 2, 2, 1}},
+ {{4, 1, 3, 1},
+ {3, 1, 2, 1},
+ {1, 1, 2, 1},
+ {0, 0, 0, 0}},
+ {{1, 1, 2, 1},
+ {2, 1, 2, 2},
+ {1, 2, 2, 1},
+ {0, 0, 0, 0}},
+ {{1, 2, 1, 1},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0},
+ {0, 0, 0, 0}}
+ })
+
+ -- same as above
+ local indivInputs = {
+ torch.CudaTensor({
+ {{1, 2, 2, 1}},
+ {{4, 1, 3, 1}},
+ {{1, 1, 2, 1}},
+ {{1, 2, 1, 1}},
+ }),
+ torch.CudaTensor({
+ {{2, 1, 2, 2}},
+ {{3, 1, 2, 1}},
+ {{2, 1, 2, 2}},
+ }),
+ torch.CudaTensor({
+ {{1, 1, 1, 2}},
+ {{1, 1, 2, 1}},
+ {{1, 2, 2, 1}},
+ }),
+ torch.CudaTensor({
+ {{2, 2, 2, 1}},
+ }),
+ }
+
+ local lengths = {4, 3, 3, 1}
+ local maxLength = 4
+
+ -- Generate gradOutput based on input sizes
+ local gradOutput = torch.CudaTensor(11, 1, 10):uniform()
+ local indivGradOutputs = {
+ torch.cat({gradOutput:narrow(1, 1, 1), gradOutput:narrow(1, 5, 1), gradOutput:narrow(1, 8, 1), gradOutput:narrow(1, 11, 1)}, 1):clone(),
+ torch.cat({gradOutput:narrow(1, 2, 1), gradOutput:narrow(1, 6, 1), gradOutput:narrow(1, 9, 1)}, 1):clone(),
+ torch.cat({gradOutput:narrow(1, 3, 1), gradOutput:narrow(1, 7, 1), gradOutput:narrow(1, 10, 1)}, 1):clone(),
+ gradOutput:narrow(1, 4, 1):clone()
+ }
+ gradOutput = gradOutput:squeeze()
+
+ local inputSize = 4
+ local hiddenSize = 10
+ local numLayers = 1
+ local batchFirst = false
+ local dropout = false
+ local rememberStates = false
+
+ local lstm = cudnn.LSTM(
+ inputSize,
+ hiddenSize,
+ numLayers,
+ batchFirst,
+ dropout,
+ rememberStates)
+
+ local lstm2 = cudnn.LSTM(
+ inputSize,
+ hiddenSize,
+ numLayers,
+ batchFirst,
+ dropout,
+ rememberStates)
+
+ deepcopyRNN(lstm2, lstm)
+
+ -- Step 1: Pass Sequences as batch and individually, verify weights, outputs
+ -- are the same in both instances
+
+ -- batched
+ local packed = cudnn.RNN:packPaddedSequence(input, lengths)
+ local packedOutput = lstm:updateOutput(packed)
+ local packedHiddenOutput = lstm.hiddenOutput:clone()
+ -- could use padPackedSequence here, but for testing simplicity, we'll just
+ -- operate on the returned results
+
+ local separate = {}
+ local hids = {}
+ local indivGradInputs = {}
+
+ for i, length in ipairs(lengths) do
+ local inp = indivInputs[i]
+ local output = lstm2:updateOutput(inp):clone()
+ table.insert(separate, output)
+ local hid = lstm2.hiddenOutput:clone()
+ table.insert(hids, hid)
+
+ -- need to do backwards pass here too
+ local gradOutput = indivGradOutputs[i]
+ local gradInp = lstm2:updateGradInput(inp, gradOutput):clone()
+ table.insert(indivGradInputs, gradInp)
+ end
+ separate = torch.cat(separate, 1):squeeze()
+ hids = torch.cat(hids, 1):squeeze()
+
+ mytester:asserteq(packedOutput:size(1), separate:size(1))
+ mytester:asserteq(packedOutput:size(2), separate:size(2))
+
+ -- packedOutput has format where all 4 from first batch, then all 3 from
+ -- second batch, etc. while separate has all 4 from first sequence,
+ -- all 3 from next sequence, etc. I manually map the matches here
+ local corresponding = {
+ {1, 1},
+ {2, 5},
+ {3, 8},
+ {4, 11},
+ {5, 2},
+ {6, 6},
+ {7, 9},
+ {8, 3},
+ {9, 7},
+ {10, 10},
+ {11, 4}
+ }
+ for _, pair in ipairs(corresponding) do
+ local sep, batched = unpack(pair)
+ local diff = torch.csub(separate[sep], packedOutput[batched]):abs():sum()
+ mytester:assert(diff < 1e-7)
+ end
+
+ local hdiff = torch.csub(packedHiddenOutput, hids):abs():sum()
+   mytester:assert(hdiff < 1e-7)
+
+ -- Step 2: update grad input as batch and individually
+
+ local packedGradInput = lstm:updateGradInput(packed, gradOutput)
+ local igiTestable = torch.cat(indivGradInputs, 1):squeeze(2)
+
+ for _, pair in ipairs(corresponding) do
+      local sep, batched = unpack(pair)
+ local diff = torch.csub(igiTestable[sep], packedGradInput[batched]):abs():sum()
+ mytester:assert(diff < 1e-7)
+ end
+
+ -- Step 3: Basically verify that accGradParameters works for batch
+ lstm:accGradParameters(packed, gradOutput)
+end
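
The hand-written corresponding table above can also be derived from the batch sizes per timestep; here is a hedged helper sketch (not part of the test suite) exploiting the fact that packed rows are grouped timestep by timestep:

-- Row index into the packed tensor for sequence b at timestep t, given the
-- batch-sizes-per-timestep list bszpts (illustrative helper, unused by the tests).
local function packedIndex(bszpts, t, b)
   local offset = 0
   for i = 1, t - 1 do
      offset = offset + bszpts[i]
   end
   return offset + b
end

-- e.g. with bszpts = {4, 3, 3, 1} (from lengths {4, 3, 3, 1} above):
-- packedIndex(bszpts, 1, 1) == 1, packedIndex(bszpts, 2, 1) == 5,
-- packedIndex(bszpts, 4, 1) == 11, matching the corresponding table.
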
+
mytester = torch.Tester()
mytester:add(cudnntest)
mytester:run()