implement EAMSGD in word-rnn - cluster-rnn - a distributed Torch7 RNN cluster over MPI

commit daa6e794ac441e478a3efa2b7794ea1d0d2451d4
parent 4f4198ba67f6ea53b20bf19baba0aebb4fb9fa58
Author: umhau <umhau@users.noreply.github.com>
Date:   Tue, 14 Feb 2017 17:53:57 -0500

implement EAMSGD in word-rnn

Diffstat:
A asyncsgd/codeblock.lua  | 27 +++++++++++++++++++++++++++
M asyncsgd/goot.lua  | 1 +
D asyncsgd/mlaunch.lua  | 119 -------------------------------------------------------------------------------
D asyncsgd/optim-eamsgd.lua  | 74 --------------------------------------------------------------------------
R asyncsgd/init.lua -> init.lua  | 0 
A mlaunch.lua  | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A optim-eamsgd.lua  | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R asyncsgd/pclient.lua -> pclient.lua  | 0 
R asyncsgd/pserver.lua -> pserver.lua  | 0 
M train.lua  | 23 +++++++++++++++++++++++

10 files changed, 245 insertions(+), 193 deletions(-)
diff --git a/asyncsgd/codeblock.lua b/asyncsgd/codeblock.lua
@@ -0,0 +1,26 @@
+-------------------------------------------------------------------------------
+-- Author: umhau (umhau@alum.gcc.edu)
+-------------------------------------------------------------------------------
+
+
+
+-- NOTES ----------------------------------------------------------------------
+
+
+
+-- MPI SETTINGS ---------------------------------------------------------------
+-- most of these are set in the mlaunch file.  These are mostly duplicates.
+
+local state = state or {} -- reuse an existing global `state` table if there is one
+local mb = opt.mb or 128
+
+require 'optim'
+
+opti = optim.eamsgd
+state.optim = { -- keys must match the config fields that optim.eamsgd reads
+    learningRate = lr,
+    pclient = pclient,
+    communicationPeriod = su,
+    movingRateAlpha = mva,
+    momentum = mom,
+}
+\ No newline at end of file
diff --git a/asyncsgd/goot.lua b/asyncsgd/goot.lua
@@ -10,6 +10,7 @@
 local opt = opt or {}
 
 -- location of training data
+-- looks like it's designed to continue processing data after an interruption.
 local data_root = opt.data_root or
    io.popen('echo $HOME'):read() .. '/data/torch7/mnist10'
 
diff --git a/asyncsgd/mlaunch.lua b/asyncsgd/mlaunch.lua
@@ -1,119 +0,0 @@
--- mpi launch
--- Author: Sixin Zhang (zsx@cims.nyu.edu)
--- Author: Umhau (umhau@alum.gcc.edu)
--- mpirun -n 12 luajit mlaunch.lua
-
---[[ NOTES --------------------------------------------------------------------
-
-This script is used to launch mpi.  The user's script goes at the bottom, where
-goot.lua has been referenced.   Edit this file only to change the variables 
-noted.
-
-MPI is configured so this script will be running separately on each available 
-core on each available machine in the cluster - so the 'ranks' below will range
-from 0-7 if there are two machines with 4 CPU cores each.  I don't have any 
-GPUs, so I can't speak to how those are presented.
-
---]]
-
--- VARIABLES ------------------------------------------------------------------
-
-local oncuda = false -- Set for working with CPUs. Change this if using GPUs.
-local torchfile = 'goot.lua' -- name of torch file to run with MPI
-local iterations = 10 -- i.e., epochs.  don't need that many for testing.
-
--- there's other EAMSGD variables that can be tuned below. I'll do that later.
-
--- GPU SETTINGS ---------------------------------------------------------------
-
-local AGPU = nil
-if oncuda then
-   require 'cutorch'
-   AGPU = {1,2,3,4,5,6} -- use the first 6 gpus on each machine
-end
-
-local gpuid = -1
-
--- MPI CONFIGURATION ----------------------------------------------------------
-
-dofile('init.lua')
-mpiT.Init()
-
-local world = mpiT.COMM_WORLD
-local rank = mpiT.get_rank(world)
-local size = mpiT.get_size(world)
-
-local conf = {}
-conf.rank = rank
-conf.world = world
-conf.sranks = {}
-conf.cranks = {}
-for i = 0,size-1 do
-   if math.fmod(i,2)==0 then
-      table.insert(conf.sranks,i)
-   else
-      table.insert(conf.cranks,i)
-   end
-end
-
-opt = {}
---[[ -- delete this stuff later.
-opt.name = 'downpour'
-opt.lr = 1e-4
-opt.su = 1
---]]
-opt.name = 'eamsgd' -- using most efficient optimizer
---opt.lr = 1e-1
-opt.su = 100
-opt.mva = 0.9/6 -- this is \beta/p when p=6
-opt.lr = 1e-2 -- order of magnitude from the other - what's the difference?
-opt.mom = 0.99
-
-opt.maxepoch = iterations
-
--- determine if the current node should be server or client. Seems like there
--- should be more clients than servers...investigate later.  (change the '2'?)
-if math.fmod(rank,2)==0 then
-   -- if the rank # is even, it's a server
-   print('[server] rank',rank,'use cpu')
-   torch.setdefaulttensortype('torch.FloatTensor')  
-   local ps = pServer(conf)
-   ps:start()
-
-else
-   -- if node rank # is odd, it's a client.  This means we have to choose how 
-   -- to process the metric $#!7-ton of data that's going to be directed this 
-   -- way.  So, this is where we configure our GPUs or CPUs.  
-
-   if AGPU then
-      -- if not nil, GPUs are enabled
-      require 'cunn'
-      -- use CUDA
-      local gpus = cutorch.getDeviceCount()
-      -- how many GPUs available on this machine?
-      gpuid = AGPU[(rank%(size/2)) % gpus + 1]
-      -- use the node's rank to set the ID of each(?) GPU
-      cutorch.setDevice(gpuid)
-      print('[client] rank ' .. rank .. ' use gpu ' .. gpuid)
-      torch.setdefaulttensortype('torch.CudaTensor')
-
-   else
-      -- if the GPU flag is set FALSE, we're using CPUs
-      print('[client] rank ' .. rank .. ' use cpu')
-      torch.setdefaulttensortype('torch.FloatTensor')
-   end
-
-   -- done with configuring the processors.  These are settings specific to the
-   -- node at hand, now that we know exactly what it's going to be doing.
-   opt.gpuid = gpuid       -- Tell the optimizer if GPUs are available.
-   opt.pc = pClient(conf)  -- MPI settings for communicating with the other nodes.
-   opt.rank = rank         -- Simple access to the node number.  
-
-   -- Time to run the training algorithm.  This is not an arbitrary script,
-   -- and must contain some cruicial settings.
-   dofile(torchfile)
-
-end
-
--- clean up the MPI communication channels.
-mpiT.Finalize()
diff --git a/asyncsgd/optim-eamsgd.lua b/asyncsgd/optim-eamsgd.lua
@@ -1,74 +0,0 @@
--- Async EASGD/EAMSGD
--- Author: Sixin Zhang (zsx@cims.nyu.edu)
--- when mom==0, it is the easgd
-require 'optim'
-
-function optim.eamsgd(opfunc, w, config, state)
-   local config = config or {}   
-   local state = state or config
-  
-   local lr = config.lr or 0   -- learning rate \eta
-   local lrd = config.lrd or 0 -- learning rate decay
-   local lrp = config.lrp or 0 -- learning rate decay power
-   local mom = config.mom or 0 -- momentum term \delta
-   local l2wd = config.l2wd or 0
-
-   local pc = config.pclient or nil
-   local mva = config.mva or 0 -- moving rate \alpha
-   local su = config.su or 1   -- comm period \tau
-
-   state.pversion = state.pversion or 0
-   state.dusync = state.dusync or 0
-
-   local fx,dfdx
-   local function localupdate()
-      if lr ~= 0 then
-	 if mom > 0 then
-	    if not state.vt then
-	       state.vt = w:clone():zero()
-	    end
-	    state.vt:mul(mom)
-	    w:add(state.vt)
-	 end	 	 
-	 fx,dfdx = opfunc(w)
-	 if l2wd ~= 0 then dfdx:add(l2wd, w) end	 
-	 local clr = lr
-	 if lrd ~= 0 and lrp > 0 then 
-	    clr = lr / math.pow(1+state.pversion*lrd,lrp)
-	 end
-	 w:add(-clr,dfdx)
-	 if mom > 0 then
-	    state.vt:add(-clr,dfdx)
-	 end
-	 state.pversion = state.pversion + 1	         
-      end
-   end
-   
-   if (pc and su>0 and mva>0) then
-      if (state.pversion%su == 0) then
-	 if not config.suw then -- need 2 copies
-	    config.suw = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
-	    config.sug = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
-	    pc:reset(config.suw,config.sug)
-	 end
-	 pc:async_recv_param() -- suw=w*
-	 local synctime = sys.clock()
-	 pc:wait() -- sug is sent and suw is recv
-	 state.dusync = state.dusync + sys.clock()-synctime
-	 config.sug:copy(w) -- sug=w
-	 config.sug:add(-1,config.suw) -- sug=w-w*
-	 config.sug:mul(mva) -- sug=mva*(w-w*)
-	 pc:async_send_grad() -- apply w*=w*+mva*(w-w*)
-	 local synctime = sys.clock()
-	 pc:ping() -- overlap aio and computation
-	 state.dusync = state.dusync + sys.clock()-synctime
-	 localupdate()
-	 w:add(-1,config.sug) -- w=w+mva*(w*-w)
-      else
-	 localupdate()
-      end
-   else
-      assert(false)
-   end
-   return w,{fx}
-end
diff --git a/asyncsgd/init.lua b/init.lua
diff --git a/mlaunch.lua b/mlaunch.lua
@@ -0,0 +1,119 @@
+-- mpi launch
+-- Author: Sixin Zhang (zsx@cims.nyu.edu)
+-- Author: Umhau (umhau@alum.gcc.edu)
+-- mpirun -n 12 luajit mlaunch.lua
+
+--[[ NOTES --------------------------------------------------------------------
+
+This script is used to launch mpi.  The user's script goes at the bottom, where
+goot.lua has been referenced.   Edit this file only to change the variables 
+noted.
+
+MPI is configured so this script will be running separately on each available 
+core on each available machine in the cluster - so the 'ranks' below will range
+from 0-7 if there are two machines with 4 CPU cores each.  I don't have any 
+GPUs, so I can't speak to how those are presented.
+
+--]]
+
+-- VARIABLES ------------------------------------------------------------------
+
+local oncuda = false -- Set for working with CPUs. Change this if using GPUs.
+local torchfile = 'goot.lua' -- name of torch file to run with MPI
+local iterations = 10 -- i.e., epochs.  don't need that many for testing.
+
+-- there are other EAMSGD variables that can be tuned below. I'll do that later.
+
+-- GPU SETTINGS ---------------------------------------------------------------
+
+local AGPU = nil
+if oncuda then
+   require 'cutorch'
+   AGPU = {1,2,3,4,5,6} -- use the first 6 gpus on each machine
+end
+
+local gpuid = -1
+
+-- MPI CONFIGURATION ----------------------------------------------------------
+
+dofile('init.lua')
+mpiT.Init()
+
+local world = mpiT.COMM_WORLD
+local rank = mpiT.get_rank(world)
+local size = mpiT.get_size(world)
+
+local conf = {}
+conf.rank = rank
+conf.world = world
+conf.sranks = {}
+conf.cranks = {}
+for i = 0,size-1 do
+   if math.fmod(i,2)==0 then
+      table.insert(conf.sranks,i)
+   else
+      table.insert(conf.cranks,i)
+   end
+end
+
+opt = {}
+--[[ -- delete this stuff later.
+opt.name = 'downpour'
+opt.lr = 1e-4
+opt.su = 1
+--]]
+opt.name = 'eamsgd' -- using most efficient optimizer
+--opt.lr = 1e-1
+opt.communicationPeriod = 100
+opt.movingRateAlpha = 0.9/6 -- this is \beta/p when p=6
+opt.learningRate = 1e-1 --1e-2 -- order of magnitude from the other - what's the difference?
+opt.momentum = 0.99
+
+opt.maxepoch = iterations
+
+-- determine if the current node should be server or client. Seems like there
+-- should be more clients than servers...investigate later.  (change the '2'?)
+if math.fmod(rank,2)==0 then
+   -- if the rank # is even, it's a server
+   print('[server] rank',rank,'use cpu')
+   torch.setdefaulttensortype('torch.FloatTensor')  
+   local ps = pServer(conf)
+   ps:start()
+
+else
+   -- if node rank # is odd, it's a client.  This means we have to choose how 
+   -- to process the metric $#!7-ton of data that's going to be directed this 
+   -- way.  So, this is where we configure our GPUs or CPUs.  
+
+   if AGPU then
+      -- if not nil, GPUs are enabled
+      require 'cunn'
+      -- use CUDA
+      local gpus = cutorch.getDeviceCount()
+      -- how many GPUs available on this machine?
+      gpuid = AGPU[(rank%(size/2)) % gpus + 1]
+      -- use the node's rank to set the ID of each(?) GPU
+      cutorch.setDevice(gpuid)
+      print('[client] rank ' .. rank .. ' use gpu ' .. gpuid)
+      torch.setdefaulttensortype('torch.CudaTensor')
+
+   else
+      -- if the GPU flag is set FALSE, we're using CPUs
+      print('[client] rank ' .. rank .. ' use cpu')
+      torch.setdefaulttensortype('torch.FloatTensor')
+   end
+
+   -- done with configuring the processors.  These are settings specific to the
+   -- node at hand, now that we know exactly what it's going to be doing.
+   opt.gpuid = gpuid       -- Tell the optimizer if GPUs are available.
+   opt.pclient = pClient(conf)  -- MPI settings for communicating with the other nodes.
+   opt.rank = rank         -- Simple access to the node number.  
+
+   -- Time to run the training algorithm.  This is not an arbitrary script,
+   -- and must contain some crucial settings.
+   dofile(torchfile)
+
+end
+
+-- clean up the MPI communication channels.
+mpiT.Finalize()
diff --git a/optim-eamsgd.lua b/optim-eamsgd.lua
@@ -0,0 +1,75 @@
+-- Async EASGD/EAMSGD
+-- Author: Sixin Zhang (zsx@cims.nyu.edu)
+-- Author: umhau (umhau@alum.gcc.edu)
+-- when mom==0, it is the easgd
+require 'optim'
+
+--- Async EASGD/EAMSGD step (EASGD when momentum == 0); updates w in place.
+-- opfunc(w) must return the loss and its gradient; config carries the
+-- hyper-parameters plus the pclient; state (defaults to config) persists
+-- pversion, dusync and vt between calls.  Returns w and {fx}.
+function optim.eamsgd(opfunc, w, config, state)
+   local config = config or {}
+   local state = state or config
+
+   local lr = config.learningRate or 0            -- learning rate \eta
+   local lrd = config.learningRateDecay or 0      -- learning rate decay
+   local lrp = config.learningRateDecayPower or 0 -- learning rate decay power
+   local mom = config.momentum or 0               -- momentum term \delta
+   local l2wd = config.l2wd or 0                  -- weight of the l2wd*w gradient term
+
+   local pc = config.pclient                      -- client handle used for all pc:* calls
+   local mva = config.movingRateAlpha or 0        -- moving rate \alpha
+   local su = config.communicationPeriod or 1     -- comm period \tau
+
+   state.pversion = state.pversion or 0 -- count of local updates performed
+   state.dusync = state.dusync or 0     -- accumulated time spent in pc:wait()/pc:ping()
+   -- Elastic averaging needs a client and positive \tau and \alpha: fail loudly.
+   if not (pc and su > 0 and mva > 0) then
+      error('optim.eamsgd requires config.pclient, communicationPeriod > 0 and movingRateAlpha > 0')
+   end
+
+   local fx, dfdx
+   local function localupdate() -- one local (momentum) SGD step on w
+      if lr == 0 then return end -- nothing to do; pversion is left unchanged
+      if mom > 0 then
+         state.vt = state.vt or w:clone():zero()
+         state.vt:mul(mom)
+         w:add(state.vt) -- move along the decayed velocity before evaluating opfunc
+      end
+      fx, dfdx = opfunc(w)
+      if l2wd ~= 0 then dfdx:add(l2wd, w) end
+      local clr = lr
+      if lrd ~= 0 and lrp > 0 then
+         clr = lr / math.pow(1 + state.pversion * lrd, lrp)
+      end
+      w:add(-clr, dfdx)
+      if mom > 0 then state.vt:add(-clr, dfdx) end
+      state.pversion = state.pversion + 1
+   end
+
+   -- Every su local updates, exchange with the server around the local step.
+   if state.pversion % su == 0 then
+      if not config.suw then -- need 2 copies
+         config.suw = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
+         config.sug = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
+         pc:reset(config.suw, config.sug)
+      end
+      pc:async_recv_param() -- suw=w*
+      local synctime = sys.clock()
+      pc:wait() -- sug is sent and suw is recv
+      state.dusync = state.dusync + sys.clock() - synctime
+      config.sug:copy(w) -- sug=w
+      config.sug:add(-1, config.suw) -- sug=w-w*
+      config.sug:mul(mva) -- sug=mva*(w-w*)
+      pc:async_send_grad() -- apply w*=w*+mva*(w-w*)
+      synctime = sys.clock() -- reuse the timer instead of shadowing it
+      pc:ping() -- overlap aio and computation
+      state.dusync = state.dusync + sys.clock() - synctime
+      localupdate()
+      w:add(-1, config.sug) -- w=w+mva*(w*-w)
+   else
+      localupdate()
+   end
+   return w, {fx}
+end
diff --git a/asyncsgd/pclient.lua b/pclient.lua
diff --git a/asyncsgd/pserver.lua b/pserver.lua
diff --git a/train.lua b/train.lua
@@ -346,10 +346,28 @@ elseif opt.optimizer == 'sgd' then
     optim_state.momentum = 0.99
     optim_state.nesterov = true
     optim_state.dampening = 0
+elseif opt.optimizer == 'eamsgd' then
+    optimizer = optim.eamsgd
+    optim_state.learningRate = opt.learningRate
+    optim_state.momentum = opt.momentum
+    optim_state.pclient = opt.pclient
+    optim_state.communicationPeriod = opt.communicationPeriod
+    optim_state.movingRateAlpha = opt.movingRateAlpha
 else
     optimizer = optim.rmsprop
 end
 
+-- initialize MPI optimizer clients (handle is opt.pclient, as used for optim_state above)
+rank = opt.rank
+print('i am ' .. tostring(rank) .. ' ready to run')
+if opt.pclient then
+   opt.pclient:start(params,grad_params)
+   assert(rank == opt.pclient.rank)
+   print('pc ' .. rank .. ' started')
+end
+
+-- run optimizer
+sys.tic() -- time the training procedure
 for i = 1, iterations do
     local epoch = i / loader.ntrain
 
@@ -432,4 +450,9 @@ for i = 1, iterations do
     end
 end
 
+-- stop optimizer clients
+if opt.pclient then
+   opt.pclient:stop()
+end
 
+print(rank,'total training time is', sys.toc())

‹ projects	cluster-rnn a distributed Torch7 RNN cluster over MPI
	Log \| Files \| Refs \| README

A	asyncsgd/codeblock.lua	\|	27	+++++++++++++++++++++++++++
M	asyncsgd/goot.lua	\|	1	+
D	asyncsgd/mlaunch.lua	\|	119	-------------------------------------------------------------------------------
D	asyncsgd/optim-eamsgd.lua	\|	74	--------------------------------------------------------------------------
R	asyncsgd/init.lua -> init.lua	\|	0
A	mlaunch.lua	\|	119	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	optim-eamsgd.lua	\|	75	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R	asyncsgd/pclient.lua -> pclient.lua	\|	0
R	asyncsgd/pserver.lua -> pserver.lua	\|	0
M	train.lua	\|	23	+++++++++++++++++++++++