commit daa6e794ac441e478a3efa2b7794ea1d0d2451d4
parent 4f4198ba67f6ea53b20bf19baba0aebb4fb9fa58
Author: umhau <umhau@users.noreply.github.com>
Date: Tue, 14 Feb 2017 17:53:57 -0500
implement EAMSGD in word-rnn
Diffstat:
10 files changed, 245 insertions(+), 193 deletions(-)
diff --git a/asyncsgd/codeblock.lua b/asyncsgd/codeblock.lua
@@ -0,0 +1,26 @@
+-------------------------------------------------------------------------------
+-- Author: umhau (umhau@alum.gcc.edu)
+-------------------------------------------------------------------------------
+
+
+
+-- NOTES ----------------------------------------------------------------------
+
+
+
+-- MPI SETTINGS ---------------------------------------------------------------
+-- most of these are set in the mlaunch file. These are mostly duplicates.
+
+-- `opt` is the global options table filled in by the launcher; fall back to an
+-- empty table so that loading this file without the launcher does not fail on
+-- a nil index.
+local opt = opt or {}
+local state = state or {}
+local mb = opt.mb or 128 -- falls back to 128 when opt.mb is not set
+
+require 'optim'
+
+-- Optimizer function and its settings. The keys are the names optim.eamsgd
+-- reads from its config table; the values are taken from `opt` rather than
+-- from bare names that are never defined in this file.
+opti = optim.eamsgd
+state.optim = {
+    learningRate = opt.learningRate,
+    pclient = opt.pclient,
+    communicationPeriod = opt.communicationPeriod,
+    movingRateAlpha = opt.movingRateAlpha,
+    momentum = opt.momentum,
+}
+\ No newline at end of file
diff --git a/asyncsgd/goot.lua b/asyncsgd/goot.lua
@@ -10,6 +10,7 @@
local opt = opt or {}
-- location of training data
+-- looks like it's designed to continue processing data after an interruption.
local data_root = opt.data_root or
io.popen('echo $HOME'):read() .. '/data/torch7/mnist10'
diff --git a/asyncsgd/mlaunch.lua b/asyncsgd/mlaunch.lua
@@ -1,119 +0,0 @@
--- mpi launch
--- Author: Sixin Zhang (zsx@cims.nyu.edu)
--- Author: Umhau (umhau@alum.gcc.edu)
--- mpirun -n 12 luajit mlaunch.lua
-
---[[ NOTES --------------------------------------------------------------------
-
-This script is used to launch mpi. The user's script goes at the bottom, where
-goot.lua has been referenced. Edit this file only to change the variables
-noted.
-
-MPI is configured so this script will be running separately on each available
-core on each available machine in the cluster - so the 'ranks' below will range
-from 0-7 if there are two machines with 4 CPU cores each. I don't have any
-GPUs, so I can't speak to how those are presented.
-
---]]
-
--- VARIABLES ------------------------------------------------------------------
-
-local oncuda = false -- Set for working with CPUs. Change this if using GPUs.
-local torchfile = 'goot.lua' -- name of torch file to run with MPI
-local iterations = 10 -- i.e., epochs. don't need that many for testing.
-
--- there's other EAMSGD variables that can be tuned below. I'll do that later.
-
--- GPU SETTINGS ---------------------------------------------------------------
-
-local AGPU = nil
-if oncuda then
- require 'cutorch'
- AGPU = {1,2,3,4,5,6} -- use the first 6 gpus on each machine
-end
-
-local gpuid = -1
-
--- MPI CONFIGURATION ----------------------------------------------------------
-
-dofile('init.lua')
-mpiT.Init()
-
-local world = mpiT.COMM_WORLD
-local rank = mpiT.get_rank(world)
-local size = mpiT.get_size(world)
-
-local conf = {}
-conf.rank = rank
-conf.world = world
-conf.sranks = {}
-conf.cranks = {}
-for i = 0,size-1 do
- if math.fmod(i,2)==0 then
- table.insert(conf.sranks,i)
- else
- table.insert(conf.cranks,i)
- end
-end
-
-opt = {}
---[[ -- delete this stuff later.
-opt.name = 'downpour'
-opt.lr = 1e-4
-opt.su = 1
---]]
-opt.name = 'eamsgd' -- using most efficient optimizer
---opt.lr = 1e-1
-opt.su = 100
-opt.mva = 0.9/6 -- this is \beta/p when p=6
-opt.lr = 1e-2 -- order of magnitude from the other - what's the difference?
-opt.mom = 0.99
-
-opt.maxepoch = iterations
-
--- determine if the current node should be server or client. Seems like there
--- should be more clients than servers...investigate later. (change the '2'?)
-if math.fmod(rank,2)==0 then
- -- if the rank # is even, it's a server
- print('[server] rank',rank,'use cpu')
- torch.setdefaulttensortype('torch.FloatTensor')
- local ps = pServer(conf)
- ps:start()
-
-else
- -- if node rank # is odd, it's a client. This means we have to choose how
- -- to process the metric $#!7-ton of data that's going to be directed this
- -- way. So, this is where we configure our GPUs or CPUs.
-
- if AGPU then
- -- if not nil, GPUs are enabled
- require 'cunn'
- -- use CUDA
- local gpus = cutorch.getDeviceCount()
- -- how many GPUs available on this machine?
- gpuid = AGPU[(rank%(size/2)) % gpus + 1]
- -- use the node's rank to set the ID of each(?) GPU
- cutorch.setDevice(gpuid)
- print('[client] rank ' .. rank .. ' use gpu ' .. gpuid)
- torch.setdefaulttensortype('torch.CudaTensor')
-
- else
- -- if the GPU flag is set FALSE, we're using CPUs
- print('[client] rank ' .. rank .. ' use cpu')
- torch.setdefaulttensortype('torch.FloatTensor')
- end
-
- -- done with configuring the processors. These are settings specific to the
- -- node at hand, now that we know exactly what it's going to be doing.
- opt.gpuid = gpuid -- Tell the optimizer if GPUs are available.
- opt.pc = pClient(conf) -- MPI settings for communicating with the other nodes.
- opt.rank = rank -- Simple access to the node number.
-
- -- Time to run the training algorithm. This is not an arbitrary script,
- -- and must contain some cruicial settings.
- dofile(torchfile)
-
-end
-
--- clean up the MPI communication channels.
-mpiT.Finalize()
diff --git a/asyncsgd/optim-eamsgd.lua b/asyncsgd/optim-eamsgd.lua
@@ -1,74 +0,0 @@
--- Async EASGD/EAMSGD
--- Author: Sixin Zhang (zsx@cims.nyu.edu)
--- when mom==0, it is the easgd
-require 'optim'
-
-function optim.eamsgd(opfunc, w, config, state)
- local config = config or {}
- local state = state or config
-
- local lr = config.lr or 0 -- learning rate \eta
- local lrd = config.lrd or 0 -- learning rate decay
- local lrp = config.lrp or 0 -- learning rate decay power
- local mom = config.mom or 0 -- momentum term \delta
- local l2wd = config.l2wd or 0
-
- local pc = config.pclient or nil
- local mva = config.mva or 0 -- moving rate \alpha
- local su = config.su or 1 -- comm period \tau
-
- state.pversion = state.pversion or 0
- state.dusync = state.dusync or 0
-
- local fx,dfdx
- local function localupdate()
- if lr ~= 0 then
- if mom > 0 then
- if not state.vt then
- state.vt = w:clone():zero()
- end
- state.vt:mul(mom)
- w:add(state.vt)
- end
- fx,dfdx = opfunc(w)
- if l2wd ~= 0 then dfdx:add(l2wd, w) end
- local clr = lr
- if lrd ~= 0 and lrp > 0 then
- clr = lr / math.pow(1+state.pversion*lrd,lrp)
- end
- w:add(-clr,dfdx)
- if mom > 0 then
- state.vt:add(-clr,dfdx)
- end
- state.pversion = state.pversion + 1
- end
- end
-
- if (pc and su>0 and mva>0) then
- if (state.pversion%su == 0) then
- if not config.suw then -- need 2 copies
- config.suw = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
- config.sug = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
- pc:reset(config.suw,config.sug)
- end
- pc:async_recv_param() -- suw=w*
- local synctime = sys.clock()
- pc:wait() -- sug is sent and suw is recv
- state.dusync = state.dusync + sys.clock()-synctime
- config.sug:copy(w) -- sug=w
- config.sug:add(-1,config.suw) -- sug=w-w*
- config.sug:mul(mva) -- sug=mva*(w-w*)
- pc:async_send_grad() -- apply w*=w*+mva*(w-w*)
- local synctime = sys.clock()
- pc:ping() -- overlap aio and computation
- state.dusync = state.dusync + sys.clock()-synctime
- localupdate()
- w:add(-1,config.sug) -- w=w+mva*(w*-w)
- else
- localupdate()
- end
- else
- assert(false)
- end
- return w,{fx}
-end
diff --git a/asyncsgd/init.lua b/init.lua
diff --git a/mlaunch.lua b/mlaunch.lua
@@ -0,0 +1,119 @@
+-- mpi launch
+-- Author: Sixin Zhang (zsx@cims.nyu.edu)
+-- Author: Umhau (umhau@alum.gcc.edu)
+-- mpirun -n 12 luajit mlaunch.lua
+
+--[[ NOTES --------------------------------------------------------------------
+
+This script is used to launch mpi. The user's training script is named by the
+`torchfile` variable below and is run at the bottom of this file. Edit this
+file only to change the variables noted.
+
+MPI is configured so this script will be running separately on each available
+core on each available machine in the cluster - so the 'ranks' below will range
+from 0-7 if there are two machines with 4 CPU cores each. I don't have any
+GPUs, so I can't speak to how those are presented.
+
+--]]
+
+-- VARIABLES ------------------------------------------------------------------
+
+local oncuda = false -- false: clients run on CPUs; set to true to use GPUs.
+local torchfile = 'goot.lua' -- training script each client runs via dofile()
+local iterations = 10 -- stored as opt.maxepoch below; kept small for testing.
+
+-- The EAMSGD settings (the opt.* fields further down) can also be tuned.
+
+-- GPU SETTINGS ---------------------------------------------------------------
+
+local AGPU = nil
+if oncuda then
+ require 'cutorch'
+ AGPU = {1,2,3,4,5,6} -- use the first 6 gpus on each machine
+end
+
+local gpuid = -1
+
+-- MPI CONFIGURATION ----------------------------------------------------------
+
+dofile('init.lua')
+mpiT.Init()
+
+local world = mpiT.COMM_WORLD
+local rank = mpiT.get_rank(world)
+local size = mpiT.get_size(world)
+
+-- Sort every rank in the communicator into one of two lists: even ranks go
+-- to the server list, odd ranks to the client list.
+local server_ranks, client_ranks = {}, {}
+for r = 0, size - 1 do
+    local bucket = (r % 2 == 0) and server_ranks or client_ranks
+    bucket[#bucket + 1] = r
+end
+
+-- Settings handed to pServer / pClient further down.
+local conf = {
+    rank = rank,
+    world = world,
+    sranks = server_ranks,
+    cranks = client_ranks,
+}
+
+-- Global options table. It is deliberately not local: the script loaded with
+-- dofile(torchfile) at the bottom of this file reads its settings from it.
+opt = {}
+--[[ -- delete this stuff later.
+opt.name = 'downpour'
+opt.lr = 1e-4
+opt.su = 1
+--]]
+-- NOTE(review): train.lua selects EAMSGD by testing opt.optimizer, not
+-- opt.name -- confirm which field the launched script actually reads.
+opt.name = 'eamsgd' -- using most efficient optimizer
+--opt.lr = 1e-1
+opt.communicationPeriod = 100 -- comm period \tau used by optim.eamsgd
+opt.movingRateAlpha = 0.9/6 -- this is \beta/p when p=6
+opt.learningRate = 1e-1 -- was 1e-2 before; TODO(review): confirm the intended value
+opt.momentum = 0.99
+
+opt.maxepoch = iterations
+
+-- Decide whether this rank acts as a parameter server or as a training
+-- client: even ranks are servers, odd ranks are clients (the same split used
+-- to fill conf.sranks / conf.cranks).
+-- TODO: there should probably be more clients than servers (change the '2'?).
+if math.fmod(rank,2)==0 then
+    -- if the rank # is even, it's a server
+    print('[server] rank',rank,'use cpu')
+    torch.setdefaulttensortype('torch.FloatTensor')
+    local ps = pServer(conf)
+    ps:start()
+
+else
+    -- if node rank # is odd, it's a client. Clients do the actual training,
+    -- so this is where we configure our GPUs or CPUs.
+
+    if AGPU then
+        -- if not nil, GPUs are enabled
+        require 'cunn'
+        -- how many GPUs are available on this machine?
+        local gpus = cutorch.getDeviceCount()
+        -- Half of the ranks are clients. Floor that count so an odd world
+        -- size cannot produce a fractional (nil) table index, and never
+        -- index past the end of AGPU when the machine reports more devices
+        -- than AGPU lists.
+        local nclients = math.floor(size/2)
+        local slots = math.min(gpus, #AGPU)
+        gpuid = AGPU[(rank % nclients) % slots + 1]
+        cutorch.setDevice(gpuid)
+        print('[client] rank ' .. rank .. ' use gpu ' .. gpuid)
+        torch.setdefaulttensortype('torch.CudaTensor')
+
+    else
+        -- if the GPU flag is set FALSE, we're using CPUs
+        print('[client] rank ' .. rank .. ' use cpu')
+        torch.setdefaulttensortype('torch.FloatTensor')
+    end
+
+    -- done with configuring the processors. These are settings specific to
+    -- the node at hand, now that we know exactly what it's going to be doing.
+    opt.gpuid = gpuid -- GPU id in use, or -1 when running on CPU.
+    opt.pclient = pClient(conf) -- MPI client for talking to the servers.
+    opt.rank = rank -- Simple access to the node number.
+
+    -- Time to run the training algorithm. This is not an arbitrary script,
+    -- and must contain some crucial settings.
+    dofile(torchfile)
+
+end
+
+-- clean up the MPI communication channels.
+mpiT.Finalize()
diff --git a/optim-eamsgd.lua b/optim-eamsgd.lua
@@ -0,0 +1,75 @@
+-- Async EASGD/EAMSGD
+-- Author: Sixin Zhang (zsx@cims.nyu.edu)
+-- Author: umhau (umhau@alum.gcc.edu)
+-- when mom==0, it is the easgd
+require 'optim'
+
+--- One step of asynchronous elastic-averaging SGD (EASGD), or EAMSGD when
+-- the momentum setting is greater than zero.
+--
+-- Whenever the local step counter `state.pversion` is a multiple of the
+-- communication period, the worker receives the center parameters w* through
+-- the parameter client, sends back mva*(w - w*), performs its local gradient
+-- step and then subtracts that same difference from its own parameters. On
+-- all other calls only the local gradient step is performed.
+--
+-- @param opfunc  function(w) returning fx, dfdx (value and gradient at w)
+-- @param w       parameter tensor, updated in place
+-- @param config  table of settings. Each long name falls back to the short
+--                name used before the rename, so older callers keep working:
+--                  learningRate (lr), learningRateDecay (lrd),
+--                  learningRateDecayPower (lrp), momentum (mom), l2wd,
+--                  pclient, movingRateAlpha (mva), communicationPeriod (su)
+-- @param state   optional state table (defaults to `config`); holds
+--                `pversion` (local steps taken), `dusync` (time spent
+--                waiting on communication) and `vt` (momentum buffer)
+-- @return w and a one-element table {fx}; fx is nil when learningRate is 0
+-- Raises an error unless a parameter client is given and both
+-- communicationPeriod and movingRateAlpha are greater than zero.
+function optim.eamsgd(opfunc, w, config, state)
+    local config = config or {}
+    local state = state or config
+
+    local lr = config.learningRate or config.lr or 0             -- learning rate \eta
+    local lrd = config.learningRateDecay or config.lrd or 0      -- learning rate decay
+    local lrp = config.learningRateDecayPower or config.lrp or 0 -- learning rate decay power
+    local mom = config.momentum or config.mom or 0               -- momentum term \delta
+    local l2wd = config.l2wd or 0                                -- weight of w added to the gradient
+
+    local pc = config.pclient                                    -- parameter client
+    local mva = config.movingRateAlpha or config.mva or 0        -- moving rate \alpha
+    local su = config.communicationPeriod or config.su or 1      -- comm period \tau
+
+    state.pversion = state.pversion or 0
+    state.dusync = state.dusync or 0
+
+    -- This optimizer only makes sense with a parameter client to talk to.
+    if not (pc and su > 0 and mva > 0) then
+        error('optim.eamsgd: requires config.pclient, ' ..
+              'communicationPeriod > 0 and movingRateAlpha > 0')
+    end
+
+    local fx, dfdx
+
+    -- Local (momentum) gradient step; does nothing when the learning rate is 0.
+    local function localupdate()
+        if lr == 0 then return end
+        if mom > 0 then
+            if not state.vt then
+                state.vt = w:clone():zero()
+            end
+            -- apply the decayed velocity first; the gradient is then
+            -- evaluated at the moved point
+            state.vt:mul(mom)
+            w:add(state.vt)
+        end
+        fx, dfdx = opfunc(w)
+        if l2wd ~= 0 then dfdx:add(l2wd, w) end
+        local clr = lr
+        if lrd ~= 0 and lrp > 0 then
+            clr = lr / math.pow(1 + state.pversion*lrd, lrp)
+        end
+        w:add(-clr, dfdx)
+        if mom > 0 then
+            state.vt:add(-clr, dfdx)
+        end
+        state.pversion = state.pversion + 1
+    end
+
+    if state.pversion % su == 0 then
+        if not config.suw then -- need 2 copies
+            config.suw = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
+            config.sug = torch.Tensor():typeAs(w):resizeAs(w):fill(0)
+            pc:reset(config.suw, config.sug)
+        end
+        pc:async_recv_param()           -- suw=w*
+        local started = sys.clock()
+        pc:wait()                       -- sug is sent and suw is recv
+        state.dusync = state.dusync + sys.clock() - started
+        config.sug:copy(w)              -- sug=w
+        config.sug:add(-1, config.suw)  -- sug=w-w*
+        config.sug:mul(mva)             -- sug=mva*(w-w*)
+        pc:async_send_grad()            -- apply w*=w*+mva*(w-w*)
+        started = sys.clock()
+        pc:ping()                       -- overlap aio and computation
+        state.dusync = state.dusync + sys.clock() - started
+        localupdate()
+        w:add(-1, config.sug)           -- w=w+mva*(w*-w)
+    else
+        localupdate()
+    end
+    return w, {fx}
+end
diff --git a/asyncsgd/pclient.lua b/pclient.lua
diff --git a/asyncsgd/pserver.lua b/pserver.lua
diff --git a/train.lua b/train.lua
@@ -346,10 +346,28 @@ elseif opt.optimizer == 'sgd' then
optim_state.momentum = 0.99
optim_state.nesterov = true
optim_state.dampening = 0
+elseif opt.optimizer == 'eamsgd' then
+ optimizer = optim.eamsgd
+ optim_state.learningRate = opt.learningRate
+ optim_state.momentum = opt.momentum
+ optim_state.pclient = opt.pclient
+ optim_state.communicationPeriod = opt.communicationPeriod
+ optim_state.movingRateAlpha = opt.movingRateAlpha
else
optimizer = optim.rmsprop
end
+-- initialize MPI optimizer clients
+-- The launcher hands the parameter client and the MPI rank over through
+-- `opt`. Both are missing when train.lua is run without the launcher, so fall
+-- back to rank -1 instead of failing on a nil concatenation.
+rank = opt.rank or -1
+print('i am ' .. rank .. ' ready to run')
+if opt.pclient then
+    opt.pclient:start(params,grad_params)
+    assert(rank == opt.pclient.rank)
+    print('pc ' .. rank .. ' started')
+end
+
+-- run optimizer
+sys.tic() -- time the training procedure
for i = 1, iterations do
local epoch = i / loader.ntrain
@@ -432,4 +450,9 @@ for i = 1, iterations do
end
end
+-- stop optimizer clients
+-- The client lives in opt.pclient; it is only present when the MPI launcher
+-- provided one.
+if opt.pclient then
+    opt.pclient:stop()
+end
+print(rank,'total training time is', sys.toc())