‹ projects

cluster-rnn

a distributed Torch7 RNN cluster over MPI
Log | Files | Refs | README

commit daa6e794ac441e478a3efa2b7794ea1d0d2451d4
parent 4f4198ba67f6ea53b20bf19baba0aebb4fb9fa58
Author: umhau <umhau@users.noreply.github.com>
Date:   Tue, 14 Feb 2017 17:53:57 -0500

implement EAMSGD in word-rnn

Diffstat:
A asyncsgd/codeblock.lua | 27 +++++++++++++++++++++++++++
M asyncsgd/goot.lua | 1 +
D asyncsgd/mlaunch.lua | 119 -------------------------------------------------------------------------------
D asyncsgd/optim-eamsgd.lua | 74 --------------------------------------------------------------------------
R asyncsgd/init.lua -> init.lua | 0
A mlaunch.lua | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A optim-eamsgd.lua | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R asyncsgd/pclient.lua -> pclient.lua | 0
R asyncsgd/pserver.lua -> pserver.lua | 0
M train.lua | 23 +++++++++++++++++++++++
10 files changed, 245 insertions(+), 193 deletions(-)

diff --git a/asyncsgd/codeblock.lua b/asyncsgd/codeblock.lua @@ -0,0 +1,26 @@ +------------------------------------------------------------------------------- +-- Author: umhau (umhau@alum.gcc.edu) +------------------------------------------------------------------------------- + + + +-- NOTES ---------------------------------------------------------------------- + + + +-- MPI SETTINGS --------------------------------------------------------------- +-- most of these are set in the mlaunch file. These are mostly duplicates. + +local state = state or {} +local mb = opt.mb or 128 + +require 'optim' + +opti = optim.eamsgd +state.optim = { + lr = lr, + pclient = pclient, + su = su, + mva = mva, + mom = mom, +} +\ No newline at end of file diff --git a/asyncsgd/goot.lua b/asyncsgd/goot.lua @@ -10,6 +10,7 @@ local opt = opt or {} -- location of training data +-- looks like it's designed to continue processing data after an interruption. local data_root = opt.data_root or io.popen('echo $HOME'):read() .. '/data/torch7/mnist10' diff --git a/asyncsgd/mlaunch.lua b/asyncsgd/mlaunch.lua @@ -1,119 +0,0 @@ --- mpi launch --- Author: Sixin Zhang (zsx@cims.nyu.edu) --- Author: Umhau (umhau@alum.gcc.edu) --- mpirun -n 12 luajit mlaunch.lua - ---[[ NOTES -------------------------------------------------------------------- - -This script is used to launch mpi. The user's script goes at the bottom, where -goot.lua has been referenced. Edit this file only to change the variables -noted. - -MPI is configured so this script will be running separately on each available -core on each available machine in the cluster - so the 'ranks' below will range -from 0-7 if there are two machines with 4 CPU cores each. I don't have any -GPUs, so I can't speak to how those are presented. - ---]] - --- VARIABLES ------------------------------------------------------------------ - -local oncuda = false -- Set for working with CPUs. Change this if using GPUs. 
-local torchfile = 'goot.lua' -- name of torch file to run with MPI -local iterations = 10 -- i.e., epochs. don't need that many for testing. - --- there's other EAMSGD variables that can be tuned below. I'll do that later. - --- GPU SETTINGS --------------------------------------------------------------- - -local AGPU = nil -if oncuda then - require 'cutorch' - AGPU = {1,2,3,4,5,6} -- use the first 6 gpus on each machine -end - -local gpuid = -1 - --- MPI CONFIGURATION ---------------------------------------------------------- - -dofile('init.lua') -mpiT.Init() - -local world = mpiT.COMM_WORLD -local rank = mpiT.get_rank(world) -local size = mpiT.get_size(world) - -local conf = {} -conf.rank = rank -conf.world = world -conf.sranks = {} -conf.cranks = {} -for i = 0,size-1 do - if math.fmod(i,2)==0 then - table.insert(conf.sranks,i) - else - table.insert(conf.cranks,i) - end -end - -opt = {} ---[[ -- delete this stuff later. -opt.name = 'downpour' -opt.lr = 1e-4 -opt.su = 1 ---]] -opt.name = 'eamsgd' -- using most efficient optimizer ---opt.lr = 1e-1 -opt.su = 100 -opt.mva = 0.9/6 -- this is \beta/p when p=6 -opt.lr = 1e-2 -- order of magnitude from the other - what's the difference? -opt.mom = 0.99 - -opt.maxepoch = iterations - --- determine if the current node should be server or client. Seems like there --- should be more clients than servers...investigate later. (change the '2'?) -if math.fmod(rank,2)==0 then - -- if the rank # is even, it's a server - print('[server] rank',rank,'use cpu') - torch.setdefaulttensortype('torch.FloatTensor') - local ps = pServer(conf) - ps:start() - -else - -- if node rank # is odd, it's a client. This means we have to choose how - -- to process the metric $#!7-ton of data that's going to be directed this - -- way. So, this is where we configure our GPUs or CPUs. - - if AGPU then - -- if not nil, GPUs are enabled - require 'cunn' - -- use CUDA - local gpus = cutorch.getDeviceCount() - -- how many GPUs available on this machine? 
- gpuid = AGPU[(rank%(size/2)) % gpus + 1] - -- use the node's rank to set the ID of each(?) GPU - cutorch.setDevice(gpuid) - print('[client] rank ' .. rank .. ' use gpu ' .. gpuid) - torch.setdefaulttensortype('torch.CudaTensor') - - else - -- if the GPU flag is set FALSE, we're using CPUs - print('[client] rank ' .. rank .. ' use cpu') - torch.setdefaulttensortype('torch.FloatTensor') - end - - -- done with configuring the processors. These are settings specific to the - -- node at hand, now that we know exactly what it's going to be doing. - opt.gpuid = gpuid -- Tell the optimizer if GPUs are available. - opt.pc = pClient(conf) -- MPI settings for communicating with the other nodes. - opt.rank = rank -- Simple access to the node number. - - -- Time to run the training algorithm. This is not an arbitrary script, - -- and must contain some cruicial settings. - dofile(torchfile) - -end - --- clean up the MPI communication channels. -mpiT.Finalize() diff --git a/asyncsgd/optim-eamsgd.lua b/asyncsgd/optim-eamsgd.lua @@ -1,74 +0,0 @@ --- Async EASGD/EAMSGD --- Author: Sixin Zhang (zsx@cims.nyu.edu) --- when mom==0, it is the easgd -require 'optim' - -function optim.eamsgd(opfunc, w, config, state) - local config = config or {} - local state = state or config - - local lr = config.lr or 0 -- learning rate \eta - local lrd = config.lrd or 0 -- learning rate decay - local lrp = config.lrp or 0 -- learning rate decay power - local mom = config.mom or 0 -- momentum term \delta - local l2wd = config.l2wd or 0 - - local pc = config.pclient or nil - local mva = config.mva or 0 -- moving rate \alpha - local su = config.su or 1 -- comm period \tau - - state.pversion = state.pversion or 0 - state.dusync = state.dusync or 0 - - local fx,dfdx - local function localupdate() - if lr ~= 0 then - if mom > 0 then - if not state.vt then - state.vt = w:clone():zero() - end - state.vt:mul(mom) - w:add(state.vt) - end - fx,dfdx = opfunc(w) - if l2wd ~= 0 then dfdx:add(l2wd, w) end - local 
clr = lr - if lrd ~= 0 and lrp > 0 then - clr = lr / math.pow(1+state.pversion*lrd,lrp) - end - w:add(-clr,dfdx) - if mom > 0 then - state.vt:add(-clr,dfdx) - end - state.pversion = state.pversion + 1 - end - end - - if (pc and su>0 and mva>0) then - if (state.pversion%su == 0) then - if not config.suw then -- need 2 copies - config.suw = torch.Tensor():typeAs(w):resizeAs(w):fill(0) - config.sug = torch.Tensor():typeAs(w):resizeAs(w):fill(0) - pc:reset(config.suw,config.sug) - end - pc:async_recv_param() -- suw=w* - local synctime = sys.clock() - pc:wait() -- sug is sent and suw is recv - state.dusync = state.dusync + sys.clock()-synctime - config.sug:copy(w) -- sug=w - config.sug:add(-1,config.suw) -- sug=w-w* - config.sug:mul(mva) -- sug=mva*(w-w*) - pc:async_send_grad() -- apply w*=w*+mva*(w-w*) - local synctime = sys.clock() - pc:ping() -- overlap aio and computation - state.dusync = state.dusync + sys.clock()-synctime - localupdate() - w:add(-1,config.sug) -- w=w+mva*(w*-w) - else - localupdate() - end - else - assert(false) - end - return w,{fx} -end diff --git a/asyncsgd/init.lua b/init.lua diff --git a/mlaunch.lua b/mlaunch.lua @@ -0,0 +1,119 @@ +-- mpi launch +-- Author: Sixin Zhang (zsx@cims.nyu.edu) +-- Author: Umhau (umhau@alum.gcc.edu) +-- mpirun -n 12 luajit mlaunch.lua + +--[[ NOTES -------------------------------------------------------------------- + +This script is used to launch mpi. The user's script goes at the bottom, where +goot.lua has been referenced. Edit this file only to change the variables +noted. + +MPI is configured so this script will be running separately on each available +core on each available machine in the cluster - so the 'ranks' below will range +from 0-7 if there are two machines with 4 CPU cores each. I don't have any +GPUs, so I can't speak to how those are presented. + +--]] + +-- VARIABLES ------------------------------------------------------------------ + +local oncuda = false -- Set for working with CPUs. 
Change this if using GPUs. +local torchfile = 'goot.lua' -- name of torch file to run with MPI +local iterations = 10 -- i.e., epochs. don't need that many for testing. + +-- there's other EAMSGD variables that can be tuned below. I'll do that later. + +-- GPU SETTINGS --------------------------------------------------------------- + +local AGPU = nil +if oncuda then + require 'cutorch' + AGPU = {1,2,3,4,5,6} -- use the first 6 gpus on each machine +end + +local gpuid = -1 + +-- MPI CONFIGURATION ---------------------------------------------------------- + +dofile('init.lua') +mpiT.Init() + +local world = mpiT.COMM_WORLD +local rank = mpiT.get_rank(world) +local size = mpiT.get_size(world) + +local conf = {} +conf.rank = rank +conf.world = world +conf.sranks = {} +conf.cranks = {} +for i = 0,size-1 do + if math.fmod(i,2)==0 then + table.insert(conf.sranks,i) + else + table.insert(conf.cranks,i) + end +end + +opt = {} +--[[ -- delete this stuff later. +opt.name = 'downpour' +opt.lr = 1e-4 +opt.su = 1 +--]] +opt.name = 'eamsgd' -- using most efficient optimizer +--opt.lr = 1e-1 +opt.communicationPeriod = 100 +opt.movingRateAlpha = 0.9/6 -- this is \beta/p when p=6 +opt.learningRate = 1e-1 --1e-2 -- order of magnitude from the other - what's the difference? +opt.momentum = 0.99 + +opt.maxepoch = iterations + +-- determine if the current node should be server or client. Seems like there +-- should be more clients than servers...investigate later. (change the '2'?) +if math.fmod(rank,2)==0 then + -- if the rank # is even, it's a server + print('[server] rank',rank,'use cpu') + torch.setdefaulttensortype('torch.FloatTensor') + local ps = pServer(conf) + ps:start() + +else + -- if node rank # is odd, it's a client. This means we have to choose how + -- to process the metric $#!7-ton of data that's going to be directed this + -- way. So, this is where we configure our GPUs or CPUs. 
+ + if AGPU then + -- if not nil, GPUs are enabled + require 'cunn' + -- use CUDA + local gpus = cutorch.getDeviceCount() + -- how many GPUs available on this machine? + gpuid = AGPU[(rank%(size/2)) % gpus + 1] + -- use the node's rank to set the ID of each(?) GPU + cutorch.setDevice(gpuid) + print('[client] rank ' .. rank .. ' use gpu ' .. gpuid) + torch.setdefaulttensortype('torch.CudaTensor') + + else + -- if the GPU flag is set FALSE, we're using CPUs + print('[client] rank ' .. rank .. ' use cpu') + torch.setdefaulttensortype('torch.FloatTensor') + end + + -- done with configuring the processors. These are settings specific to the + -- node at hand, now that we know exactly what it's going to be doing. + opt.gpuid = gpuid -- Tell the optimizer if GPUs are available. + opt.pclient = pClient(conf) -- MPI settings for communicating with the other nodes. + opt.rank = rank -- Simple access to the node number. + + -- Time to run the training algorithm. This is not an arbitrary script, + -- and must contain some cruicial settings. + dofile(torchfile) + +end + +-- clean up the MPI communication channels. 
+mpiT.Finalize() diff --git a/optim-eamsgd.lua b/optim-eamsgd.lua @@ -0,0 +1,75 @@ +-- Async EASGD/EAMSGD +-- Author: Sixin Zhang (zsx@cims.nyu.edu) +-- Author: umhau (umhau@alum.gcc.edu) +-- when mom==0, it is the easgd +require 'optim' + +function optim.eamsgd(opfunc, w, config, state) + local config = config or {} + local state = state or config + + local lr = config.learningRate or 0 -- learning rate \eta + local lrd = config.learningRateDecay or 0 -- learning rate decay + local lrp = config.learningRateDecayPower or 0 -- learning rate decay power + local mom = config.momentum or 0 -- momentum term \delta + local l2wd = config.l2wd or 0 + + local pc = config.pclient or nil + local mva = config.movingRateAlpha or 0 -- moving rate \alpha + local su = config.communicationPeriod or 1 -- comm period \tau + + state.pversion = state.pversion or 0 + state.dusync = state.dusync or 0 + + local fx,dfdx + local function localupdate() + if lr ~= 0 then + if mom > 0 then + if not state.vt then + state.vt = w:clone():zero() + end + state.vt:mul(mom) + w:add(state.vt) + end + fx,dfdx = opfunc(w) + if l2wd ~= 0 then dfdx:add(l2wd, w) end + local clr = lr + if lrd ~= 0 and lrp > 0 then + clr = lr / math.pow(1+state.pversion*lrd,lrp) + end + w:add(-clr,dfdx) + if mom > 0 then + state.vt:add(-clr,dfdx) + end + state.pversion = state.pversion + 1 + end + end + + if (pc and su>0 and mva>0) then + if (state.pversion%su == 0) then + if not config.suw then -- need 2 copies + config.suw = torch.Tensor():typeAs(w):resizeAs(w):fill(0) + config.sug = torch.Tensor():typeAs(w):resizeAs(w):fill(0) + pc:reset(config.suw,config.sug) + end + pc:async_recv_param() -- suw=w* + local synctime = sys.clock() + pc:wait() -- sug is sent and suw is recv + state.dusync = state.dusync + sys.clock()-synctime + config.sug:copy(w) -- sug=w + config.sug:add(-1,config.suw) -- sug=w-w* + config.sug:mul(mva) -- sug=mva*(w-w*) + pc:async_send_grad() -- apply w*=w*+mva*(w-w*) + local synctime = sys.clock() + 
pc:ping() -- overlap aio and computation + state.dusync = state.dusync + sys.clock()-synctime + localupdate() + w:add(-1,config.sug) -- w=w+mva*(w*-w) + else + localupdate() + end + else + assert(false) + end + return w,{fx} +end diff --git a/asyncsgd/pclient.lua b/pclient.lua diff --git a/asyncsgd/pserver.lua b/pserver.lua diff --git a/train.lua b/train.lua @@ -346,10 +346,28 @@ elseif opt.optimizer == 'sgd' then optim_state.momentum = 0.99 optim_state.nesterov = true optim_state.dampening = 0 +elseif opt.optimizer == 'eamsgd' then + optimizer = optim.eamsgd + optim_state.learningRate = opt.learningRate + optim_state.momentum = opt.momentum + optim_state.pclient = opt.pclient + optim_state.communicationPeriod = opt.communicationPeriod + optim_state.movingRateAlpha = opt.movingRateAlpha else optimizer = optim.rmsprop end +-- initialize MPI optimizer clients +rank = opt.rank +print('i am ' .. rank .. ' ready to run') +if pclient then + pclient:start(params,grad_params) + assert(rank == pclient.rank) + print('pc ' .. rank .. ' started') +end + +-- run optimizer +sys.tic() -- time the training procedure for i = 1, iterations do local epoch = i / loader.ntrain @@ -432,4 +450,9 @@ for i = 1, iterations do end end +-- stop optimizer clients +if pclient then + pclient:stop() +end +print(rank,'total training time is', sys.toc())