mlaunch.lua - cluster-rnn - a distributed Torch7 RNN cluster over MPI

mlaunch.lua (4597B)
      1 -- mpi launch
      2 -- Author: Sixin Zhang (zsx@cims.nyu.edu)
      3 -- Author: Umhau (umhau@alum.gcc.edu)
      4 -- mpirun -n 12 luajit mlaunch.lua
      5 
      6 --[[ NOTES --------------------------------------------------------------------
      7 
      8 This script is used to launch mpi.  The user's script goes at the bottom, where
      9 goot.lua has been referenced.   Edit this file only to change the variables 
     10 noted.
     11 
     12 MPI is configured so this script will be running separately on each available 
     13 core on each available machine in the cluster - so the 'ranks' below will range
     14 from 0-7 if there are two machines with 4 CPU cores each.  I don't have any 
     15 GPUs, so I can't speak to how those are presented.
     16 
     17 The paper refers to some of the parameters with greek symbols.  These are noted
     18 as relevant in the variables section below.
     19 
     20 --]]
     21 
     22 -- VARIABLES ------------------------------------------------------------------
     23 
     24 local oncuda = false -- Set for working with CPUs. Change this if using GPUs.
     25 local torchfile = 'train.lua' -- name of torch file to run with MPI
     26 local iterations = 10 -- i.e., epochs.  don't need that many for testing.
     27 
     28 -- advanced parameters
     29 local communicationPeriod = 64 -- \tau
     30 local movingRateAlpha     = 0.001 --0.9/6 -- \alpha
     31 local learningRate        = 5e-3 --\eta
     32 local momentum            = 0.99 -- \delta
     33 
     34 -- very advanced parameters
     35 local learningRateDecay = 1e-4
     36 local learningRateDecayPower = 0.5 -- see pg. 23, fig 9 of associated paper
     37 
     38 -- there's other EAMSGD variables that can be tuned below. I'll do that later.
     39 
     40 -- GPU SETTINGS ---------------------------------------------------------------
     41 
     42 local AGPU = nil
     43 if oncuda then
     44    require 'cutorch'
     45    AGPU = {1,2,3,4,5,6} -- use the first 6 gpus on each machine
     46 end
     47 
     48 local gpuid = -1
     49 
     50 -- MPI CONFIGURATION ----------------------------------------------------------
     51 
     52 dofile('init.lua')
     53 mpiT.Init()
     54 
     55 local world = mpiT.COMM_WORLD
     56 local rank = mpiT.get_rank(world)
     57 local size = mpiT.get_size(world)
     58 
     59 local conf = {}
     60 conf.rank = rank
     61 conf.world = world
     62 conf.sranks = {}
     63 conf.cranks = {}
     64 for i = 0,size-1 do
     65    if math.fmod(i,2)==0 then
     66       table.insert(conf.sranks,i)
     67    else
     68       table.insert(conf.cranks,i)
     69    end
     70 end
     71 
     72 mpiOptions = {}
     73 --[[ -- delete this stuff later.
     74 mpiOptions.name = 'downpour'
     75 mpiOptions.lr = 1e-4
     76 mpiOptions.su = 1
     77 --]]
     78 mpiOptions.name = 'eamsgd' -- using most efficient optimizer
     79 --mpiOptions.lr = 1e-1
     80 mpiOptions.communicationPeriod = communicationPeriod
     81 mpiOptions.movingRateAlpha = movingRateAlpha -- this is \beta/p when p=6
     82 mpiOptions.learningRate = learningRate -- order of magnitude from the other - what's the difference?
     83 mpiOptions.momentum = momentum
     84 
     85 mpiOptions.maxepoch = iterations
     86 
     87 mpiOptions.learningRateDecay = learningRateDecay
     88 mpiOptions.learningRateDecayPower = learningRateDecayPower
     89 
     90 -- determine if the current node should be server or client. Seems like there
     91 -- should be more clients than servers...investigate later.  (change the '2'?)
     92 if math.fmod(rank,2)==0 then
     93    -- if the rank # is even, it's a server
     94    print('[server] rank',rank,'use cpu')
     95    torch.setdefaulttensortype('torch.FloatTensor')  
     96    local ps = pServer(conf)
     97    ps:start()
     98 
     99 else
    100    -- if node rank # is odd, it's a client.  This means we have to choose how 
    101    -- to process the metric $#!7-ton of data that's going to be directed this 
    102    -- way.  So, this is where we configure our GPUs or CPUs.  
    103 
    104    if AGPU then
    105       -- if not nil, GPUs are enabled
    106       require 'cunn'
    107       -- use CUDA
    108       local gpus = cutorch.getDeviceCount()
    109       -- how many GPUs available on this machine?
    110       gpuid = AGPU[(rank%(size/2)) % gpus + 1]
    111       -- use the node's rank to set the ID of each(?) GPU
    112       cutorch.setDevice(gpuid)
    113       print('[client] rank ' .. rank .. ' use gpu ' .. gpuid)
    114       torch.setdefaulttensortype('torch.CudaTensor')
    115 
    116    else
    117       -- if the GPU flag is set FALSE, we're using CPUs
    118       print('[client] rank ' .. rank .. ' use cpu')
    119       torch.setdefaulttensortype('torch.FloatTensor')
    120    end
    121 
    122    -- done with configuring the processors.  These are settings specific to the
    123    -- node at hand, now that we know exactly what it's going to be doing.
    124    mpiOptions.gpuid = gpuid       -- Tell the optimizer if GPUs are available.
    125    mpiOptions.pclient = pClient(conf)  -- MPI settings for communicating with the other nodes.
    126    mpiOptions.rank = rank         -- Simple access to the node number.  
    127 
    128    -- Time to run the training algorithm.  This is not an arbitrary script,
    129    -- and must contain some cruicial settings.
    130    dofile(torchfile)
    131 
    132 end
    133 
    134 -- clean up the MPI communication channels.
    135 mpiT.Finalize()
‹ projects	cluster-rnn a distributed Torch7 RNN cluster over MPI
	Log \| Files \| Refs \| README