mlaunch.lua (4597B)
-- mpi launch
-- Author: Sixin Zhang (zsx@cims.nyu.edu)
-- Author: Umhau (umhau@alum.gcc.edu)
-- mpirun -n 12 luajit mlaunch.lua

--[[ NOTES --------------------------------------------------------------------

This script is used to launch mpi. The user's training script is named by the
`torchfile` variable below and is run with dofile() at the bottom of this file,
on client ranks only. Edit this file only to change the variables noted.

MPI is configured so this script will be running separately on each available
core on each available machine in the cluster - so the 'ranks' below will range
from 0-7 if there are two machines with 4 CPU cores each. I don't have any
GPUs, so I can't speak to how those are presented.

The paper refers to some of the parameters with greek symbols. These are noted
as relevant in the variables section below.

--]]

-- VARIABLES ------------------------------------------------------------------

local oncuda = false           -- Set for working with CPUs. Change this if using GPUs.
local torchfile = 'train.lua'  -- name of torch file to run with MPI
local iterations = 10          -- i.e., epochs. don't need that many for testing.

-- advanced parameters
local communicationPeriod = 64  -- \tau
local movingRateAlpha = 0.001   -- \alpha (alternative value tried: 0.9/6)
local learningRate = 5e-3       -- \eta
local momentum = 0.99           -- \delta

-- very advanced parameters
local learningRateDecay = 1e-4
local learningRateDecayPower = 0.5  -- see pg. 23, fig 9 of associated paper

-- there's other EAMSGD variables that can be tuned below. I'll do that later.

-- GPU SETTINGS ---------------------------------------------------------------

-- AGPU is the list of GPU device ids clients may pick from; it stays nil when
-- running on CPUs, and the client branch below uses that as the "no GPU" flag.
local AGPU = nil
if oncuda then
    require 'cutorch'
    AGPU = {1, 2, 3, 4, 5, 6}  -- use the first 6 gpus on each machine
end

-- -1 is passed on to the optimizer options when no GPU has been selected.
local gpuid = -1

-- MPI CONFIGURATION ----------------------------------------------------------

-- init.lua is expected to provide mpiT, pServer and pClient, which are used
-- below but not defined in this file (NOTE(review): confirm against init.lua).
dofile('init.lua')
mpiT.Init()

local world = mpiT.COMM_WORLD
local rank = mpiT.get_rank(world)
local size = mpiT.get_size(world)

-- Single definition of the role rule: even ranks are parameter servers, odd
-- ranks are clients. Seems like there should be more clients than
-- servers...investigate later. (change the '2'?)
local function isServerRank(r)
    return math.fmod(r, 2) == 0
end

-- conf is handed to pServer/pClient: this process's rank, the communicator,
-- and the full lists of server ranks (sranks) and client ranks (cranks).
local conf = {
    rank = rank,
    world = world,
    sranks = {},
    cranks = {},
}
for r = 0, size - 1 do
    local roleList = isServerRank(r) and conf.sranks or conf.cranks
    roleList[#roleList + 1] = r
end

--[[ -- delete this stuff later.
mpiOptions.name = 'downpour'
mpiOptions.lr = 1e-4
mpiOptions.su = 1
--]]

-- mpiOptions is intentionally a GLOBAL: the script run through dofile() below
-- cannot see this file's locals, so a global is the only way to hand these
-- settings over (presumably torchfile reads them - confirm in that script).
mpiOptions = {
    name = 'eamsgd',  -- using most efficient optimizer
    -- lr = 1e-1,
    communicationPeriod = communicationPeriod,
    movingRateAlpha = movingRateAlpha,  -- this is \beta/p when p=6
    learningRate = learningRate,  -- order of magnitude from the other - what's the difference?
    momentum = momentum,
    maxepoch = iterations,
    learningRateDecay = learningRateDecay,
    learningRateDecayPower = learningRateDecayPower,
}

-- determine if the current node should be server or client.
if isServerRank(rank) then
    -- if the rank # is even, it's a server
    print('[server] rank',rank,'use cpu')
    torch.setdefaulttensortype('torch.FloatTensor')
    local ps = pServer(conf)
    ps:start()

else
    -- if node rank # is odd, it's a client. This means we have to choose how
    -- to process the large amount of data that's going to be directed this
    -- way. So, this is where we configure our GPUs or CPUs.

    if AGPU then
        -- if not nil, GPUs are enabled
        require 'cunn'
        -- how many GPUs available on this machine?
        local gpus = cutorch.getDeviceCount()
        if gpus < 1 then
            error('oncuda is set but cutorch reports no GPU devices')
        end
        -- use the node's rank to pick a GPU id from AGPU.
        -- math.floor keeps the modulus integral when `size` is odd (size/2
        -- would otherwise give a fractional, hence nil, table index), and the
        -- final `% #AGPU` keeps the index inside AGPU when the machine has
        -- more devices than AGPU lists. For even `size` and slots already
        -- inside AGPU this selects exactly the same GPU as before.
        local ranksPerGroup = math.floor(size / 2)  -- >= 1: a client implies size >= 2
        local slot = (rank % ranksPerGroup) % gpus
        gpuid = AGPU[slot % #AGPU + 1]
        cutorch.setDevice(gpuid)
        print('[client] rank ' .. rank .. ' use gpu ' .. gpuid)
        torch.setdefaulttensortype('torch.CudaTensor')

    else
        -- if the GPU flag is set FALSE, we're using CPUs
        print('[client] rank ' .. rank .. ' use cpu')
        torch.setdefaulttensortype('torch.FloatTensor')
    end

    -- done with configuring the processors. These are settings specific to the
    -- node at hand, now that we know exactly what it's going to be doing.
    mpiOptions.gpuid = gpuid            -- Selected GPU id, or -1 when running on CPU.
    mpiOptions.pclient = pClient(conf)  -- MPI settings for communicating with the other nodes.
    mpiOptions.rank = rank              -- Simple access to the node number.

    -- Time to run the training algorithm. This is not an arbitrary script,
    -- and must contain some crucial settings.
    dofile(torchfile)

end

-- clean up the MPI communication channels.
mpiT.Finalize()