Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
883a468
make file change for caffe
immars Mar 16, 2015
dd25c8a
.gitignore change for Makefile.config
immars Mar 16, 2015
1dbef2b
initial commit
immars Mar 18, 2015
044476c
bugfix
immars Mar 19, 2015
8dab9d2
worket init into run()
immars Mar 20, 2015
64574f6
caffe_lan script
immars Mar 28, 2015
115c9c3
fix for solver state sync
immars Mar 30, 2015
cef5955
fix for large solverstate
immars Mar 31, 2015
f995212
support fb_only
immars Apr 1, 2015
72a63db
script improvement
immars Apr 2, 2015
a4cb45e
sync mode
immars Apr 2, 2015
b400195
caffe_sync
immars Apr 15, 2015
8f15de4
o3
immars Apr 15, 2015
0b79ec0
deprecated flags
immars Apr 15, 2015
13838fd
worker forward in main thread
immars Apr 17, 2015
3c4fd19
multithread share memory
immars Apr 18, 2015
0d15bcc
init solver on different gpu
immars Apr 20, 2015
89fdd85
remove log
immars Apr 20, 2015
b67d4c9
weight_ready removd
immars Apr 20, 2015
f89c517
log change
immars Apr 20, 2015
f292349
log for debug
immars Apr 20, 2015
4ffc00b
init solver on Forwarder::start
immars Apr 20, 2015
9fa84fe
debug check NAN
immars Apr 20, 2015
03a07b1
checkNAN in sync
immars Apr 21, 2015
60f101c
bugfix
immars Apr 21, 2015
00fa8ec
disable checknan
immars Apr 21, 2015
424c9d3
async shared memory
immars Apr 21, 2015
ed588a5
async shared workder
immars Apr 21, 2015
deac22d
server/worker run() looping
immars Apr 21, 2015
3cc4c5b
better killing caffe in lan
immars Apr 21, 2015
0182415
sync initial weight pull
immars Apr 21, 2015
2ef5388
optimize diff lock
immars Apr 21, 2015
9f558ec
wantedVersion init error
immars Apr 21, 2015
8d01e49
benchmark log
immars Apr 21, 2015
97e6da8
better lock to mu_version
immars Apr 21, 2015
3b34dbe
try adding in gpu
immars Apr 21, 2015
01a69b7
p2p access test
immars Apr 21, 2015
3105d59
double buffer for diff
immars Apr 23, 2015
e857a54
benchmark log
immars Apr 23, 2015
dae815f
pre sync back buffer to gpu before swapping to front
immars Apr 23, 2015
4539181
log benchmarking accumulateDiff
immars Apr 23, 2015
36e2702
fix for server: back buf clear to 0
immars Apr 23, 2015
d5446f7
remove benchmark log
immars Apr 25, 2015
891dc83
pull iteration count from server
immars Apr 27, 2015
d7e6eaf
copyWeight after test phase
immars Apr 27, 2015
2953f2b
script change
immars Apr 27, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
*.pb.cc
.*
/script/van*
Makefile.config
29 changes: 24 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@

# Machine-specific settings are pulled in from Makefile.config (git-ignored;
# Makefile.config.example is the template). CUDA_PATH and CAFFE_PATH, which
# are referenced below, are expected to come from that file.
CONFIG_FILE := Makefile.config
include $(CONFIG_FILE)

# Compiler driver used for both compiling and linking.
CC = g++

# Optimisation / debug-info level; the commented lines are alternative
# debug-build settings.
# OPT = -O0 -ggdb
# OPT = -O0 -ggdb -DDEBUG
OPT = -O3 -ggdb

# NOTE(review): this is a diff view without +/- markers. Where two
# near-identical assignments follow each other (THIRD_PATH, THIRD_LIB,
# INCPATH, LDFLAGS) the first is presumably the removed line and the second
# its replacement -- confirm against the real Makefile.

# Root of the bundled third-party headers and libraries.
THIRD_PATH=$(shell pwd)/third_party
THIRD_PATH=$(shell pwd -L )/third_party

# STATIC_THIRD_LIB=1 links the third-party libraries as .a archives by full
# path; any other value links them with -l flags (plus the CUDA runtime).
STATIC_THIRD_LIB=0
ifeq ($(STATIC_THIRD_LIB), 1)
THIRD_LIB=$(addprefix $(THIRD_PATH)/lib/, libgflags.a libzmq.a libprotobuf.a libglog.a libz.a libsnappy.a)
else
THIRD_LIB=-L$(THIRD_PATH)/lib -lgflags -lzmq -lprotobuf -lglog -lz -lsnappy
THIRD_LIB=-L$(THIRD_PATH)/lib -lgflags -lzmq -lprotobuf -lglog -lz -lsnappy -L$(CUDA_PATH)/lib64 -lcudart
endif
# THIRD_LIB+=-ltcmalloc_and_profiler

# Warning flags; -Wconversion is left disabled at the end of the line.
WARN = -Wall -Wno-unused-function -finline-functions -Wno-sign-compare #-Wconversion
# Header search path: project sources, third-party, Eigen, Caffe (sources and
# build/src under CAFFE_PATH) and CUDA.
INCPATH = -I./src -I$(THIRD_PATH)/include
INCPATH = -I./src -I$(THIRD_PATH)/include -I/usr/include/eigen3 -I$(CAFFE_PATH)/include -I$(CAFFE_PATH)/build/src -I$(CUDA_PATH)/include
CFLAGS = -std=c++0x $(WARN) $(OPT) $(INCPATH)
# Link flags; the rpath entries embed the Caffe and third-party library
# directories into the linked binaries.
LDFLAGS += $(THIRD_LIB) -lpthread -lrt
LDFLAGS += $(THIRD_LIB) -lboost_thread -lboost_system -lpthread -lrt -lcaffe -L$(CAFFE_PATH)/build/lib -Wl,-rpath=$(CAFFE_PATH)/build/lib -Wl,-rpath=$(THIRD_PATH)/lib

# Static archives that the application binaries link against.
PS_LIB = build/libps.a
PS_MAIN = build/libpsmain.a
Expand All @@ -30,6 +35,20 @@ app: build/ps
build/hello: build/app/hello_world/main.o $(PS_LIB) $(PS_MAIN)
$(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@

# Caffe-backed parameter-server binaries. Each one links a single application
# object against $(PS_LIB) with the shared $(CFLAGS) and $(LDFLAGS).
# Recipe lines must begin with a hard TAB.
build/caffe: build/app/caffe/caffe_main.o $(PS_LIB)
	$(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@

build/caffe_sync: build/app/caffe/caffe_synced.o $(PS_LIB)
	$(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@

build/caffe_share: build/app/caffe/caffe_share.o $(PS_LIB)
	$(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@

build/caffe_async_share: build/app/caffe/caffe_async_share.o $(PS_LIB)
	$(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@

# Convenience alias that builds every caffe variant. It names no file, so it
# is declared phony; otherwise a stray file called "caffe_all" would make the
# target look up to date.
.PHONY: caffe_all
caffe_all: build/caffe build/caffe_sync build/caffe_share build/caffe_async_share

sys_srcs = $(wildcard src/util/*.cc) $(wildcard src/data/*.cc) \
$(wildcard src/system/*.cc) $(wildcard src/filter/*.cc)
sys_protos = $(wildcard src/*/proto/*.proto)
Expand Down
8 changes: 8 additions & 0 deletions Makefile.config.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##

# CUDA install root; the Makefile uses $(CUDA_PATH)/include and $(CUDA_PATH)/lib64
CUDA_PATH := /usr/local/cuda

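# Caffe source tree. Caffe must already be built there, because the Makefile
# uses $(CAFFE_PATH)/include, $(CAFFE_PATH)/build/src and $(CAFFE_PATH)/build/lib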
CAFFE_PATH := /data/ML/caffe/caffe
17 changes: 17 additions & 0 deletions conf/caffe.as.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# async shared
PS_PATH /home/immars/work/ML/distributed/parameter_server/build/caffe_async_share
PUSH 4
PULL 8
SCHEDULER 192.168.1.108

SERVER 192.168.1.108 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt 0 snapshot/bvlc_maxmap_iter_1210000.solverstate
# SERVER 192.168.1.108 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt 0
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt 0 W0,W1,W2,W3
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.maxmap.prototxt 1
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.maxmap.prototxt 2
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.maxmap.prototxt 3
WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt 0 W0,W1,W2,W3
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.maxmap.prototxt 1
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.maxmap.prototxt 2
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.maxmap.prototxt 3

6 changes: 6 additions & 0 deletions conf/caffe.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
PS_PATH /home/immars/work/ML/distributed/parameter_server/build/caffe
SCHEDULER 192.168.1.108

SERVER 192.168.1.108 /home/immars/work/ML/caffe/caffe/models/bvlc_reference_caffenet/S0 solver.prototxt -1
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_reference_caffenet/W0 solver.prototxt 0
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_reference_caffenet/W1 solver.prototxt 1
16 changes: 16 additions & 0 deletions conf/caffe.googlenet.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
PS_PATH /home/immars/work/ML/distributed/parameter_server/build/caffe_sync
PUSH 8
PULL 8
SCHEDULER 192.168.1.108

# SERVER 192.168.1.108 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.adadelta.prototxt -1 snapshot/bvlc_googlenet_iter_220000.solverstate
SERVER 192.168.1.108 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.adadelta.s.prototxt -1 snapshot/bvlc_googlenet_iter_140000.solverstate
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W0 solver.adadelta.c.prototxt 0
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.adadelta.c.prototxt 1
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.adadelta.c.prototxt 2
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.adadelta.c.prototxt 3
WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W0 solver.adadelta.c.prototxt 0
WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.adadelta.c.prototxt 1
WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.adadelta.c.prototxt 2
WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.adadelta.c.prototxt 3

15 changes: 15 additions & 0 deletions conf/caffe.maxmap.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
PS_PATH /home/immars/work/ML/distributed/parameter_server/build/caffe_sync
PUSH 8
PULL 8
SCHEDULER 192.168.1.108

SERVER 192.168.1.108 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt -1
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W0 solver.maxmap.prototxt 0
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.maxmap.prototxt 1
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.maxmap.prototxt 2
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.maxmap.prototxt 3
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W0 solver.maxmap.prototxt 0
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.maxmap.prototxt 1
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.maxmap.prototxt 2
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.maxmap.prototxt 3

15 changes: 15 additions & 0 deletions conf/caffe.share.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
PS_PATH /home/immars/work/ML/distributed/parameter_server/build/caffe_share
PUSH 8
PULL 8
SCHEDULER 192.168.1.108

SERVER 192.168.1.108 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt -1
WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt -1 W0,W1,W2,W3
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.maxmap.prototxt 1
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.maxmap.prototxt 2
# WORKER 192.168.1.110 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.maxmap.prototxt 3
WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML solver.maxmap.prototxt 0 W0,W1,W2,W3
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W1 solver.maxmap.prototxt 1
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W2 solver.maxmap.prototxt 2
# WORKER 192.168.1.112 /home/immars/work/ML/caffe/caffe/models/bvlc_googlenet/ML/W3 solver.maxmap.prototxt 3

29 changes: 29 additions & 0 deletions script/caffe_kill_lan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Kill the parameter-server processes described by a cluster conf file:
# first on this machine, then over ssh on every host that appears on a
# SERVER or WORKER line of the conf.
#
# usage: ./caffe_kill_lan.sh conf_path
# set -x
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../third_party/lib

if [ $# -lt 1 ]; then
    echo "usage: ./caffe_kill_lan.sh conf_path"
    echo "conf_path: cluster conf file with PS_PATH, SERVER and WORKER lines"
    exit 255   # same status bash reported for the original "exit -1"
fi

# Work on a comment-stripped copy of the conf and remove it when the script
# exits, so repeated runs do not accumulate temp files.
tmp=$( mktemp ) || exit 255
trap 'rm -f "$tmp"' EXIT
grep -v '^#' "$1" > "$tmp"
conf=$tmp

# The process name to kill is the last path component of the PS_PATH value.
app=$( grep PS_PATH "$conf" | awk -F'/' '{print $NF;}' )
echo "app: $app"
echo "kill local"

killall caffe || killall "$app"

echo "kill servers"
# Column 2 of every SERVER/WORKER line is a host; visit each host once.
# ssh -n keeps ssh from reading the host list that awk is still consuming
# on the shared stdin.
grep -E "WORKER|SERVER" "$conf" | awk '{print $2;}' | sort -u | awk -v app="$app" '
{
    cmd="ssh -n immars@" $0 " \"killall caffe || killall " app " \" ";
    print cmd;
    system(cmd);
}
'
62 changes: 62 additions & 0 deletions script/caffe_lan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
# Launch a parameter-server cluster over the LAN from a cluster conf file.
#
# usage: ./caffe_lan.sh conf_path [args..]
#
# Conf lines read by this script (lines starting with '#' are ignored):
#   PS_PATH   <binary>                              binary run on every node
#   SCHEDULER <ip>                                  scheduler host
#   PULL <n> / PUSH <n>                             optional step settings
#   SERVER <ip> <workdir> <solver> <gpu> [snapshot]
#   WORKER <ip> <workdir> <solver> <gpu> [workers]
# The scheduler is started locally; every SERVER/WORKER is started through
# ssh in its workdir, with output redirected to <workdir>/stdout.txt.
# set -x
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../third_party/lib
if [ $# -lt 1 ]; then
    echo "usage: ./caffe_lan.sh conf_path [args..]"
    echo "conf_path: cluster conf file with PS_PATH, SCHEDULER, SERVER and WORKER lines (PULL/PUSH optional)"
    exit 255   # same status bash reported for the original "exit -1"
fi

# Work on a comment-stripped copy of the conf; remove it when the script exits.
tmp=$( mktemp ) || exit 255
trap 'rm -f "$tmp"' EXIT
grep -v '^#' "$1" > "$tmp"
conf=$tmp
# Drop conf_path so that only the extra [args..] are forwarded to the scheduler.
shift

bin=$( grep PS_PATH "$conf" | awk '{print $2;}' )
sch_ip=$( grep SCHEDULER "$conf" | awk '{print $2;}' )
num_servers=$( grep -c SERVER "$conf" )
num_workers=$( grep -c WORKER "$conf" )
pullstep=$( grep PULL "$conf" | awk '{print $2;}' )
pushstep=$( grep PUSH "$conf" | awk '{print $2;}' )

arg="-num_servers ${num_servers} -num_workers ${num_workers} $*" #" -app ${dir}/$@"

echo "$bin $conf $sch_ip $num_servers $num_workers"
killall -q caffe

# start the scheduler
Sch="role:SCHEDULER,hostname:'$sch_ip',port:8001,id:'H'"
# ${arg} is intentionally unquoted: it holds several space-separated flags.
"${bin}" -my_node "${Sch}" -scheduler "${Sch}" ${arg} >/dev/null 2>/dev/null &

# launch_nodes ROLE ID_PREFIX BASE_PORT EXTRA_OPT FIXED_FLAGS
#   Start one remote process per conf line containing ROLE.
#   ROLE         SERVER or WORKER (also used as the role: value of -my_node)
#   ID_PREFIX    prefix of the node id; ids are ID_PREFIX0, ID_PREFIX1, ...
#   BASE_PORT    port of the first node; each following node gets the next port
#   EXTRA_OPT    option name used for the optional 6th conf column
#   FIXED_FLAGS  literal text inserted before --gpu (may be empty)
launch_nodes() {
    grep "$1" "$conf" | awk -v role="$1" -v prefix="$2" -v port="$3" \
        -v extra_opt="$4" -v fixed="$5" \
        -v bin="$bin" -v sch="$Sch" -v nums="$num_servers" -v numw="$num_workers" \
        -v pullstep="$pullstep" -v pushstep="$pushstep" -v q="'" '
    BEGIN { id = 0; }
    {
        ip = $2; wd = $3; solver = $4; gpu = $5; extra = $6;
        if (extra != "") {
            extra = " " extra_opt "=" extra;
        }
        # Only pass the step flags the conf actually sets; an empty
        # "--pullstep=" is not a usable value.
        steps = "";
        if (pullstep != "") { steps = steps " --pullstep=" pullstep; }
        if (pushstep != "") { steps = steps " --pushstep=" pushstep; }

        node = "role:" role ",hostname:" q ip q ",port:" port ",id:" q prefix id q;
        remote = "source /etc/profile && cd " wd " && nohup " bin;
        remote = remote " -num_servers " nums " -num_workers " numw;
        remote = remote " -my_node \\\"" node "\\\" -scheduler \\\"" sch "\\\"";
        remote = remote " --solver=" solver steps fixed " --gpu=" gpu " " extra;
        remote = remote " >" wd "/stdout.txt 2>&1 < /dev/null &";
        cmd = "ssh -f -n immars@" ip " \"" remote "\" ";
        print cmd;
        system(cmd);
        port = port + 1; id = id + 1;
    }
    '
}

# start servers (optional 6th column is a snapshot to resume from)
launch_nodes SERVER S 9600 --snapshot ""

# start workers (optional 6th column is passed as --workers)
launch_nodes WORKER W 9500 --workers " --synced=true"

54 changes: 54 additions & 0 deletions script/caffe_local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash
# Start a scheduler, num_servers servers and num_workers workers of
# build/caffe_sync on this machine (127.0.0.1), then wait for all of them.
#
# usage: ./caffe_local.sh num_servers num_workers root_dir solver.prototxt [args..]
#
# Server i runs in root_dir/S<i> on port 9600+i, worker i in root_dir/W<i> on
# port 9500+i; each node writes its output to stdout.txt in its directory.
# set -x
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../third_party/lib

# Four positional arguments are consumed below, so all four are required.
if [ $# -lt 4 ]; then
    echo "usage: ./caffe_local.sh num_servers num_workers root_dir solver.prototxt [args..]"
    echo "solver.prototxt resides in subdirectories[S0,S1,...,W0,W1,...] of root_dir"
    exit 255   # same status bash reported for the original "exit -1"
fi

bin=$( pwd -L )/build/caffe_sync
num_servers=$1
num_workers=$2
root_dir=$3
solver=$4
shift 4
# Whatever is left is forwarded verbatim to every node.
arg="-num_servers ${num_servers} -num_workers ${num_workers} $*" #" -app ${dir}/$@"


# killall -q $(basename ${bin})
killall -q "${bin}"
sleep 1


# start the scheduler
Sch="role:SCHEDULER,hostname:'127.0.0.1',port:8001,id:'H'"
# ${arg} is intentionally unquoted: it holds several space-separated flags.
"${bin}" -my_node "${Sch}" -scheduler "${Sch}" ${arg} &

# start servers
for ((i=0; i<num_servers; ++i)); do
    port=$((9600 + i))
    id=S${i}
    N="role:SERVER,hostname:'127.0.0.1',port:${port},id:'${id}'"
    # HEAPPROFILE=/tmp/S${i} \
    # CPUPROFILE=/tmp/S${i} \
    echo "cd $root_dir/$id/ && ${bin} -my_node ${N} -scheduler ${Sch} --solver=$solver ${arg} >$root_dir/$id/stdout.txt 2>&1 &"
    # The redirect is resolved after the cd, so it is written relative to the
    # node directory; this also works when root_dir is a relative path.
    cd "$root_dir/$id/" && "${bin}" -my_node "${N}" -scheduler "${Sch}" --pullstep=2 --pushstep=2 --solver="$solver" ${arg} >stdout.txt 2>&1 &
done

# start workers
for ((i=0; i<num_workers; ++i)); do
    port=$((9500 + i))
    id=W${i}
    N="role:WORKER,hostname:'127.0.0.1',port:${port},id:'${id}'"
    # HEAPPROFILE=/tmp/W${i} \
    # CPUPROFILE=/tmp/W${i} \
    cd "$root_dir/$id/" && "${bin}" -my_node "${N}" -scheduler "${Sch}" --pullstep=2 --pushstep=2 --solver="$solver" ${arg} >stdout.txt 2>&1 &
done

wait
Loading