@@ -84,9 +84,9 @@
 parser.add_argument('--disable-dynamic-topology', action='store_true',
                     default=False, help=('Disable each iteration to transmit one neighbor ' +
                                          'per iteration dynamically.'))
-parser.add_argument('--virtual-topology', type=str, default="power2",
+parser.add_argument('--virtual-topology', type=str, default="expo2",
                     help='The underlying virtual topology. Supporting options are ' +
-                    '[power2(Default), ring, mesh, star].')
+                    '[expo2(Default), ring, mesh, star].')
 
 args = parser.parse_args()
 args.cuda = (not args.no_cuda) and (torch.cuda.is_available())
@@ -100,21 +100,21 @@
 bf.init()
 torch.manual_seed(args.seed)
 if args.dist_optimizer != 'horovod':
-    if args.virtual_topology == "power2":
+    if args.virtual_topology == "expo2":
         pass
     elif args.virtual_topology == "ring":
         bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=1))
     elif args.virtual_topology == "InnerOuterRing":
         assert bf.is_homogeneous, "InnerOuterRing topo should be used only homogeneous environment"
         bf.set_topology(topology_util.InnerOuterRingGraph(
             bf.size(), local_size=bf.local_size()))
-    elif args.virtual_topology == "InnerOuterExp2":
-        assert bf.is_homogeneous, "InnerOuterExp2 topo should be used under homogeneous environment"
-        bf.set_topology(topology_util.InnerOuterExp2Graph(
+    elif args.virtual_topology == "InnerOuterExpo2":
+        assert bf.is_homogeneous, "InnerOuterExpo2 topo should be used under homogeneous environment"
+        bf.set_topology(topology_util.InnerOuterExpo2Graph(
             bf.size(), local_size=bf.local_size()))
     else:
         raise ValueError("Unknown args.virtual_topology, supporting options are " +
-                         "[power2(Default), ring, mesh, star,InnerOuterRing, InnerOuterExp2].")
+                         "[expo2(Default), ring, mesh, star,InnerOuterRing, InnerOuterExpo2].")
 
 if args.cuda:
     print("using cuda.")
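Note on the "expo2" branch above: it only executes `pass`, presumably because BlueFog's runtime already defaults to the exponential-2 topology after `bf.init()`. The minimal sketch below shows what setting it explicitly could look like; it assumes `topology_util.ExponentialTwoGraph` and `bf.in_neighbor_ranks()` are available (neither appears in this diff), so treat it as illustrative rather than the example's actual code.

import bluefog.torch as bf
from bluefog.common import topology_util

bf.init()

# Assumption: the default topology after bf.init() is already the
# exponential-2 graph; setting it explicitly would mirror the RingGraph
# branch shown in the hunk above.
bf.set_topology(topology_util.ExponentialTwoGraph(bf.size()))

# Inspect the neighbors this rank will communicate with.
print(bf.rank(), "in-neighbors:", bf.in_neighbor_ranks())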
@@ -126,21 +126,6 @@
 
 cudnn.benchmark = True
 
-# If set > 0, will resume training from a given checkpoint.
-resume_from_epoch = 0
-# for try_epoch in range(args.epochs, 0, -1):
-#     if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
-#         resume_from_epoch = try_epoch
-#         break
-
-# Bluefog: broadcast resume_from_epoch from rank 0 (which will have
-# checkpoints) to other ranks.
-resume_from_epoch = bf.broadcast(
-    torch.tensor(resume_from_epoch),  # pylint: disable=not-callable
-    root_rank=0,
-    name="resume_from_epoch",
-).item()
-
 # Bluefog: print logs on the first worker.
 verbose = 1 if bf.rank() == 0 else 0
 
@@ -234,15 +219,6 @@
                     '[neighbor_allreduce, gradient_allreduce, allreduce, ' +
                     'win_put, horovod]')
 
-print("resume_from_epoch: ", resume_from_epoch)
-# Restore from a previous checkpoint, if initial_epoch is specified.
-# Bluefog: restore on the first worker which will broadcast weights to other workers.
-# if resume_from_epoch > 0 and bf.rank() == 0:
-#     filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
-#     checkpoint = torch.load(filepath)
-#     model.load_state_dict(checkpoint["model"])
-#     optimizer.load_state_dict(checkpoint["optimizer"])
-
 # Bluefog: broadcast parameters & optimizer state.
 bf.broadcast_parameters(model.state_dict(), root_rank=0)
 bf.broadcast_optimizer_state(optimizer, root_rank=0)
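For context, the two broadcast calls in the hunk above are normally placed right after the local optimizer has been wrapped in one of BlueFog's distributed optimizers, so that every worker starts from identical weights and optimizer state. The sketch below is a self-contained approximation only: the toy model and the `bf.DistributedNeighborAllreduceOptimizer` wrapper are assumptions standing in for the real ResNet model and the dist-optimizer selection that this diff does not show.

import torch
import bluefog.torch as bf

bf.init()

# Toy stand-ins for the example's real model and optimizer (assumption).
model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Assumed wrapper: neighbor allreduce averages with the ranks adjacent in the
# virtual topology instead of performing a global allreduce each step.
optimizer = bf.DistributedNeighborAllreduceOptimizer(optimizer, model=model)

# As in the hunk above: start every worker from rank 0's state.
bf.broadcast_parameters(model.state_dict(), root_rank=0)
bf.broadcast_optimizer_state(optimizer, root_rank=0)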
@@ -345,8 +321,8 @@ def adjust_learning_rate(epoch, batch_idx):
             bf.size(),
             local_size=bf.local_size(),
             self_rank=bf.rank())
-    elif args.virtual_topology == 'InnerOuterExp2':
-        dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExp2DynamicSendRecvRanks(
+    elif args.virtual_topology == 'InnerOuterExpo2':
+        dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
             bf.size(),
             local_size=bf.local_size(),
             self_rank=bf.rank())
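For readers unfamiliar with the dynamic-topology mode, the sketch below shows how a generator like `dynamic_neighbor_allreduce_gen` is typically drawn from once per iteration. It assumes the generator yields `(send_ranks, recv_ranks)` pairs, like BlueFog's other `Get*DynamicSendRecvRanks` helpers; the uniform weighting is illustrative and not taken from this diff.

import bluefog.torch as bf
from bluefog.common import topology_util

bf.init()

# Same constructor as in the hunk above (requires a homogeneous setup).
dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
    bf.size(), local_size=bf.local_size(), self_rank=bf.rank())

for _ in range(3):  # the real script draws once per training iteration
    # Assumed yield format: ranks to send to and ranks to receive from.
    send_ranks, recv_ranks = next(dynamic_neighbor_allreduce_gen)
    # Illustrative uniform averaging weight over self plus incoming neighbors.
    self_weight = 1.0 / (len(recv_ranks) + 1)
    src_weights = {r: self_weight for r in recv_ranks}
    print(bf.rank(), "send to:", send_ranks, "recv from:", recv_ranks)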
@@ -421,7 +397,7 @@ def avg(self):
         return self.sum / self.n
 
 
-for epoch in range(resume_from_epoch, args.epochs):
+for epoch in range(args.epochs):
     train(epoch)
     validate(epoch)
     # save_checkpoint(epoch)