@@ -84,9 +84,9 @@
 parser.add_argument('--disable-dynamic-topology', action='store_true',
                     default=False, help=('Disable each iteration to transmit one neighbor ' +
                                          'per iteration dynamically.'))
-parser.add_argument('--virtual-topology', type=str, default="power2",
+parser.add_argument('--virtual-topology', type=str, default="expo2",
                     help='The underlying virtual topology. Supporting options are ' +
-                    '[power2(Default), ring, mesh, star].')
+                    '[expo2(Default), ring, mesh, star].')
 
 args = parser.parse_args()
 args.cuda = (not args.no_cuda) and (torch.cuda.is_available())
@@ -100,21 +100,21 @@
 bf.init()
 torch.manual_seed(args.seed)
 if args.dist_optimizer != 'horovod':
-    if args.virtual_topology == "power2":
+    if args.virtual_topology == "expo2":
         pass
     elif args.virtual_topology == "ring":
         bf.set_topology(topology_util.RingGraph(bf.size(), connect_style=1))
     elif args.virtual_topology == "InnerOuterRing":
         assert bf.is_homogeneous, "InnerOuterRing topo should be used only homogeneous environment"
         bf.set_topology(topology_util.InnerOuterRingGraph(
             bf.size(), local_size=bf.local_size()))
-    elif args.virtual_topology == "InnerOuterExp2":
-        assert bf.is_homogeneous, "InnerOuterExp2 topo should be used under homogeneous environment"
-        bf.set_topology(topology_util.InnerOuterExp2Graph(
+    elif args.virtual_topology == "InnerOuterExpo2":
+        assert bf.is_homogeneous, "InnerOuterExpo2 topo should be used under homogeneous environment"
+        bf.set_topology(topology_util.InnerOuterExpo2Graph(
             bf.size(), local_size=bf.local_size()))
     else:
         raise ValueError("Unknown args.virtual_topology, supporting options are " +
-                         "[power2(Default), ring, mesh, star,InnerOuterRing, InnerOuterExp2].")
+                         "[expo2(Default), ring, mesh, star,InnerOuterRing, InnerOuterExpo2].")
 
 if args.cuda:
     print("using cuda.")
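Note on the "expo2" branch above: it only executes `pass`, presumably because BlueFog's runtime already defaults to the exponential-2 topology after `bf.init()`. The minimal sketch below shows what setting it explicitly could look like; it assumes `topology_util.ExponentialTwoGraph` and `bf.in_neighbor_ranks()` are available (neither appears in this diff), so treat it as illustrative rather than the example's actual code.

import bluefog.torch as bf
from bluefog.common import topology_util

bf.init()

# Assumption: the default topology after bf.init() is already the
# exponential-2 graph; setting it explicitly would mirror the RingGraph
# branch shown in the hunk above.
bf.set_topology(topology_util.ExponentialTwoGraph(bf.size()))

# Inspect the neighbors this rank will communicate with.
print(bf.rank(), "in-neighbors:", bf.in_neighbor_ranks())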
@@ -126,21 +126,6 @@
 
 cudnn.benchmark = True
 
-# If set > 0, will resume training from a given checkpoint.
-resume_from_epoch = 0
-# for try_epoch in range(args.epochs, 0, -1):
-#     if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
-#         resume_from_epoch = try_epoch
-#         break
-
-# Bluefog: broadcast resume_from_epoch from rank 0 (which will have
-# checkpoints) to other ranks.
-resume_from_epoch = bf.broadcast(
-    torch.tensor(resume_from_epoch),  # pylint: disable=not-callable
-    root_rank=0,
-    name="resume_from_epoch",
-).item()
-
 # Bluefog: print logs on the first worker.
 verbose = 1 if bf.rank() == 0 else 0
 
@@ -234,15 +219,6 @@
                     '[neighbor_allreduce, gradient_allreduce, allreduce, ' +
                     'win_put, horovod]')
 
-print("resume_from_epoch: ", resume_from_epoch)
-# Restore from a previous checkpoint, if initial_epoch is specified.
-# Bluefog: restore on the first worker which will broadcast weights to other workers.
-# if resume_from_epoch > 0 and bf.rank() == 0:
-#     filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
-#     checkpoint = torch.load(filepath)
-#     model.load_state_dict(checkpoint["model"])
-#     optimizer.load_state_dict(checkpoint["optimizer"])
-
 # Bluefog: broadcast parameters & optimizer state.
 bf.broadcast_parameters(model.state_dict(), root_rank=0)
 bf.broadcast_optimizer_state(optimizer, root_rank=0)
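For context, the two broadcast calls in the hunk above are normally placed right after the local optimizer has been wrapped in one of BlueFog's distributed optimizers, so that every worker starts from identical weights and optimizer state. The sketch below is a self-contained approximation only: the toy model and the `bf.DistributedNeighborAllreduceOptimizer` wrapper are assumptions standing in for the real ResNet model and the dist-optimizer selection that this diff does not show.

import torch
import bluefog.torch as bf

bf.init()

# Toy stand-ins for the example's real model and optimizer (assumption).
model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Assumed wrapper: neighbor allreduce averages with the ranks adjacent in the
# virtual topology instead of performing a global allreduce each step.
optimizer = bf.DistributedNeighborAllreduceOptimizer(optimizer, model=model)

# As in the hunk above: start every worker from rank 0's state.
bf.broadcast_parameters(model.state_dict(), root_rank=0)
bf.broadcast_optimizer_state(optimizer, root_rank=0)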
@@ -345,8 +321,8 @@ def adjust_learning_rate(epoch, batch_idx):
             bf.size(),
             local_size=bf.local_size(),
             self_rank=bf.rank())
-    elif args.virtual_topology == 'InnerOuterExp2':
-        dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExp2DynamicSendRecvRanks(
+    elif args.virtual_topology == 'InnerOuterExpo2':
+        dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
             bf.size(),
             local_size=bf.local_size(),
             self_rank=bf.rank())
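For readers unfamiliar with the dynamic-topology mode, the sketch below shows how a generator like `dynamic_neighbor_allreduce_gen` is typically drawn from once per iteration. It assumes the generator yields `(send_ranks, recv_ranks)` pairs, like BlueFog's other `Get*DynamicSendRecvRanks` helpers; the uniform weighting is illustrative and not taken from this diff.

import bluefog.torch as bf
from bluefog.common import topology_util

bf.init()

# Same constructor as in the hunk above (requires a homogeneous setup).
dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
    bf.size(), local_size=bf.local_size(), self_rank=bf.rank())

for _ in range(3):  # the real script draws once per training iteration
    # Assumed yield format: ranks to send to and ranks to receive from.
    send_ranks, recv_ranks = next(dynamic_neighbor_allreduce_gen)
    # Illustrative uniform averaging weight over self plus incoming neighbors.
    self_weight = 1.0 / (len(recv_ranks) + 1)
    src_weights = {r: self_weight for r in recv_ranks}
    print(bf.rank(), "send to:", send_ranks, "recv from:", recv_ranks)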
@@ -421,7 +397,7 @@ def avg(self):
         return self.sum / self.n
 
 
-for epoch in range(resume_from_epoch, args.epochs):
+for epoch in range(args.epochs):
     train(epoch)
     validate(epoch)
     # save_checkpoint(epoch)