diff --git a/shifu-tensorflow-eval/pom.xml b/shifu-tensorflow-eval/pom.xml
index 9b18e55..6e4f62e 100644
--- a/shifu-tensorflow-eval/pom.xml
+++ b/shifu-tensorflow-eval/pom.xml
@@ -108,6 +108,11 @@
hdp-yarn
${project.build.directory}
+
+ net.lingala.zip4j
+ zip4j
+ 1.3.2
+
org.tensorflow
tensorflow
diff --git a/shifu-tensorflow-on-yarn/src/main/resources/backup.py b/shifu-tensorflow-on-yarn/src/main/resources/backup.py
index 709e598..19b3fb9 100644
--- a/shifu-tensorflow-on-yarn/src/main/resources/backup.py
+++ b/shifu-tensorflow-on-yarn/src/main/resources/backup.py
@@ -2,24 +2,13 @@
Author: Tommy Mulc
"""
+import json
+import logging
# from __future__ import print_function
import os
-import tensorflow as tf
-import argparse
import time
-import sys
-import logging
-import gzip
-from StringIO import StringIO
-import random
-import numpy as np
-from tensorflow.python.platform import gfile
-from tensorflow.python.framework import ops
-from tensorflow.python.saved_model import builder
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import signature_def_utils
-from tensorflow.python.saved_model import tag_constants
-import json
+
+import tensorflow as tf
REPLICAS_TO_AGGREGATE_RATIO = 1
FEATURE_COUNT = 30
@@ -27,7 +16,7 @@
VALID_TRAINING_DATA_RATIO = 0.3
DELIMITER = '|'
BATCH_SIZE = 10
-EPOCH = 10 # TODO: should consider recovery from checkpoint, we need to reduce current global step
+EPOCH = 10 # TODO: should consider recovery from checkpoint, we need to reduce current global step
WORKING_DIR = "hdfs://horton/user/webai/.yarn/"
# read from env
@@ -49,16 +38,16 @@ def main(_):
# allows this node know about all other nodes
if job_name == 'ps': # checks if parameter server
- server = tf.train.Server(cluster,
- job_name="ps",
- task_index=task_index)
+ server = tf.compat.v1.train.Server(cluster,
+ job_name="ps",
+ task_index=task_index)
server.join()
else: # it must be a worker server
logging.info("Loading data from worker index = %d" % task_index)
- server = tf.train.Server(cluster,
- job_name="worker",
- task_index=task_index)
+ server = tf.compat.v1.train.Server(cluster,
+ job_name="worker",
+ task_index=task_index)
logging.info("backup worker join!!")
@@ -66,6 +55,5 @@ def main(_):
time.sleep(1000000)
-
if __name__ == '__main__':
- tf.app.run()
+ tf.compat.v1.app.run()