|
| 1 | +import pickle |
| 2 | +import numpy as np |
| 3 | + |
def values_to_cdf(values):
    """Return the empirical CDF heights matching ``values`` in sorted order.

    ``values`` is sorted in place (a side effect visible to the caller), so
    after the call ``values[i]`` pairs with the i-th returned height.

    The returned list is ``[1/n, 2/n, ..., 1.0]`` where ``n = len(values)``;
    an empty input yields an empty list.
    """
    values.sort()
    n = len(values)
    return [rank / n for rank in range(1, n + 1)]
| 12 | + |
| 13 | + |
def parse_result_file(
    model: str,
    dataset: str,
    slo_multiplier: int,
    arrival: str,
    BATCH_DECISION_PATH: str,
    APPARATE_LATENCY_PATH: str,
    OPTIMAL_LATENCY_PATH: str,
):
    """Load three pickled result files and compare per-request serving times.

    The three ``*_PATH`` arguments are ``str.format`` templates. When
    ``BATCH_DECISION_PATH`` contains the substring ``"slo_multiplier"`` the
    templates are filled with ``slo_multiplier`` as well; otherwise only
    ``model``, ``dataset`` and ``arrival`` are supplied.

    The batch-decision pickle must be a mapping with a
    ``"per_request_stats"`` entry whose non-None items are indexable as
    ``(queuing delay, inference time)``. The two latency pickles must be
    sliceable sequences of per-request inference times.

    Serving time is queuing delay plus inference time. All series are
    truncated to the shortest of the inputs.

    Returns:
        A dict with the keys
        ``"apparate_serving_improvement"`` and
        ``"optimal_serving_improvement"`` (percentage reduction of the median
        serving time relative to vanilla), and ``"serving_time_vanilla"``,
        ``"serving_time_ee"``, ``"serving_time_optimal"`` (lists of
        per-request serving times).

    Raises:
        OSError: if any of the three formatted paths cannot be opened.
        KeyError: if a template references a field that is not supplied, or
            the batch-decision pickle lacks ``"per_request_stats"``.
    """
    print(f"model {model}, dataset {dataset}")

    if "slo_multiplier" in BATCH_DECISION_PATH:  # CV workload, with SLO multiplier
        batch_decision_path = BATCH_DECISION_PATH.format(
            model=model, slo_multiplier=slo_multiplier, arrival=arrival
        )
        apparate_latency_path = APPARATE_LATENCY_PATH.format(
            model=model, dataset=dataset, slo_multiplier=slo_multiplier, arrival=arrival
        )
        # NOTE(review): the optimal-latency path is always formatted with
        # slo_multiplier=4 instead of the caller's value. Kept as-is; confirm
        # that this is intentional.
        optimal_latency_path = OPTIMAL_LATENCY_PATH.format(
            model=model, dataset=dataset, slo_multiplier=4, arrival=arrival
        )
    else:  # NLP workload, Azure arrival trace
        batch_decision_path = BATCH_DECISION_PATH.format(model=model, arrival=arrival)
        apparate_latency_path = APPARATE_LATENCY_PATH.format(
            model=model, dataset=dataset, arrival=arrival
        )
        optimal_latency_path = OPTIMAL_LATENCY_PATH.format(
            model=model, dataset=dataset, arrival=arrival
        )

    # SECURITY: pickle.load can execute arbitrary code; only point these
    # templates at result files produced by trusted runs.
    with open(batch_decision_path, "rb") as f1, \
            open(apparate_latency_path, "rb") as f2, \
            open(optimal_latency_path, "rb") as f3:
        batch_decision = pickle.load(f1)
        apparate_latency = pickle.load(f2)
        optimal_latency = pickle.load(f3)

    # Every item: (queuing delay, inference time); None entries are skipped.
    per_request_stats = [
        stats for stats in batch_decision["per_request_stats"] if stats is not None
    ]

    # Align both latency traces to their common length.
    length = min(len(apparate_latency), len(optimal_latency))
    apparate_latency = apparate_latency[:length]
    optimal_latency = optimal_latency[:length]

    # NOTE(ruipan): might be smaller than the number of non-None request
    # stats b/c some requests are dropped.
    num_served_requests = len(apparate_latency)
    print(f"num_served_requests {num_served_requests}")

    served_stats = per_request_stats[:num_served_requests]
    queuing_delays = [stats[0] for stats in served_stats]
    model_inference_time_vanilla = [stats[1] for stats in served_stats]
    model_inference_time_ee = apparate_latency
    model_inference_time_optimal = optimal_latency

    # zip() stops at the shortest input, so each serving-time list is no
    # longer than queuing_delays.
    serving_time_vanilla = [sum(pair) for pair in zip(queuing_delays, model_inference_time_vanilla)]
    serving_time_ee = [sum(pair) for pair in zip(queuing_delays, model_inference_time_ee)]
    serving_time_optimal = [sum(pair) for pair in zip(queuing_delays, model_inference_time_optimal)]

    median_vanilla = np.median(serving_time_vanilla)
    apparate_serving_improvement = 100 * (1 - np.median(serving_time_ee) / median_vanilla)
    optimal_serving_improvement = 100 * (1 - np.median(serving_time_optimal) / median_vanilla)

    return {
        "apparate_serving_improvement": apparate_serving_improvement,
        "optimal_serving_improvement": optimal_serving_improvement,
        "serving_time_vanilla": serving_time_vanilla,
        "serving_time_ee": serving_time_ee,
        "serving_time_optimal": serving_time_optimal,
    }
0 commit comments