/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_
#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_

#include <optional>
#include <string>

#include "absl/status/statusor.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "xla/tsl/platform/statusor.h"
#include "xprof/convert/smart_suggestion/signal_provider.h"
#include "xprof/convert/smart_suggestion/smart_suggestion_rule.h"
#include "plugin/xprof/protobuf/smart_suggestion.pb.h"

namespace tensorflow {
namespace profiler {

// Utilization thresholds (in percent) that classify a workload as compute
// bound: MXU utilization strictly above the MXU threshold together with HBM
// bandwidth utilization strictly below the HBM threshold.
constexpr double kComputeBoundMxuUtilizationThreshold = 70;
constexpr double kComputeBoundHbmUtilizationThreshold = 50;

| 39 | +// Rule to detect compute-bound bottleneck. |
| 40 | +class ComputeBoundRule : public SmartSuggestionRule { |
| 41 | + public: |
| 42 | + bool MeetsConditions(const SignalProvider& signal_provider) const override { |
| 43 | + absl::StatusOr<double> hbm_utilization_percent = |
| 44 | + signal_provider.GetHbmUtilization(); |
| 45 | + absl::StatusOr<double> mxu_utilization_percent = |
| 46 | + signal_provider.GetMxuUtilization(); |
| 47 | + if (!hbm_utilization_percent.ok() || !mxu_utilization_percent.ok()) { |
| 48 | + return false; |
| 49 | + } |
| 50 | + |
| 51 | + return *mxu_utilization_percent > kComputeBoundMxuUtilizationThreshold && |
| 52 | + *hbm_utilization_percent < kComputeBoundHbmUtilizationThreshold; |
| 53 | + } |
| 54 | + |
| 55 | + absl::StatusOr<std::optional<SmartSuggestion>> GenerateSuggestion( |
| 56 | + const SignalProvider& signal_provider) const override { |
| 57 | + SmartSuggestion suggestion; |
| 58 | + suggestion.set_rule_name("ComputeBoundRule"); |
| 59 | + |
| 60 | + TF_ASSIGN_OR_RETURN(double hbm_utilization_percent, |
| 61 | + signal_provider.GetHbmUtilization()); |
| 62 | + TF_ASSIGN_OR_RETURN(double mxu_utilization_percent, |
| 63 | + signal_provider.GetMxuUtilization()); |
| 64 | + |
| 65 | + std::string suggestion_text = absl::StrCat( |
| 66 | + "<p>Your program is likely bottlenecked by <b>Compute Operations</b>: " |
| 67 | + "High MXU utilization of <b>", |
| 68 | + absl::StrFormat("%.1f", mxu_utilization_percent), |
| 69 | + "%</b> and low HBM Bandwidth utilization of <b>", |
| 70 | + absl::StrFormat("%.1f", hbm_utilization_percent), |
| 71 | + "%</b> indicates that the primary bottleneck is the raw processing " |
| 72 | + "power of the hardware. Please consider the following optimizations: " |
| 73 | + "</p>", |
| 74 | + "<ul>" |
| 75 | + "<li><b>Use Mixed Precision:</b> Using bfloat16 for computations and " |
| 76 | + "storing weights can significantly speed up matrix multiplications " |
| 77 | + "and reduce memory usage. Ensure that this does not negatively impact " |
| 78 | + "your model's convergence.</li>" |
| 79 | + "<li><b>Optimize Your Kernels:</b> If you are using custom operations, " |
| 80 | + "profile them to identify any inefficiencies. For standard " |
| 81 | + "operations, ensure you are using the latest version of your framework " |
| 82 | + "and libraries (e.g., CUDA, cuDNN for GPUs) which often include " |
| 83 | + "optimized kernels.</li>" |
| 84 | + "<li><b>Experiment with Batch Size:</b> While a large batch size can " |
| 85 | + "improve hardware utilization, an excessively large batch size might " |
| 86 | + "not always be optimal. Experiment with different batch sizes to find " |
| 87 | + "the sweet spot for your specific model and hardware.</li>" |
| 88 | + "<li><b>Re-evaluate the model architecture:</b> Consider if there are " |
| 89 | + "more computationally efficient alternatives to your current model or " |
| 90 | + "its components. For example, some layers are inherently more " |
| 91 | + "computationally expensive than others. Research and experiment with " |
| 92 | + "newer, more efficient architectures that can achieve similar " |
| 93 | + "performance with fewer floating-point operations (FLOPs)." |
| 94 | + "</ul>"); |
| 95 | + |
| 96 | + suggestion.set_suggestion_text(suggestion_text); |
| 97 | + return suggestion; |
| 98 | + } |
| 99 | +}; |

}  // namespace profiler
}  // namespace tensorflow

#endif  // THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_