/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_
#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_

#include <optional>
#include <string>

#include "absl/status/statusor.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "xla/tsl/platform/statusor.h"
#include "xprof/convert/smart_suggestion/signal_provider.h"
#include "xprof/convert/smart_suggestion/smart_suggestion_rule.h"
#include "plugin/xprof/protobuf/smart_suggestion.pb.h"

namespace tensorflow {
namespace profiler {

// Utilization thresholds (in percent) that classify a workload as compute
// bound: MXU utilization strictly above the MXU threshold together with HBM
// bandwidth utilization strictly below the HBM threshold.
constexpr double kComputeBoundMxuUtilizationThreshold = 70;
constexpr double kComputeBoundHbmUtilizationThreshold = 50;

| 39 | +// Rule to detect compute-bound bottleneck. |
| 40 | +class ComputeBoundRule : public SmartSuggestionRule { |
| 41 | + public: |
| 42 | + bool MeetsConditions(const SignalProvider& signal_provider) const override { |
| 43 | + absl::StatusOr<double> hbm_utilization_percent = |
| 44 | + signal_provider.GetHbmUtilization(); |
| 45 | + absl::StatusOr<double> mxu_utilization_percent = |
| 46 | + signal_provider.GetMxuUtilization(); |
| 47 | + if (!hbm_utilization_percent.ok() || !mxu_utilization_percent.ok()) { |
| 48 | + return false; |
| 49 | + } |
| 50 | + |
| 51 | + return *mxu_utilization_percent > kComputeBoundMxuUtilizationThreshold && |
| 52 | + *hbm_utilization_percent < kComputeBoundHbmUtilizationThreshold; |
| 53 | + } |
| 54 | + |
| 55 | + absl::StatusOr<std::optional<SmartSuggestion>> GenerateSuggestion( |
| 56 | + const SignalProvider& signal_provider) const override { |
| 57 | + SmartSuggestion suggestion; |
| 58 | + suggestion.set_rule_name("ComputeBoundRule"); |
| 59 | + |
| 60 | + TF_ASSIGN_OR_RETURN(double hbm_utilization_percent, |
| 61 | + signal_provider.GetHbmUtilization()); |
| 62 | + TF_ASSIGN_OR_RETURN(double mxu_utilization_percent, |
| 63 | + signal_provider.GetMxuUtilization()); |
| 64 | + |
| 65 | + std::string suggestion_text = absl::StrCat( |
| 66 | + "<p>Your program is likely bottlenecked by <b>Compute Operations</b>: " |
| 67 | + "High MXU utilization of <b>", |
| 68 | + absl::StrFormat("%.1f", mxu_utilization_percent), |
| 69 | + "%</b> and low HBM Bandwidth utilization of <b>", |
| 70 | + absl::StrFormat("%.1f", hbm_utilization_percent), |
| 71 | + "%</b> indicates that the primary bottleneck is the raw processing " |
| 72 | + "power of the hardware. Please consider the following optimizations: " |
| 73 | + "</p>", |
| 74 | + "<ul>" |
| 75 | + "<li><b>Use Mixed Precision:</b> Using bfloat16 for computations and " |
| 76 | + "storing weights can significantly speed up matrix multiplications " |
| 77 | + "and reduce memory usage. Ensure that this does not negatively impact " |
| 78 | + "your model's convergence.</li>" |
| 79 | + "<li><b>Optimize Your Kernels:</b> If you are using custom operations, " |
| 80 | + "profile them to identify any inefficiencies. For standard " |
| 81 | + "operations, ensure you are using the latest version of your framework " |
| 82 | + "and libraries (e.g., CUDA, cuDNN for GPUs) which often include " |
| 83 | + "optimized kernels.</li>" |
| 84 | + "<li><b>Experiment with Batch Size:</b> While a large batch size can " |
| 85 | + "improve hardware utilization, an excessively large batch size might " |
| 86 | + "not always be optimal. Experiment with different batch sizes to find " |
| 87 | + "the sweet spot for your specific model and hardware.</li>" |
| 88 | + "<li><b>Re-evaluate the model architecture:</b> Consider if there are " |
| 89 | + "more computationally efficient alternatives to your current model or " |
| 90 | + "its components. For example, some layers are inherently more " |
| 91 | + "computationally expensive than others. Research and experiment with " |
| 92 | + "newer, more efficient architectures that can achieve similar " |
| 93 | + "performance with fewer floating-point operations (FLOPs)." |
| 94 | + "</ul>"); |
| 95 | + |
| 96 | + suggestion.set_suggestion_text(suggestion_text); |
| 97 | + return suggestion; |
| 98 | + } |
| 99 | +}; |

}  // namespace profiler
}  // namespace tensorflow

#endif  // THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_