Skip to content

Commit 867884e

Browse files
Profiler Team authored and copybara-github committed
Add ComputeBoundRule to Smart Suggestion
PiperOrigin-RevId: 827512012
1 parent 424d7b7 commit 867884e

File tree

4 files changed

+244
-0
lines changed

4 files changed

+244
-0
lines changed

xprof/convert/smart_suggestion/BUILD

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,20 @@ cc_library(
110110
],
111111
)
112112

113+
# Header-only library (hdrs only, no srcs) exposing compute_bound_rule.h.
# The deps below mirror the headers that compute_bound_rule.h includes.
cc_library(
114+
name = "compute_bound_rule",
115+
hdrs = ["compute_bound_rule.h"],
116+
deps = [
117+
":signal_provider",
118+
":smart_suggestion_rule",
119+
"@com_google_absl//absl/status:statusor",
120+
"@com_google_absl//absl/strings",
121+
"@com_google_absl//absl/strings:str_format",
122+
"@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc",
123+
"@xla//xla/tsl/platform:statusor",
124+
],
125+
)
126+
113127
cc_library(
114128
name = "smart_suggestion_rule_factory",
115129
hdrs = ["smart_suggestion_rule_factory.h"],
@@ -122,6 +136,7 @@ cc_library(
122136
name = "all_rules",
123137
hdrs = ["all_rules.h"],
124138
deps = [
139+
":compute_bound_rule",
125140
":data_transfer_bound_rule",
126141
":host_processing_bound_rule",
127142
":input_bound_rule",
@@ -188,3 +203,18 @@ cc_test(
188203
"@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc",
189204
],
190205
)
206+
207+
# Test target for the :compute_bound_rule library, built from
# compute_bound_rule_test.cc.
cc_test(
208+
name = "compute_bound_rule_test",
209+
srcs = ["compute_bound_rule_test.cc"],
210+
deps = [
211+
":compute_bound_rule",
212+
":signal_provider",
213+
":tool_data_provider",
214+
"@com_google_absl//absl/status:statusor",
215+
"@com_google_googletest//:gtest_main",
216+
"@org_xprof//plugin/xprof/protobuf:input_pipeline_proto_cc",
217+
"@org_xprof//plugin/xprof/protobuf:overview_page_proto_cc",
218+
"@org_xprof//plugin/xprof/protobuf:smart_suggestion_proto_cc",
219+
],
220+
)

xprof/convert/smart_suggestion/all_rules.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ limitations under the License.
1616
#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_ALL_RULES_H_
1717
#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_ALL_RULES_H_
1818

19+
#include "xprof/convert/smart_suggestion/compute_bound_rule.h"
1920
#include "xprof/convert/smart_suggestion/data_transfer_bound_rule.h"
2021
#include "xprof/convert/smart_suggestion/host_processing_bound_rule.h"
2122
#include "xprof/convert/smart_suggestion/input_bound_rule.h"
@@ -28,6 +29,7 @@ namespace profiler {
2829
// Registers all smart suggestion rules.
2930
inline void RegisterAllRules(SmartSuggestionRuleFactory* f) {
3031
// go/keep-sorted start
32+
f->Register<ComputeBoundRule>();
3133
f->Register<DataTransferBoundRule>();
3234
f->Register<HostProcessingBoundRule>();
3335
f->Register<InputBoundRule>();
xprof/convert/smart_suggestion/compute_bound_rule.h

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#ifndef THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_
17+
#define THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_
18+
19+
#include <optional>
20+
#include <string>
21+
22+
#include "absl/status/statusor.h"
23+
#include "absl/strings/str_cat.h"
24+
#include "absl/strings/str_format.h"
25+
#include "xla/tsl/platform/statusor.h"
26+
#include "xprof/convert/smart_suggestion/signal_provider.h"
27+
#include "xprof/convert/smart_suggestion/smart_suggestion_rule.h"
28+
#include "plugin/xprof/protobuf/smart_suggestion.pb.h"
29+
30+
namespace tensorflow {
31+
namespace profiler {
32+
33+
// Thresholds, in percent, for classifying a profile as compute bound: if MXU
// utilization is higher than kComputeBoundMxuUtilizationThreshold and HBM
// bandwidth utilization is lower than kComputeBoundHbmUtilizationThreshold,
// the profile is considered compute bound.
//
// Declared `inline` so that every translation unit including this header
// shares one definition of each constant instead of getting its own
// internal-linkage copy.
inline constexpr double kComputeBoundMxuUtilizationThreshold = 70;
inline constexpr double kComputeBoundHbmUtilizationThreshold = 50;
38+
39+
// Rule to detect compute-bound bottleneck.
40+
class ComputeBoundRule : public SmartSuggestionRule {
41+
public:
42+
bool MeetsConditions(const SignalProvider& signal_provider) const override {
43+
absl::StatusOr<double> hbm_utilization_percent =
44+
signal_provider.GetHbmUtilization();
45+
absl::StatusOr<double> mxu_utilization_percent =
46+
signal_provider.GetMxuUtilization();
47+
if (!hbm_utilization_percent.ok() || !mxu_utilization_percent.ok()) {
48+
return false;
49+
}
50+
51+
return *mxu_utilization_percent > kComputeBoundMxuUtilizationThreshold &&
52+
*hbm_utilization_percent < kComputeBoundHbmUtilizationThreshold;
53+
}
54+
55+
absl::StatusOr<std::optional<SmartSuggestion>> GenerateSuggestion(
56+
const SignalProvider& signal_provider) const override {
57+
SmartSuggestion suggestion;
58+
suggestion.set_rule_name("ComputeBoundRule");
59+
60+
TF_ASSIGN_OR_RETURN(double hbm_utilization_percent,
61+
signal_provider.GetHbmUtilization());
62+
TF_ASSIGN_OR_RETURN(double mxu_utilization_percent,
63+
signal_provider.GetMxuUtilization());
64+
65+
std::string suggestion_text = absl::StrCat(
66+
"<p>Your program is likely bottlenecked by <b>Compute Operations</b>: "
67+
"High MXU utilization of <b>",
68+
absl::StrFormat("%.1f", mxu_utilization_percent),
69+
"%</b> and low HBM Bandwidth utilization of <b>",
70+
absl::StrFormat("%.1f", hbm_utilization_percent),
71+
"%</b> indicates that the primary bottleneck is the raw processing "
72+
"power of the hardware. Please consider the following optimizations: "
73+
"</p>",
74+
"<ul>"
75+
"<li><b>Use Mixed Precision:</b> Using bfloat16 for computations and "
76+
"storing weights can significantly speed up matrix multiplications "
77+
"and reduce memory usage. Ensure that this does not negatively impact "
78+
"your model's convergence.</li>"
79+
"<li><b>Optimize Your Kernels:</b> If you are using custom operations, "
80+
"profile them to identify any inefficiencies. For standard "
81+
"operations, ensure you are using the latest version of your framework "
82+
"and libraries (e.g., CUDA, cuDNN for GPUs) which often include "
83+
"optimized kernels.</li>"
84+
"<li><b>Experiment with Batch Size:</b> While a large batch size can "
85+
"improve hardware utilization, an excessively large batch size might "
86+
"not always be optimal. Experiment with different batch sizes to find "
87+
"the sweet spot for your specific model and hardware.</li>"
88+
"<li><b>Re-evaluate the model architecture:</b> Consider if there are "
89+
"more computationally efficient alternatives to your current model or "
90+
"its components. For example, some layers are inherently more "
91+
"computationally expensive than others. Research and experiment with "
92+
"newer, more efficient architectures that can achieve similar "
93+
"performance with fewer floating-point operations (FLOPs)."
94+
"</ul>");
95+
96+
suggestion.set_suggestion_text(suggestion_text);
97+
return suggestion;
98+
}
99+
};
100+
101+
} // namespace profiler
102+
} // namespace tensorflow
103+
104+
#endif // THIRD_PARTY_XPROF_CONVERT_SMART_SUGGESTION_COMPUTE_BOUND_RULE_H_
xprof/convert/smart_suggestion/compute_bound_rule_test.cc

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include "xprof/convert/smart_suggestion/compute_bound_rule.h"
17+
18+
#include <memory>
19+
#include <optional>
20+
#include <utility>
21+
22+
#include "testing/base/public/gmock.h"
23+
#include "<gtest/gtest.h>"
24+
#include "absl/status/statusor.h"
25+
#include "xprof/convert/smart_suggestion/signal_provider.h"
26+
#include "xprof/convert/smart_suggestion/tool_data_provider.h"
27+
#include "plugin/xprof/protobuf/input_pipeline.pb.h"
28+
#include "plugin/xprof/protobuf/overview_page.pb.h"
29+
#include "plugin/xprof/protobuf/smart_suggestion.pb.h"
30+
31+
namespace tensorflow {
32+
namespace profiler {
33+
namespace {
34+
35+
using ::testing::Eq;
36+
using ::testing::Return;
37+
using ::testing::status::IsOkAndHolds;
38+
39+
// Mock ToolDataProvider
40+
class MockToolDataProvider : public ToolDataProvider {
41+
public:
42+
MOCK_METHOD(absl::StatusOr<const OverviewPage*>, GetOverviewPage, (),
43+
(override));
44+
MOCK_METHOD(absl::StatusOr<const InputPipelineAnalysisResult*>,
45+
GetInputPipelineAnalysisResult, (), (override));
46+
};
47+
48+
// With MXU utilization above its threshold (71%) and HBM utilization below
// its threshold (49%), the rule must produce a suggestion that carries the
// rule name and reports both measured values.
TEST(ComputeBoundRuleTest, MeetsConditions) {
  auto mock_tool_data_provider = std::make_unique<MockToolDataProvider>();
  OverviewPage overview_page;
  overview_page.mutable_analysis()->set_mxu_utilization_percent(71.0);
  overview_page.mutable_analysis()
      ->set_memory_bw_utilization_relative_to_hw_limit_percent(49.0);

  EXPECT_CALL(*mock_tool_data_provider, GetOverviewPage())
      .WillRepeatedly(Return(&overview_page));

  SignalProvider signal_provider(std::move(mock_tool_data_provider));
  ComputeBoundRule rule;

  absl::StatusOr<std::optional<SmartSuggestion>> suggestion =
      rule.Apply(signal_provider);
  // ASSERT rather than EXPECT: the checks below dereference both the StatusOr
  // and the optional it holds, which is only valid if this matcher passes.
  ASSERT_THAT(suggestion, IsOkAndHolds(testing::Not(Eq(std::nullopt))));
  EXPECT_EQ((*suggestion)->rule_name(), "ComputeBoundRule");
  EXPECT_THAT((*suggestion)->suggestion_text(),
              testing::HasSubstr(
                  "71.0%</b> and low HBM Bandwidth utilization of <b>49.0%"));
}
69+
70+
// MXU utilization of 69% is not above the MXU threshold, so no suggestion is
// expected even though HBM utilization (49%) is low enough.
TEST(ComputeBoundRuleTest, MxuUtilizationTooLow) {
  OverviewPage page;
  auto* analysis = page.mutable_analysis();
  analysis->set_mxu_utilization_percent(69.0);
  analysis->set_memory_bw_utilization_relative_to_hw_limit_percent(49.0);

  auto provider = std::make_unique<MockToolDataProvider>();
  EXPECT_CALL(*provider, GetOverviewPage()).WillRepeatedly(Return(&page));
  SignalProvider signals(std::move(provider));

  ComputeBoundRule rule;
  absl::StatusOr<std::optional<SmartSuggestion>> outcome = rule.Apply(signals);

  EXPECT_THAT(outcome, IsOkAndHolds(Eq(std::nullopt)));
}
87+
88+
// HBM utilization of 51% is not below the HBM threshold, so no suggestion is
// expected even though MXU utilization (71%) is high enough.
TEST(ComputeBoundRuleTest, HbmUtilizationTooHigh) {
  OverviewPage page;
  auto* analysis = page.mutable_analysis();
  analysis->set_mxu_utilization_percent(71.0);
  analysis->set_memory_bw_utilization_relative_to_hw_limit_percent(51.0);

  auto provider = std::make_unique<MockToolDataProvider>();
  EXPECT_CALL(*provider, GetOverviewPage()).WillRepeatedly(Return(&page));
  SignalProvider signals(std::move(provider));

  ComputeBoundRule rule;
  absl::StatusOr<std::optional<SmartSuggestion>> outcome = rule.Apply(signals);

  EXPECT_THAT(outcome, IsOkAndHolds(Eq(std::nullopt)));
}
105+
106+
} // namespace
107+
} // namespace profiler
108+
} // namespace tensorflow

0 commit comments

Comments
 (0)