-
Notifications
You must be signed in to change notification settings - Fork 40
Add the feature of automatically generating candidates #274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
8a33d90
4fefa16
7a2a228
8697d88
d6e6f20
719e1ac
e22e843
ac6f79b
7d77aa4
48d4c89
c7ec3a8
bfc7e64
1e1d931
75899ec
2574842
49e59b3
527ff85
fb1512d
5ebd65b
31255b7
82d0d1a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -238,7 +238,7 @@ flagcxResult_t cudaAdaptorEventCreate(flagcxEvent_t *event) { | |
| (*event) = NULL; | ||
| flagcxCalloc(event, 1); | ||
| DEVCHECK(cudaEventCreateWithFlags((cudaEvent_t *)(*event), | ||
| cudaEventDisableTiming)); | ||
| cudaEventDefault)); | ||
|
Comment on lines
240
to
+241
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using DEVCHECK(cudaEventCreateWithFlags((cudaEvent_t *)(*event),
cudaEventDisableTiming)); DEVCHECK(cudaEventCreateWithFlags((cudaEvent_t *)(*event),
cudaEventDisableTiming));
mikethegoblin marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The change from `cudaEventDisableTiming` to `cudaEventDefault` […] If timing is not needed, then this change is fine.
Comment on lines
240
to
+241
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| return flagcxSuccess; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,19 +1,87 @@ | ||
| #include "tuner/tuner_util.h" | ||
|
|
||
| #ifdef USE_NVIDIA_ADAPTOR | ||
| static struct flagcxEnvConfig config1 = { | ||
| "defaultConfig1", | ||
| 1, | ||
| {FLAGCX_ENV_TYPE_CREATION, "NCCL_P2P_NVL_CHUNKSIZE", "1024", "524288"}}; | ||
| static struct flagcxEnvConfig config2 = { | ||
| "defaultConfig2", | ||
| 1, | ||
| {FLAGCX_ENV_TYPE_CREATION, "NCCL_P2P_NVL_CHUNKSIZE", "524288", "524288"}}; | ||
|
|
||
| // demo | ||
| flagcxResult_t loadConfigList(std::vector<struct flagcxEnvConfig> &cfgList) { | ||
| cfgList.push_back(config1); | ||
| cfgList.push_back(config2); | ||
| return flagcxSuccess; | ||
|
|
||
| // Safely copy std::string to char buffer, ensuring NUL termination and truncation | ||
| static void safeStrCopy(char *dst, size_t dstSize, const std::string &src) { | ||
| if (dstSize == 0) return; | ||
| size_t copyLen = std::min(dstSize - 1, src.size()); | ||
| if (copyLen > 0) memcpy(dst, src.data(), copyLen); | ||
| dst[copyLen] = '\0'; | ||
|
Comment on lines
+4
to
+8
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The static void safeStrCopy(char *dst, size_t dstSize, const std::string &src) {
if (dstSize == 0) return;
size_t copyLen = std::min(dstSize - 1, src.size());
if (copyLen > 0) memcpy(dst, src.data(), copyLen);
dst[copyLen] = '\0';
if (copyLen < src.size()) {
INFO(FLAGCX_INIT, "String truncated during safeStrCopy");
}
}static void safeStrCopy(char *dst, size_t dstSize, const std::string &src) {
if (dstSize == 0) return;
size_t copyLen = std::min(dstSize - 1, src.size());
if (copyLen > 0) memcpy(dst, src.data(), copyLen);
dst[copyLen] = '\0';
if (copyLen < src.size()) {
INFO(FLAGCX_INIT, "String truncated during safeStrCopy");
}
} |
||
| } | ||
|
|
||
| // Generate all combinations and return a vector of flagcxEnvConfig | ||
| flagcxResult_t generateCandidate(std::vector<struct flagcxEnvConfig> &cfgList) { | ||
|
|
||
| // Return empty if there are no environment variables | ||
| if (vars.empty()){ | ||
| INFO(FLAGCX_INIT, "Invalid number of environment variables: 0"); | ||
| return flagcxInvalidArgument; | ||
| } | ||
|
Comment on lines
+15
to
+18
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The log message indicates an invalid number of environment variables, but the function returns INFO(FLAGCX_INIT, "Invalid number of environment variables: 0");
return flagcxSuccess;There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to return flagcxSuccess and do nothing.
Comment on lines
+15
to
+18
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Returning if (vars.empty()){
INFO(FLAGCX_INIT, "Invalid number of environment variables: 0");
return flagcxSuccess; // Or flagcxSuccess if an empty config is acceptable
} |
||
|
|
||
| // If the number of variables exceeds the structure capacity, reject the input instead of truncating | ||
| if (vars.size() > (size_t)FLAGCX_ENV_LIST_MAX_LENGTH) { | ||
| INFO(FLAGCX_INIT, "The number of environment variables exceeds the maximum length defined by FLAGCX_ENV_LIST_MAX_LENGTH"); | ||
| return flagcxInvalidArgument; | ||
|
||
| } | ||
|
|
||
| // Prepare candidate value lists for each variable (at least one empty string to ensure uniform combination logic) | ||
| std::vector<std::vector<std::string>> lists; | ||
| lists.reserve(vars.size()); | ||
| for (const auto &v : vars) { | ||
| if (v.choices.empty()) { | ||
| lists.emplace_back(std::vector<std::string>{""}); | ||
| } else { | ||
| lists.emplace_back(v.choices); | ||
| } | ||
| } | ||
|
|
||
| // Use an index vector to iterate through the Cartesian product (multi-dimensional counter) | ||
| size_t nvars = lists.size(); | ||
| std::vector<size_t> idx(nvars, 0); | ||
| bool done = (nvars == 0); | ||
| unsigned long numCandidate = 0; | ||
|
|
||
| while (!done) { | ||
| // Construct a flagcxEnvConfig and zero-initialize | ||
| flagcxEnvConfig cfg; | ||
| memset(&cfg, 0, sizeof(cfg)); // this zeroes commTag and all fields; adjust if you want non-zero defaults | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using flagcxEnvConfig cfg;
memset(&cfg, 0, sizeof(cfg)); // this zeroes commTag and all fields; adjust if you want non-zero defaults
cfg.commTag.tag[0] = '\0'; // Ensure null terminationThere was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using flagcxEnvConfig cfg;
memset(&cfg, 0, sizeof(cfg)); // Zero-initialize cfg to ensure all fields are properly set
Comment on lines
+47
to
+48
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two lines for declaring and then zero-initializing flagcxEnvConfig cfg{}; // this zeroes commTag and all fields; adjust if you want non-zero defaults |
||
|
|
||
| std::string tagStr = "Config " + std::to_string(numCandidate); | ||
| safeStrCopy(cfg.commTag.tag, sizeof(cfg.commTag.tag), tagStr); | ||
|
||
| cfg.envCount = 0; | ||
|
|
||
| // Fill envs | ||
| for (size_t i = 0; i < nvars; ++i) { | ||
| flagcxEnvEntity &ent = cfg.envs[i]; | ||
| // type | ||
| ent.type = FLAGCX_ENV_TYPE_CREATION; | ||
| // name | ||
| safeStrCopy(ent.name, sizeof(ent.name), vars[i].name); | ||
| // value | ||
| const std::string &val = lists[i][idx[i]]; | ||
| safeStrCopy(ent.value, sizeof(ent.value), val); | ||
| // defaultValue | ||
| safeStrCopy(ent.defaultValue, sizeof(ent.defaultValue), vars[i].defaultValue); | ||
|
|
||
| cfg.envCount++; | ||
| // Stop if exceeding the maximum allowed envs (should not happen since oversized variable lists are rejected earlier) | ||
| if (cfg.envCount >= FLAGCX_ENV_LIST_MAX_LENGTH) break; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check
Comment on lines
+73
to
+74
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check |
||
| } | ||
|
|
||
| cfgList.push_back(cfg); | ||
|
|
||
| // Increment counter (from least significant to most significant) | ||
| for (int i = (int)nvars - 1; i >= 0; --i) { | ||
| idx[i]++; | ||
| if (idx[i] < lists[i].size()) break; | ||
| idx[i] = 0; | ||
| if (i == 0) done = true; | ||
| } | ||
| numCandidate += 1; | ||
| } | ||
|
|
||
| return flagcxSuccess; | ||
| } | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,8 +3,56 @@ | |
|
|
||
| #include "tuner.h" // struct flagcxEnvConfig | ||
| #include <vector> | ||
| #include <string> | ||
|
|
||
| // This is a demonstration function that provides a way to load the full config list for a specific GPU. | ||
| flagcxResult_t loadConfigList(std::vector<struct flagcxEnvConfig> &cfgList); | ||
|
|
||
| struct EnvVar { | ||
| std::string name; | ||
| std::vector<std::string> choices; | ||
| std::string defaultValue; | ||
| EnvVar(std::string n="") : name(std::move(n)) {} | ||
| EnvVar(std::string n, std::vector<std::string> c, std::string d = "") | ||
| : name(std::move(n)), choices(std::move(c)), defaultValue(std::move(d)) {} | ||
| }; | ||
|
|
||
| flagcxResult_t generateCandidate(std::vector<struct flagcxEnvConfig> &cfgList); | ||
| std::vector<struct flagcxEnvConfig> generateCandidate(std::vector<EnvVar> vars); | ||
|
||
| static void safeStrCopy(char *dst, size_t dstSize, const std::string &src); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Declaring a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function |
||
|
|
||
| #ifdef USE_NVIDIA_ADAPTOR | ||
|
|
||
| static EnvVar algo( | ||
| "NCCL_ALGO", | ||
| {"ring", "tree"}, | ||
| "ring" | ||
| ); | ||
|
|
||
| static EnvVar proto( | ||
| "NCCL_PROTO", | ||
| {"LL", "LL128", "Simple"}, | ||
| "Simple" | ||
| ); | ||
|
|
||
| static EnvVar thread( | ||
| "NCCL_NTHREADS", | ||
| {"128", "256"}, | ||
| "256" | ||
| ); | ||
|
|
||
| static EnvVar minChannel( | ||
| "NCCL_MIN_NCHANNELS", | ||
| {"16", "32"}, | ||
| "16" | ||
| ); | ||
|
|
||
| static EnvVar chunkSize( | ||
| "NCCL_P2P_NET_CHUNKSIZE", | ||
| {"1024", "2048"}, | ||
| "1024" | ||
| ); | ||
| static std::vector<EnvVar> vars = {algo, proto, thread, minChannel, chunkSize}; | ||
|
|
||
| #endif | ||
|
|
||
| #endif // end include guard | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -8,6 +8,7 @@ | |||||||||||||||
| #include <sstream> | ||||||||||||||||
| #include <string> | ||||||||||||||||
| #include <vector> | ||||||||||||||||
| #include <iostream> | ||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||||||||||
|
|
||||||||||||||||
| // A category of collective operation. the minimal unit for tuning. | ||||||||||||||||
| struct TunerCollCategory { | ||||||||||||||||
|
|
@@ -142,7 +143,9 @@ flagcxResult_t flagcxTunerInit(size_t nRanks, size_t nNodes, | |||||||||||||||
| flagcxDebugLogger_t logFunction, | ||||||||||||||||
| void **context) { | ||||||||||||||||
| struct flagcxTunerContext *ctx = new struct flagcxTunerContext; | ||||||||||||||||
| FLAGCXCHECK(loadConfigList(ctx->configList)); | ||||||||||||||||
| FLAGCXCHECK(generateCandidate(ctx->configList)); | ||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider adding error handling for the
Suggested change
|
||||||||||||||||
| INFO(FLAGCX_TUNING, | ||||||||||||||||
| "Candidate number: %ld.", ctx->configList.size()); | ||||||||||||||||
|
Comment on lines
+147
to
+148
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The log message "Candidate number: %ld." is useful, but consider adding more details about the candidates themselves, such as the range of values being explored. This would provide more insight into the tuning process and help in debugging.
Suggested change
|
||||||||||||||||
| ctx->logger = logFunction; | ||||||||||||||||
| *context = ctx; | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The change from `cudaEventDisableTiming` to `cudaEventDefault` might affect the profiling accuracy, as disabling timing could provide more precise measurements. Verify that enabling timing doesn't introduce significant overhead or inaccuracies in the measurements. If timing is not crucial, keeping `cudaEventDisableTiming` could be beneficial for performance.