-
Notifications
You must be signed in to change notification settings - Fork 40
Fix the recv_buffer issue encountered during LLaMA3 training #273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,7 +10,8 @@ flagcxResult_t flagcxAlgoTimeEstimator::getAlgoTime(float *time) { | |
| const char *interServerTopoFile = | ||
| flagcxGetEnv("FLAGCX_INTERSERVER_ROUTE_FILE"); | ||
| if (enableTopoDetect && interServerTopoFile && | ||
| strcmp(enableTopoDetect, "TRUE") == 0) { | ||
| (strcmp(enableTopoDetect, "TRUE") == 0 || | ||
| strcmp(enableTopoDetect, "True") == 0)) { | ||
|
Comment on lines
+13
to
+14
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of checking for "TRUE" and "True" separately, using a case-insensitive comparison would be more robust and cleaner. The `strcasecmp` function can be used for this: `strcasecmp(enableTopoDetect, "TRUE") == 0) {` |
||
| // algo time estimator depends on cluster level topology detection | ||
| float preHomoTime, heteroTime, postHomoTime; | ||
| INFO(FLAGCX_GRAPH, "COST_MODEL: getting time for prehomo funcs"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -216,7 +216,7 @@ static flagcxResult_t flagcxCommInitRankFunc(struct flagcxAsyncJob *job_) { | |||||
| } | ||||||
| FLAGCXCHECK(flagcxNetInit(comm)); | ||||||
| INFO(FLAGCX_INIT, "Using network %s", comm->netAdaptor->name); | ||||||
| if (env && strcmp(env, "TRUE") == 0) { | ||||||
| if (env && (strcmp(env, "TRUE") == 0 || strcmp(env, "True") == 0)) { | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To make the environment variable check more robust, consider using a case-insensitive string comparison. The `strcasecmp` function can be used for this.
Suggested change
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This env is useless and we can remove it. |
||||||
| INFO(FLAGCX_INIT, "getting busId for cudaDev %d", comm->cudaDev); | ||||||
| FLAGCXCHECK(getBusId(comm->cudaDev, &comm->busId)); | ||||||
| INFO(FLAGCX_INIT, "getting commHash for rank %d", comm->rank); | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -537,7 +537,8 @@ flagcxResult_t flagcxGetLocalNetFromGpu(int apu, int *dev, | |
| } | ||
|
|
||
| if (strlen(name) == 0 && enable_topo_detect && | ||
| strcmp(enable_topo_detect, "TRUE") == 0) { | ||
| (strcmp(enable_topo_detect, "TRUE") == 0 || | ||
| strcmp(enable_topo_detect, "True") == 0)) { | ||
|
Comment on lines
+540
to
+541
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A case-insensitive comparison using `strcasecmp` would be more robust here. Suggested change: `(strcmp(enable_topo_detect, "TRUE") == 0 || strcasecmp(enable_topo_detect, "True") == 0)) {` |
||
| FLAGCXCHECK(flagcxTopoGetLocalNet(comm->topoServer, comm->rank, dev)); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -484,9 +484,10 @@ flagcxResult_t flagcxCommInitRank(flagcxComm_t *comm, int nranks, | |
| struct flagcxNicDistance *nicDistanceData; | ||
| FLAGCXCHECK(flagcxCalloc(&nicDistanceData, nranks)); | ||
| const char *enableTopoDetect = flagcxGetEnv("FLAGCX_ENABLE_TOPO_DETECT"); | ||
| if (enableTopoDetect && strcmp(enableTopoDetect, "TRUE") == | ||
| 0) { // safety check nic distance is only | ||
| // available after topo detection | ||
| if (enableTopoDetect && (strcmp(enableTopoDetect, "TRUE") == 0 || | ||
| strcmp(enableTopoDetect, "True") == | ||
| 0)) { // safety check nic distance is only | ||
|
Comment on lines
+487
to
+489
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This logic for checking the environment variable is duplicated in several files. Using a case-insensitive comparison such as `strcasecmp` would simplify it: `if (enableTopoDetect && strcasecmp(enableTopoDetect, "TRUE") == 0) { // safety check nic distance is only` |
||
| // available after topo detection | ||
| FLAGCXCHECK(flagcxGetNicDistance((*comm)->hetero_comm->topoServer, rank, | ||
| nicDistanceData + rank)); | ||
| } else { | ||
|
|
@@ -1762,4 +1763,4 @@ flagcxResult_t flagcxGroupEnd(flagcxComm_t comm) { | |
| } | ||
| } | ||
| return flagcxSuccess; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Try strcasecmp instead