Skip to content

Commit c51e373

Browse files
committed
Initial code for getXferTelemetry API.
1 parent d8a5721 commit c51e373

File tree

4 files changed

+105
-16
lines changed

4 files changed

+105
-16
lines changed

src/api/cpp/nixl.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,18 @@ class nixlAgent {
289289
nixl_status_t
290290
getXferStatus (nixlXferReqH* req_hndl) const;
291291

292+
293+
/**
294+
* @brief Get the telemetry data associated with `req_hndl`.
295+
*
296+
* @param req_hndl Transfer request handle obtained from makeXferReq/createXferReq
297+
* @param telemetry [out] Output telemetry information
298+
* @return nixl_status_t Error code if call was not successful
299+
*/
300+
nixl_status_t
301+
getXferTelemetry (const nixlXferReqH* req_hndl,
302+
nixl_xfer_telem_t &telemetry) const;
303+
292304
/**
293305
* @brief Query the backend associated with `req_hndl`. E.g., if for genNotif
294306
* the same backend as a transfer is desired.

src/api/cpp/nixl_types.h

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <string>
2121
#include <unordered_map>
2222
#include <optional>
23+
#include <chrono>
2324

2425

2526
/*** Forward declarations ***/
@@ -62,7 +63,8 @@ enum nixl_status_t {
6263
NIXL_ERR_UNKNOWN = -8,
6364
NIXL_ERR_NOT_SUPPORTED = -9,
6465
NIXL_ERR_REMOTE_DISCONNECT = -10,
65-
NIXL_ERR_CANCELED = -11
66+
NIXL_ERR_CANCELED = -11,
67+
NIXL_ERR_NO_TELEMETRY = -12
6668
};
6769

6870
/**
@@ -223,6 +225,65 @@ struct nixlAgentOptionalArgs {
223225
*/
224226
using nixl_opt_args_t = nixlAgentOptionalArgs;
225227

228+
/**
229+
* @brief A typedefs for a point in time
230+
*/
231+
using chrono_point_t = std::chrono::steady_clock::time_point;
232+
233+
/**
234+
* @brief A constant indicating min chrono_point_t value
235+
*/
236+
constexpr auto min_chrono_time = std::chrono::steady_clock::time_point::min();
237+
238+
/**
239+
* @brief A typedefs for a period of time
240+
*/
241+
using chrono_period_t = std::chrono::microseconds;
242+
243+
/**
244+
* @struct nixlXferTelemetry
245+
* @brief A structure for telemetry output from agent API
246+
*/
247+
struct nixlXferTelemetry {
248+
/**
249+
* @var backendType Type of the backend performing the transfer
250+
*/
251+
nixl_backend_t backendType;
252+
253+
/**
254+
* @var startTime Time that the transfer was posted
255+
*/
256+
chrono_point_t startTime;
257+
258+
/**
259+
* @var postDuration Time it took to do the post operation
260+
*/
261+
chrono_period_t postDuration;
262+
263+
/**
264+
* @var xferDuration Time it took to complete the transfer
265+
* if checkXferReq is called late, that might impact this result
266+
*/
267+
chrono_period_t xferDuration;
268+
269+
/**
270+
* @var totalBytes Amount of bytes transfered in the request
271+
*/
272+
size_t totalBytes;
273+
274+
/**
275+
* @var descCount Number of descriptors in the transfer request.
276+
* If any merging of descriptors were performed, it will be reflected here.
277+
*/
278+
size_t descCount;
279+
};
280+
281+
/**
282+
* @brief A typedef for a nixlXferTelemetry
283+
* for telemetry output.
284+
*/
285+
using nixl_xfer_telem_t = nixlXferTelemetry;
286+
226287
/**
227288
* @brief A define for an empty string, that indicates the descriptor list is being
228289
* prepared for the local agent as an initiator in prepXferDlist method.

src/core/nixl_agent.cpp

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -89,24 +89,24 @@ nixlXferReqH::updateRequestStats(std::unique_ptr<nixlTelemetry> &telemetry_pub,
8989
static const std::array<std::string, 3> nixl_post_status_str = {
9090
" Posted", " Posted and Completed", " Completed"};
9191
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
92-
std::chrono::steady_clock::now() - telemetry.startTime);
92+
std::chrono::steady_clock::now() - telemetry.startTime_);
9393

9494
if (stat_status == NIXL_TELEMETRY_POST) {
9595
telemetry.postDuration_ = duration;
9696
} else if (stat_status == NIXL_TELEMETRY_POST_AND_FINISH) {
9797
telemetry.postDuration_ = duration;
9898
telemetry.xferDuration_ = duration;
9999
telemetry_pub->addPostTime(duration);
100-
telemetry_pub->addXferTime(duration, backendOp == NIXL_WRITE, telemetry.totalBytes);
100+
telemetry_pub->addXferTime(duration, backendOp == NIXL_WRITE, telemetry.totalBytes_);
101101
} else { // stat_status == NIXL_TELEMETRY_FINISH
102102
telemetry.xferDuration_ = duration;
103103
telemetry_pub->addPostTime(telemetry.postDuration_);
104-
telemetry_pub->addXferTime(duration, backendOp == NIXL_WRITE, telemetry.totalBytes);
104+
telemetry_pub->addXferTime(duration, backendOp == NIXL_WRITE, telemetry.totalBytes_);
105105
}
106106

107107
NIXL_TRACE << "[NIXL TELEMETRY]: From backend " << engine->getType()
108108
<< nixl_post_status_str[stat_status] << " Xfer with " << initiatorDescs->descCount()
109-
<< " descriptors of total size " << telemetry.totalBytes << "B in "
109+
<< " descriptors of total size " << telemetry.totalBytes_ << "B in "
110110
<< duration.count() << "us.";
111111
}
112112

@@ -751,7 +751,7 @@ nixlAgent::makeXferReq (const nixl_xfer_op_t &operation,
751751
handle->hasNotif = opt_args.hasNotif;
752752
handle->backendOp = operation;
753753
handle->status = NIXL_ERR_NOT_POSTED;
754-
handle->telemetry.totalBytes = total_bytes;
754+
handle->telemetry.totalBytes_ = total_bytes;
755755

756756
ret = handle->engine->prepXfer (handle->backendOp,
757757
*handle->initiatorDescs,
@@ -877,7 +877,7 @@ nixlAgent::createXferReq(const nixl_xfer_op_t &operation,
877877
handle->status = NIXL_ERR_NOT_POSTED;
878878
handle->notifMsg = opt_args.notifMsg;
879879
handle->hasNotif = opt_args.hasNotif;
880-
handle->telemetry.totalBytes = total_bytes;
880+
handle->telemetry.totalBytes_ = total_bytes;
881881

882882
ret1 = handle->engine->prepXfer (handle->backendOp,
883883
*handle->initiatorDescs,
@@ -941,7 +941,7 @@ nixlAgent::postXferReq(nixlXferReqH *req_hndl,
941941
return NIXL_ERR_INVALID_PARAM;
942942
}
943943

944-
if (data->telemetry_) req_hndl->telemetry.startTime = std::chrono::steady_clock::now();
944+
if (data->telemetry_) req_hndl->telemetry.startTime_ = std::chrono::steady_clock::now();
945945

946946
NIXL_SHARED_LOCK_GUARD(data->lock);
947947
// Check if the remote was invalidated before post/repost
@@ -1049,6 +1049,25 @@ nixlAgent::getXferStatus (nixlXferReqH *req_hndl) const {
10491049
return req_hndl->status;
10501050
}
10511051

1052+
nixl_status_t
1053+
nixlAgent::getXferTelemetry (const nixlXferReqH* req_hndl,
1054+
nixl_xfer_telem_t &telemetry) const{
1055+
if (!data->telemetry_)
1056+
return NIXL_ERR_NO_TELEMETRY;
1057+
1058+
if (req_hndl->status <0)
1059+
return req_hndl->status;
1060+
1061+
// NIXL_SUCCESS or NIXL_IN_PROG, values are initialized
1062+
telemetry.backendType = req_hndl->engine->getType();
1063+
telemetry.startTime = req_hndl->telemetry.startTime_;
1064+
telemetry.postDuration = req_hndl->telemetry.postDuration_;
1065+
telemetry.xferDuration = req_hndl->telemetry.xferDuration_;
1066+
telemetry.totalBytes = req_hndl->telemetry.totalBytes_;
1067+
telemetry.descCount = req_hndl->initiatorDescs->descCount();
1068+
1069+
return req_hndl->status;
1070+
}
10521071

10531072
nixl_status_t
10541073
nixlAgent::queryXferBackend(const nixlXferReqH* req_hndl,

src/core/transfer_request.h

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#ifndef __TRANSFER_REQUEST_H_
1818
#define __TRANSFER_REQUEST_H_
1919

20-
#include <chrono>
2120
#include <string>
2221
#include <unordered_map>
2322
#include <memory>
@@ -26,9 +25,6 @@
2625
#include "backend_engine.h"
2726
#include "telemetry.h"
2827

29-
using chrono_point_t = std::chrono::steady_clock::time_point;
30-
using std::chrono::microseconds;
31-
3228
enum nixl_telemetry_stat_status_t {
3329
NIXL_TELEMETRY_POST = 0,
3430
NIXL_TELEMETRY_POST_AND_FINISH = 1,
@@ -52,11 +48,12 @@ class nixlXferReqH {
5248
nixl_xfer_op_t backendOp;
5349
nixl_status_t status;
5450

51+
// Initializing time values, as getXferTelem might be called before postXfer.
5552
struct {
56-
chrono_point_t startTime;
57-
microseconds postDuration_ = microseconds(0);
58-
microseconds xferDuration_ = microseconds(0);
59-
size_t totalBytes;
53+
chrono_point_t startTime_ = min_chrono_time;
54+
chrono_period_t postDuration_ = chrono_period_t(0);
55+
chrono_period_t xferDuration_ = chrono_period_t(0);
56+
size_t totalBytes_;
6057
} telemetry;
6158

6259
public:

0 commit comments

Comments
 (0)