pytorch · GregoryComer · Jan 16, 2026 · Jan 16, 2026
@@ -40,7 +40,8 @@ void reg_count(const App& app) {
   uint32_t NITER;
 
   auto bench = [&](uint32_t ngrp, uint32_t nreg) {
-    StagingBuffer buffer(context(), vkapi::kFloat, 1);
+    StagingBuffer buffer(
+        context(), vkapi::kFloat, 1, vkapi::CopyDirection::DEVICE_TO_HOST);
     vkapi::PipelineBarrier pipeline_barrier{};
 
     auto shader_name = "reg_count_" + std::to_string(nreg);
@@ -164,7 +165,11 @@ void warp_size(const App& app, const bool verbose = false) {
   uint32_t NITER;
 
   auto bench = [&](uint32_t nthread) {
-    StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    StagingBuffer out_buf(
+        context(),
+        vkapi::kInt,
+        app.nthread_logic,
+        vkapi::CopyDirection::DEVICE_TO_HOST);
     vkapi::PipelineBarrier pipeline_barrier{};
 
     auto shader_name = "warp_size_physical";
@@ -224,7 +229,11 @@ void warp_size(const App& app, const bool verbose = false) {
   // doesn't depend on kernel timing, so the extra wait time doesn't lead to
   // inaccuracy.
   auto bench_sm = [&](uint32_t nthread) {
-    StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    StagingBuffer out_buf(
+        context(),
+        vkapi::kInt,
+        app.nthread_logic,
+        vkapi::CopyDirection::DEVICE_TO_HOST);
     vkapi::PipelineBarrier pipeline_barrier{};
 
     auto shader_name = "warp_size_scheduler";

@@ -35,8 +35,13 @@ void buf_cacheline_size(const App& app) {
   uint32_t NITER;
 
   auto bench = [&](int stride) {
-    StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
-    StagingBuffer out_buf(context(), vkapi::kFloat, 1);
+    StagingBuffer in_buf(
+        context(),
+        vkapi::kFloat,
+        BUF_SIZE,
+        vkapi::CopyDirection::HOST_TO_DEVICE);
+    StagingBuffer out_buf(
+        context(), vkapi::kFloat, 1, vkapi::CopyDirection::DEVICE_TO_HOST);
     vkapi::PipelineBarrier pipeline_barrier{};
 
     auto shader_name = "buf_cacheline_size";
@@ -132,9 +137,16 @@ void _bandwidth(
     // workgroups, once the size of the access exceeds the workgroup width.
     const uint32_t workgroup_width = local_x * NITER * NUNROLL;
 
-    StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
+    StagingBuffer in_buf(
+        context(),
+        vkapi::kFloat,
+        range / sizeof(float),
+        vkapi::CopyDirection::HOST_TO_DEVICE);
     StagingBuffer out_buf(
-        context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
+        context(),
+        vkapi::kFloat,
+        VEC_WIDTH * app.nthread_logic,
+        vkapi::CopyDirection::DEVICE_TO_HOST);
     vkapi::PipelineBarrier pipeline_barrier{};
 
     auto shader_name = "buf_bandwidth_" + memtype_lower;

@@ -61,7 +61,11 @@ void tex_cacheline_concurr(const App& app) {
       vTensor in_tensor =
           api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
 
-      StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);
+      StagingBuffer out_buf(
+          context(),
+          vkapi::kFloat,
+          TEXEL_WIDTH,
+          vkapi::CopyDirection::DEVICE_TO_HOST);
 
       vkapi::PipelineBarrier pipeline_barrier{};
 
@@ -174,7 +178,10 @@ void tex_bandwidth(const App& app) {
       const uint32_t workgroup_width = local_x * NITER * NUNROLL;
 
       StagingBuffer out_buf(
-          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
+          context(),
+          vkapi::kFloat,
+          VEC_WIDTH * app.nthread_logic,
+          vkapi::CopyDirection::DEVICE_TO_HOST);
       vkapi::PipelineBarrier pipeline_barrier{};
 
       auto time = benchmark_on_gpu(shader_name, 10, [&]() {