Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASENOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ __Bug Fixes__:

__API Changes__:

#1521 Add CUDA memory management APIs: `torch.cuda.empty_cache()`, `torch.cuda.memory_allocated()`, `torch.cuda.max_memory_allocated()`, `torch.cuda.reset_peak_memory_stats()`, `torch.cuda.memory_reserved()`, `torch.cuda.max_memory_reserved()`, `torch.cuda.mem_get_info()`, `torch.cuda.set_device()`, `torch.cuda.current_device()`.<br/>
#1503 Add ReadOnlySpan overloads to many methods.<br/>
#1478 Fix `torch.jit.ScriptModule.zero_grad`.<br/>
#1495 Make `torchvision.io.read_image` and `torchvision.io.read_image_async` allow subsequent opening of the file for reading.<br/>
Expand Down
4 changes: 2 additions & 2 deletions build/BranchInfo.props
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<PropertyGroup>
<MajorVersion>0</MajorVersion>
<MinorVersion>106</MinorVersion>
<PatchVersion>0</PatchVersion>
<PreviousPackageVersion>0.105.2</PreviousPackageVersion>
<PatchVersion>1</PatchVersion>
<PreviousPackageVersion>0.106.0</PreviousPackageVersion>
</PropertyGroup>
</Project>
129 changes: 129 additions & 0 deletions src/Native/LibTorchSharp/THSTorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
#include "torch/torch.h"
#include "torch/cuda.h"

#if defined(USE_CUDA)
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAFunctions.h>
#endif

void THSTorch_manual_seed(const int64_t seed)
{
torch::manual_seed(seed);
Expand Down Expand Up @@ -145,6 +151,129 @@ void THSTorchCuda_synchronize(const int64_t device_index)
CATCH(torch::cuda::synchronize(device_index);)
}

#if defined(USE_CUDA)

// Releases all unoccupied cached blocks held by the CUDA caching allocator
// back to the driver (native counterpart of torch.cuda.empty_cache()).
// Errors are captured by CATCH and surfaced to managed code via
// THSTorch_get_and_reset_last_err.
void THSTorchCuda_empty_cache()
{
    CATCH(c10::cuda::CUDACachingAllocator::emptyCache();)
}

size_t THSTorchCuda_memory_allocated(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::currentMemoryAllocated(device);
)
return res;
}

size_t THSTorchCuda_max_memory_allocated(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::maxMemoryAllocated(device);
)
return res;
}

void THSTorchCuda_reset_peak_memory_stats(const int64_t device_index)
{
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
c10::cuda::CUDACachingAllocator::resetPeakStats(device);
)
}

size_t THSTorchCuda_memory_reserved(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::currentMemoryReserved(device);
)
return res;
}

size_t THSTorchCuda_max_memory_reserved(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::maxMemoryReserved(device);
)
return res;
}

void THSTorchCuda_mem_get_info(const int64_t device_index, size_t* free, size_t* total)
{
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
c10::cuda::CUDAGuard guard(device);
C10_CUDA_CHECK(cudaMemGetInfo(free, total));
)
}

// Makes the given device the current CUDA device (torch.cuda.set_device()).
// Unlike the query functions above, a negative index is not remapped here;
// the managed wrapper passes the caller's index through unchanged.
void THSTorchCuda_set_device(const int64_t device_index)
{
    CATCH(c10::cuda::set_device(static_cast<c10::DeviceIndex>(device_index));)
}

// Returns the index of the currently selected CUDA device, or -1 if the
// native call threw (CATCH records the error for the managed side to pick up
// via THSTorch_get_and_reset_last_err).
int64_t THSTorchCuda_current_device()
{
    int64_t res = -1;
    CATCH(res = static_cast<int64_t>(c10::cuda::current_device());)
    return res;
}

#else

// CPU-only build: there is no CUDA caching allocator, so this is a no-op.
void THSTorchCuda_empty_cache()
{
}

// CPU-only build: no CUDA allocator exists, so report zero allocated bytes.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_memory_allocated(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: no CUDA allocator exists, so the peak allocation is zero.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_max_memory_allocated(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: there are no peak statistics to reset, so this is a no-op.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
void THSTorchCuda_reset_peak_memory_stats(const int64_t /* device_index */)
{
}

// CPU-only build: no CUDA allocator exists, so nothing is ever reserved.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_memory_reserved(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: no CUDA allocator exists, so the peak reservation is zero.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_max_memory_reserved(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: no CUDA device exists, so both free and total are reported
// as zero. Callers (the managed wrapper) always supply valid out-pointers.
// The device parameter name is commented out to avoid -Wunused-parameter warnings.
void THSTorchCuda_mem_get_info(const int64_t /* device_index */, size_t* free, size_t* total)
{
    *free = 0;
    *total = 0;
}

// CPU-only build: there is no CUDA device to select, so this is a no-op.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
void THSTorchCuda_set_device(const int64_t /* device_index */)
{
}

// CPU-only build: there is no CUDA device, so report -1 ("no current device"),
// matching the CUDA build's error value.
int64_t THSTorchCuda_current_device()
{
    return -1;
}

#endif


const char * THSTorch_get_and_reset_last_err()
{
Expand Down
10 changes: 10 additions & 0 deletions src/Native/LibTorchSharp/THSTorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ EXPORT_API(int) THSTorchCuda_cudnn_is_available();
EXPORT_API(int) THSTorchCuda_device_count();
EXPORT_API(void) THSTorchCuda_synchronize(const int64_t device);

// CUDA caching-allocator / device-memory management (torch.cuda.* counterparts).
// For the query/reset functions, a negative device index means "current device".
EXPORT_API(void) THSTorchCuda_empty_cache();
EXPORT_API(size_t) THSTorchCuda_memory_allocated(const int64_t device);
EXPORT_API(size_t) THSTorchCuda_max_memory_allocated(const int64_t device);
EXPORT_API(void) THSTorchCuda_reset_peak_memory_stats(const int64_t device);
EXPORT_API(size_t) THSTorchCuda_memory_reserved(const int64_t device);
EXPORT_API(size_t) THSTorchCuda_max_memory_reserved(const int64_t device);
// Writes the free and total device memory (in bytes) through the out-pointers.
EXPORT_API(void) THSTorchCuda_mem_get_info(const int64_t device, size_t* free, size_t* total);
EXPORT_API(void) THSTorchCuda_set_device(const int64_t device);
// Returns the active device index, or -1 on error (CPU-only builds always return -1).
EXPORT_API(int64_t) THSTorchCuda_current_device();

EXPORT_API(bool) THSBackend_cublas_get_allow_tf32();
EXPORT_API(void) THSBackend_cublas_set_allow_tf32(const bool flag);
EXPORT_API(bool) THSBackend_cudnn_get_allow_tf32();
Expand Down
27 changes: 27 additions & 0 deletions src/TorchSharp/PInvoke/LibTorchSharp.THSTorchCuda.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,32 @@ internal static partial class NativeMethods

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_synchronize(long device_index);

// Native CUDA memory-management entry points (see src/Native/LibTorchSharp/THSTorch.h).
// A device_index of -1 tells the native layer to use the current CUDA device.

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_empty_cache();

// Bytes currently occupied by live tensors on the device.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_memory_allocated(long device_index);

// Peak bytes occupied by tensors since program start / last peak reset.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_max_memory_allocated(long device_index);

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_reset_peak_memory_stats(long device_index);

// Bytes currently reserved (cached) by the caching allocator on the device.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_memory_reserved(long device_index);

// Peak bytes reserved by the caching allocator since program start / last peak reset.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_max_memory_reserved(long device_index);

// Free and total device memory in bytes (via cudaMemGetInfo on the native side).
[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_mem_get_info(long device_index, out ulong free, out ulong total);

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_set_device(long device_index);

// Returns the active device index, or -1 on error / CPU-only builds.
[DllImport("LibTorchSharp")]
internal static extern long THSTorchCuda_current_device();
}
}
118 changes: 118 additions & 0 deletions src/TorchSharp/Torch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,124 @@ public static void synchronize(Device? device = null)
TryInitializeDeviceType(device?.type ?? DeviceType.CUDA);
THSTorchCuda_synchronize(device?.index ?? -1);
}

/// <summary>
/// Releases all unoccupied cached memory currently held by the caching allocator
/// so that those can be used in other GPU applications and visible in nvidia-smi.
/// </summary>
/// <remarks>
/// empty_cache() doesn't increase the amount of GPU memory available for TorchSharp.
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The documentation refers to "PyTorch" but this is TorchSharp. Update this to say "doesn't increase the amount of GPU memory available for TorchSharp" to accurately reflect the library being documented.

Suggested change
/// empty_cache() doesn't increase the amount of GPU memory available for PyTorch.
/// empty_cache() doesn't increase the amount of GPU memory available for TorchSharp.

Copilot uses AI. Check for mistakes.
/// It only frees the memory that is cached by the allocator but not currently used by any tensor.
/// </remarks>
public static void empty_cache()
{
    // Make sure the CUDA backend is loaded before touching the native allocator.
    TryInitializeDeviceType(DeviceType.CUDA);
    // Native call releases cached, unoccupied allocator blocks back to the driver.
    THSTorchCuda_empty_cache();
    // Surface any error the native CATCH macro recorded as a managed exception.
    CheckForErrors();
}

/// <summary>
/// Returns the current GPU memory occupied by tensors in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The amount of memory in bytes.</returns>
public static long memory_allocated(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong bytes = THSTorchCuda_memory_allocated(deviceIndex);
    CheckForErrors();
    return (long)bytes;
}

/// <summary>
/// Returns the maximum GPU memory occupied by tensors in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The peak amount of memory in bytes.</returns>
/// <remarks>
/// The peak is tracked from the start of the program; call reset_peak_memory_stats()
/// to restart the tracking window.
/// </remarks>
public static long max_memory_allocated(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong peakBytes = THSTorchCuda_max_memory_allocated(deviceIndex);
    CheckForErrors();
    return (long)peakBytes;
}

/// <summary>
/// Resets the starting point in tracking maximum GPU memory occupied by tensors for the given device.
/// </summary>
/// <param name="device">Selected device. Resets statistic for the current device, given by current_device(), if device is null.</param>
public static void reset_peak_memory_stats(Device? device = null)
{
    // Make sure the CUDA backend is loaded before calling into native code.
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    THSTorchCuda_reset_peak_memory_stats(device?.index ?? -1);
    // Surface any native error as a managed exception.
    CheckForErrors();
}

/// <summary>
/// Returns the current GPU memory managed by the caching allocator in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The amount of reserved memory in bytes.</returns>
public static long memory_reserved(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong reservedBytes = THSTorchCuda_memory_reserved(deviceIndex);
    CheckForErrors();
    return (long)reservedBytes;
}

/// <summary>
/// Returns the maximum GPU memory managed by the caching allocator in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The peak amount of reserved memory in bytes.</returns>
public static long max_memory_reserved(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong peakReserved = THSTorchCuda_max_memory_reserved(deviceIndex);
    CheckForErrors();
    return (long)peakReserved;
}

/// <summary>
/// Returns the free and total memory on the CUDA device using cudaMemGetInfo.
/// </summary>
/// <param name="device">Selected device. When null, the current device (see current_device()) is queried.</param>
/// <returns>A tuple of (free, total) memory in bytes.</returns>
public static (long free, long total) mem_get_info(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    THSTorchCuda_mem_get_info(deviceIndex, out ulong freeBytes, out ulong totalBytes);
    CheckForErrors();
    return ((long)freeBytes, (long)totalBytes);
}

/// <summary>
/// Sets the current CUDA device.
/// </summary>
/// <param name="device">Selected device index.</param>
public static void set_device(int device)
{
    // Make sure the CUDA backend is loaded before calling into native code.
    TryInitializeDeviceType(DeviceType.CUDA);
    // The index is passed through unchanged; the native side does not remap
    // negative values for this call.
    THSTorchCuda_set_device(device);
    // Surface any native error as a managed exception.
    CheckForErrors();
}

/// <summary>
/// Returns the index of the currently selected CUDA device.
/// </summary>
/// <returns>The device index.</returns>
public static int current_device()
{
    // Make sure the CUDA backend is loaded before calling into native code.
    TryInitializeDeviceType(DeviceType.CUDA);
    // Native side returns -1 on failure (or on CPU-only builds); any recorded
    // error is surfaced by CheckForErrors() before the value is returned.
    var result = THSTorchCuda_current_device();
    CheckForErrors();
    return (int)result;
}
}

/// <summary>
Expand Down
Loading
Loading