Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASENOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ __Bug Fixes__:

__API Changes__:

#1521 Add CUDA memory management APIs: `torch.cuda.empty_cache()`, `torch.cuda.memory_allocated()`, `torch.cuda.max_memory_allocated()`, `torch.cuda.reset_peak_memory_stats()`, `torch.cuda.memory_reserved()`, `torch.cuda.max_memory_reserved()`, `torch.cuda.mem_get_info()`, `torch.cuda.set_device()`, `torch.cuda.current_device()`.<br/>
#1503 Add ReadOnlySpan overloads to many methods.<br/>
#1478 Fix `torch.jit.ScriptModule.zero_grad`.<br/>
#1495 Make `torchvision.io.read_image` and `torchvision.io.read_image_async` allow subsequent opening of the file for reading.<br/>
Expand Down
4 changes: 2 additions & 2 deletions build/BranchInfo.props
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<PropertyGroup>
<MajorVersion>0</MajorVersion>
<MinorVersion>106</MinorVersion>
<PatchVersion>0</PatchVersion>
<PreviousPackageVersion>0.105.2</PreviousPackageVersion>
<PatchVersion>1</PatchVersion>
<PreviousPackageVersion>0.106.0</PreviousPackageVersion>
</PropertyGroup>
</Project>
129 changes: 129 additions & 0 deletions src/Native/LibTorchSharp/THSTorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
#include "torch/torch.h"
#include "torch/cuda.h"

#if defined(USE_CUDA)
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAFunctions.h>
#endif

void THSTorch_manual_seed(const int64_t seed)
{
torch::manual_seed(seed);
Expand Down Expand Up @@ -145,6 +151,129 @@ void THSTorchCuda_synchronize(const int64_t device_index)
CATCH(torch::cuda::synchronize(device_index);)
}

#if defined(USE_CUDA)

// Releases all unoccupied cached blocks held by the CUDA caching allocator
// back to the driver (native counterpart of torch.cuda.empty_cache()).
// Errors are captured by CATCH and surfaced to managed code via
// THSTorch_get_and_reset_last_err.
void THSTorchCuda_empty_cache()
{
    CATCH(c10::cuda::CUDACachingAllocator::emptyCache();)
}

size_t THSTorchCuda_memory_allocated(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::currentMemoryAllocated(device);
)
return res;
}

size_t THSTorchCuda_max_memory_allocated(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::maxMemoryAllocated(device);
)
return res;
}

void THSTorchCuda_reset_peak_memory_stats(const int64_t device_index)
{
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
c10::cuda::CUDACachingAllocator::resetPeakStats(device);
)
}

size_t THSTorchCuda_memory_reserved(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::currentMemoryReserved(device);
)
return res;
}

size_t THSTorchCuda_max_memory_reserved(const int64_t device_index)
{
size_t res = 0;
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
res = c10::cuda::CUDACachingAllocator::maxMemoryReserved(device);
)
return res;
}

void THSTorchCuda_mem_get_info(const int64_t device_index, size_t* free, size_t* total)
{
CATCH(
auto device = device_index < 0 ? c10::cuda::current_device() : static_cast<c10::DeviceIndex>(device_index);
c10::cuda::CUDAGuard guard(device);
C10_CUDA_CHECK(cudaMemGetInfo(free, total));
)
}

// Makes the given device the current CUDA device (torch.cuda.set_device()).
// Unlike the query functions above, a negative index is not remapped here;
// the managed wrapper passes the caller's index through unchanged.
void THSTorchCuda_set_device(const int64_t device_index)
{
    CATCH(c10::cuda::set_device(static_cast<c10::DeviceIndex>(device_index));)
}

// Returns the index of the currently selected CUDA device, or -1 if the
// native call threw (CATCH records the error for the managed side to pick up
// via THSTorch_get_and_reset_last_err).
int64_t THSTorchCuda_current_device()
{
    int64_t res = -1;
    CATCH(res = static_cast<int64_t>(c10::cuda::current_device());)
    return res;
}

#else

// CPU-only build: there is no CUDA caching allocator, so this is a no-op.
void THSTorchCuda_empty_cache()
{
}

// CPU-only build: no CUDA allocator exists, so report zero allocated bytes.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_memory_allocated(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: no CUDA allocator exists, so the peak allocation is zero.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_max_memory_allocated(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: there are no peak statistics to reset, so this is a no-op.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
void THSTorchCuda_reset_peak_memory_stats(const int64_t /* device_index */)
{
}

// CPU-only build: no CUDA allocator exists, so nothing is ever reserved.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_memory_reserved(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: no CUDA allocator exists, so the peak reservation is zero.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
size_t THSTorchCuda_max_memory_reserved(const int64_t /* device_index */)
{
    return 0;
}

// CPU-only build: no CUDA device exists, so both free and total are reported
// as zero. Callers (the managed wrapper) always supply valid out-pointers.
// The device parameter name is commented out to avoid -Wunused-parameter warnings.
void THSTorchCuda_mem_get_info(const int64_t /* device_index */, size_t* free, size_t* total)
{
    *free = 0;
    *total = 0;
}

// CPU-only build: there is no CUDA device to select, so this is a no-op.
// The parameter name is commented out to avoid -Wunused-parameter warnings.
void THSTorchCuda_set_device(const int64_t /* device_index */)
{
}

// CPU-only build: there is no CUDA device, so report -1 ("no current device"),
// matching the CUDA build's error value.
int64_t THSTorchCuda_current_device()
{
    return -1;
}

#endif


const char * THSTorch_get_and_reset_last_err()
{
Expand Down
10 changes: 10 additions & 0 deletions src/Native/LibTorchSharp/THSTorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ EXPORT_API(int) THSTorchCuda_cudnn_is_available();
EXPORT_API(int) THSTorchCuda_device_count();
EXPORT_API(void) THSTorchCuda_synchronize(const int64_t device);

// CUDA caching-allocator / device-memory management (torch.cuda.* counterparts).
// For the query/reset functions, a negative device index means "current device".
EXPORT_API(void) THSTorchCuda_empty_cache();
EXPORT_API(size_t) THSTorchCuda_memory_allocated(const int64_t device);
EXPORT_API(size_t) THSTorchCuda_max_memory_allocated(const int64_t device);
EXPORT_API(void) THSTorchCuda_reset_peak_memory_stats(const int64_t device);
EXPORT_API(size_t) THSTorchCuda_memory_reserved(const int64_t device);
EXPORT_API(size_t) THSTorchCuda_max_memory_reserved(const int64_t device);
// Writes the free and total device memory (in bytes) through the out-pointers.
EXPORT_API(void) THSTorchCuda_mem_get_info(const int64_t device, size_t* free, size_t* total);
EXPORT_API(void) THSTorchCuda_set_device(const int64_t device);
// Returns the active device index, or -1 on error (CPU-only builds always return -1).
EXPORT_API(int64_t) THSTorchCuda_current_device();

EXPORT_API(bool) THSBackend_cublas_get_allow_tf32();
EXPORT_API(void) THSBackend_cublas_set_allow_tf32(const bool flag);
EXPORT_API(bool) THSBackend_cudnn_get_allow_tf32();
Expand Down
27 changes: 27 additions & 0 deletions src/TorchSharp/PInvoke/LibTorchSharp.THSTorchCuda.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,32 @@ internal static partial class NativeMethods

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_synchronize(long device_index);

// Native CUDA memory-management entry points (see src/Native/LibTorchSharp/THSTorch.h).
// A device_index of -1 tells the native layer to use the current CUDA device.

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_empty_cache();

// Bytes currently occupied by live tensors on the device.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_memory_allocated(long device_index);

// Peak bytes occupied by tensors since program start / last peak reset.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_max_memory_allocated(long device_index);

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_reset_peak_memory_stats(long device_index);

// Bytes currently reserved (cached) by the caching allocator on the device.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_memory_reserved(long device_index);

// Peak bytes reserved by the caching allocator since program start / last peak reset.
[DllImport("LibTorchSharp")]
internal static extern ulong THSTorchCuda_max_memory_reserved(long device_index);

// Free and total device memory in bytes (via cudaMemGetInfo on the native side).
[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_mem_get_info(long device_index, out ulong free, out ulong total);

[DllImport("LibTorchSharp")]
internal static extern void THSTorchCuda_set_device(long device_index);

// Returns the active device index, or -1 on error / CPU-only builds.
[DllImport("LibTorchSharp")]
internal static extern long THSTorchCuda_current_device();
}
}
118 changes: 118 additions & 0 deletions src/TorchSharp/Torch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,124 @@ public static void synchronize(Device? device = null)
TryInitializeDeviceType(device?.type ?? DeviceType.CUDA);
THSTorchCuda_synchronize(device?.index ?? -1);
}

/// <summary>
/// Releases all unoccupied cached memory currently held by the caching allocator
/// so that those can be used in other GPU applications and visible in nvidia-smi.
/// </summary>
/// <remarks>
/// empty_cache() doesn't increase the amount of GPU memory available for TorchSharp.
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The documentation refers to "PyTorch" but this is TorchSharp. Update this to say "doesn't increase the amount of GPU memory available for TorchSharp" to accurately reflect the library being documented.

Suggested change
/// empty_cache() doesn't increase the amount of GPU memory available for PyTorch.
/// empty_cache() doesn't increase the amount of GPU memory available for TorchSharp.

Copilot uses AI. Check for mistakes.
/// It only frees the memory that is cached by the allocator but not currently used by any tensor.
/// </remarks>
public static void empty_cache()
{
    // Make sure the CUDA backend is loaded before touching the native allocator.
    TryInitializeDeviceType(DeviceType.CUDA);
    // Native call releases cached, unoccupied allocator blocks back to the driver.
    THSTorchCuda_empty_cache();
    // Surface any error the native CATCH macro recorded as a managed exception.
    CheckForErrors();
}

/// <summary>
/// Returns the current GPU memory occupied by tensors in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The amount of memory in bytes.</returns>
public static long memory_allocated(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong bytes = THSTorchCuda_memory_allocated(deviceIndex);
    CheckForErrors();
    return (long)bytes;
}

/// <summary>
/// Returns the maximum GPU memory occupied by tensors in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The peak amount of memory in bytes.</returns>
/// <remarks>
/// The peak is tracked from the start of the program; call reset_peak_memory_stats()
/// to restart the tracking window.
/// </remarks>
public static long max_memory_allocated(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong peakBytes = THSTorchCuda_max_memory_allocated(deviceIndex);
    CheckForErrors();
    return (long)peakBytes;
}

/// <summary>
/// Resets the starting point in tracking maximum GPU memory occupied by tensors for the given device.
/// </summary>
/// <param name="device">Selected device. Resets statistic for the current device, given by current_device(), if device is null.</param>
public static void reset_peak_memory_stats(Device? device = null)
{
    // Make sure the CUDA backend is loaded before calling into native code.
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    THSTorchCuda_reset_peak_memory_stats(device?.index ?? -1);
    // Surface any native error as a managed exception.
    CheckForErrors();
}

/// <summary>
/// Returns the current GPU memory managed by the caching allocator in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The amount of reserved memory in bytes.</returns>
public static long memory_reserved(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong reservedBytes = THSTorchCuda_memory_reserved(deviceIndex);
    CheckForErrors();
    return (long)reservedBytes;
}

/// <summary>
/// Returns the maximum GPU memory managed by the caching allocator in bytes for the given device.
/// </summary>
/// <param name="device">Selected device. When null, the statistic for the current device (see current_device()) is reported.</param>
/// <returns>The peak amount of reserved memory in bytes.</returns>
public static long max_memory_reserved(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    ulong peakReserved = THSTorchCuda_max_memory_reserved(deviceIndex);
    CheckForErrors();
    return (long)peakReserved;
}

/// <summary>
/// Returns the free and total memory on the CUDA device using cudaMemGetInfo.
/// </summary>
/// <param name="device">Selected device. When null, the current device (see current_device()) is queried.</param>
/// <returns>A tuple of (free, total) memory in bytes.</returns>
public static (long free, long total) mem_get_info(Device? device = null)
{
    TryInitializeDeviceType(DeviceType.CUDA);
    // -1 tells the native layer to use the current device.
    long deviceIndex = device?.index ?? -1;
    THSTorchCuda_mem_get_info(deviceIndex, out ulong freeBytes, out ulong totalBytes);
    CheckForErrors();
    return ((long)freeBytes, (long)totalBytes);
}

/// <summary>
/// Sets the current CUDA device.
/// </summary>
/// <param name="device">Selected device index.</param>
public static void set_device(int device)
{
    // Make sure the CUDA backend is loaded before calling into native code.
    TryInitializeDeviceType(DeviceType.CUDA);
    // The index is passed through unchanged; the native side does not remap
    // negative values for this call.
    THSTorchCuda_set_device(device);
    // Surface any native error as a managed exception.
    CheckForErrors();
}

/// <summary>
/// Returns the index of the currently selected CUDA device.
/// </summary>
/// <returns>The device index.</returns>
public static int current_device()
{
    // Make sure the CUDA backend is loaded before calling into native code.
    TryInitializeDeviceType(DeviceType.CUDA);
    // Native side returns -1 on failure (or on CPU-only builds); any recorded
    // error is surfaced by CheckForErrors() before the value is returned.
    var result = THSTorchCuda_current_device();
    CheckForErrors();
    return (int)result;
}
}

/// <summary>
Expand Down
Loading
Loading