Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions cpp/tensorrt_llm/common/opUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,24 @@ class PerCudaCtxPerThreadSingletonCreator
PerCudaCtxPerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
: mCreator{std::move(creator)}
, mDeleter{std::move(deleter)}
, mObservers{new std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>()}
{
}

~PerCudaCtxPerThreadSingletonCreator()
{
std::lock_guard<std::mutex> lk{mMutex};
delete mObservers;
mObservers = nullptr;
}

std::shared_ptr<T> operator()()
{
std::lock_guard<std::mutex> lk{mMutex};
CUcontext ctx{getCurrentCudaCtx()};
std::thread::id thread = std::this_thread::get_id();
auto const key = std::make_tuple(ctx, thread);
std::shared_ptr<T> result = mObservers[key].lock();
std::shared_ptr<T> result = (*mObservers)[key].lock();
if (result == nullptr)
{
TLLM_LOG_TRACE("creating singleton instance for CUDA context %lu and thread %lu", ctx, thread);
Expand All @@ -202,6 +210,11 @@ class PerCudaCtxPerThreadSingletonCreator
}
mDeleter(obj);

if (mObservers == nullptr)
{
return;
}

// Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
// frequently.
std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
Expand All @@ -210,17 +223,18 @@ class PerCudaCtxPerThreadSingletonCreator
// thread just before we lock mMutex. We can't infer that the observer is stale from the fact that
// obj is destroyed, because shared_ptr ref-count checking and observer removing are not in one
// atomic operation, and the observer may be changed to observe another instance.
if (mObservers.find(key) == mObservers.end())
auto it = mObservers->find(key);
if (it == mObservers->end())
{
return;
}
observedObjHolder = mObservers.at(key).lock();
observedObjHolder = it->second.lock();
if (observedObjHolder == nullptr)
{
mObservers.erase(key);
mObservers->erase(it);
}
}};
mObservers.at(key) = result;
(*mObservers)[key] = result;
}
else
{
Expand All @@ -235,7 +249,7 @@ class PerCudaCtxPerThreadSingletonCreator
mutable std::mutex mMutex;
// CUDA resources are per-context and per-thread.
using CacheKey = std::tuple<CUcontext, std::thread::id>;
std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>> mObservers;
std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>* mObservers;
};

} // namespace
Expand All @@ -253,6 +267,7 @@ std::shared_ptr<cublasHandle_t> getCublasHandle()
{
TLLM_CUDA_CHECK(cublasDestroy(*handle));
delete handle;
handle = nullptr;
});
return creator();
}
Expand All @@ -270,6 +285,7 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
{
TLLM_CUDA_CHECK(cublasLtDestroy(*handle));
delete handle;
handle = nullptr;
});
return creator();
}
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,5 @@ triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-T
accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/5606233)
examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233)
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] SKIP (https://nvbugs/5606266)
examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5606268)
disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5626197)
disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5628952)