Move mObservers from a member variable to the heap, to manually control its lifetime

yunruis · yunruis · commit 4b7e1cc69cdf · 2025-10-30T22:37:15.000-07:00
Signed-off-by: yunruis <205571022+yunruis@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp
@@ -179,16 +179,24 @@ class PerCudaCtxPerThreadSingletonCreator
     PerCudaCtxPerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
         : mCreator{std::move(creator)}
         , mDeleter{std::move(deleter)}
+        , mObservers{new std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>()}
     {
     }
+    
+    ~PerCudaCtxPerThreadSingletonCreator()
+    {
+        std::lock_guard<std::mutex> lk{mMutex};
+        delete mObservers;
+        mObservers = nullptr;
+    }
 
     std::shared_ptr<T> operator()()
     {
         std::lock_guard<std::mutex> lk{mMutex};
         CUcontext ctx{getCurrentCudaCtx()};
         std::thread::id thread = std::this_thread::get_id();
         auto const key = std::make_tuple(ctx, thread);
-        std::shared_ptr<T> result = mObservers[key].lock();
+        std::shared_ptr<T> result = (*mObservers)[key].lock();
         if (result == nullptr)
         {
             TLLM_LOG_TRACE("creating singleton instance for CUDA context %lu and thread %lu", ctx, thread);
@@ -202,6 +210,11 @@ class PerCudaCtxPerThreadSingletonCreator
                     }
                     mDeleter(obj);
 
+                    if (mObservers == nullptr)
+                    {
+                        return;
+                    }
+
                     // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
                     // frequently.
                     std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
@@ -210,17 +223,18 @@ class PerCudaCtxPerThreadSingletonCreator
                     // thread just before we lock mMutex. We can't infer that the observer is stale from the fact that
                     // obj is destroyed, because shared_ptr ref-count checking and observer removing are not in one
                     // atomic operation, and the observer may be changed to observe another instance.
-                    if (mObservers.find(key) == mObservers.end())
+                    auto it = mObservers->find(key);
+                    if (it == mObservers->end())
                     {
                         return;
                     }
-                    observedObjHolder = mObservers.at(key).lock();
+                    observedObjHolder = it->second.lock();
                     if (observedObjHolder == nullptr)
                     {
-                        mObservers.erase(key);
+                        mObservers->erase(it);
                     }
                 }};
-            mObservers.at(key) = result;
+            (*mObservers)[key] = result;
         }
         else
         {
@@ -235,7 +249,7 @@ class PerCudaCtxPerThreadSingletonCreator
     mutable std::mutex mMutex;
     // CUDA resources are per-context and per-thread.
     using CacheKey = std::tuple<CUcontext, std::thread::id>;
-    std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>> mObservers;
+    std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>* mObservers;
 };
 
 } // namespace
@@ -253,6 +267,7 @@ std::shared_ptr<cublasHandle_t> getCublasHandle()
         {
             TLLM_CUDA_CHECK(cublasDestroy(*handle));
             delete handle;
+            handle = nullptr;
         });
     return creator();
 }
@@ -270,6 +285,7 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
         {
             TLLM_CUDA_CHECK(cublasLtDestroy(*handle));
             delete handle;
+            handle = nullptr;
         });
     return creator();
 }

Original file line number	Diff line number	Diff line change
`@@ -179,16 +179,24 @@ class PerCudaCtxPerThreadSingletonCreator`
`179`	`179`	`PerCudaCtxPerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter)`
`180`	`180`	`: mCreator{std::move(creator)}`
`181`	`181`	`, mDeleter{std::move(deleter)}`
	`182`	`+ , mObservers{new std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>()}`
`182`	`183`	`{`
`183`	`184`	`}`
	`185`	`+`
	`186`	`+ ~PerCudaCtxPerThreadSingletonCreator()`
	`187`	`+ {`
	`188`	`+ std::lock_guard<std::mutex> lk{mMutex};`
	`189`	`+ delete mObservers;`
	`190`	`+ mObservers = nullptr;`
	`191`	`+ }`
`184`	`192`
`185`	`193`	`std::shared_ptr<T> operator()()`
`186`	`194`	`{`
`187`	`195`	`std::lock_guard<std::mutex> lk{mMutex};`
`188`	`196`	`CUcontext ctx{getCurrentCudaCtx()};`
`189`	`197`	`std::thread::id thread = std::this_thread::get_id();`
`190`	`198`	`auto const key = std::make_tuple(ctx, thread);`
`191`		`- std::shared_ptr<T> result = mObservers[key].lock();`
	`199`	`+ std::shared_ptr<T> result = (*mObservers)[key].lock();`
`192`	`200`	`if (result == nullptr)`
`193`	`201`	`{`
`194`	`202`	`TLLM_LOG_TRACE("creating singleton instance for CUDA context %lu and thread %lu", ctx, thread);`
`@@ -202,6 +210,11 @@ class PerCudaCtxPerThreadSingletonCreator`
`202`	`210`	`}`
`203`	`211`	`mDeleter(obj);`
`204`	`212`
	`213`	`+ if (mObservers == nullptr)`
	`214`	`+ {`
	`215`	`+ return;`
	`216`	`+ }`
	`217`	`+`
`205`	`218`	`// Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts`
`206`	`219`	`// frequently.`
`207`	`220`	`std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.`
`@@ -210,17 +223,18 @@ class PerCudaCtxPerThreadSingletonCreator`
`210`	`223`	`// thread just before we lock mMutex. We can't infer that the observer is stale from the fact that`
`211`	`224`	`// obj is destroyed, because shared_ptr ref-count checking and observer removing are not in one`
`212`	`225`	`// atomic operation, and the observer may be changed to observe another instance.`
`213`		`- if (mObservers.find(key) == mObservers.end())`
	`226`	`+ auto it = mObservers->find(key);`
	`227`	`+ if (it == mObservers->end())`
`214`	`228`	`{`
`215`	`229`	`return;`
`216`	`230`	`}`
`217`		`- observedObjHolder = mObservers.at(key).lock();`
	`231`	`+ observedObjHolder = it->second.lock();`
`218`	`232`	`if (observedObjHolder == nullptr)`
`219`	`233`	`{`
`220`		`- mObservers.erase(key);`
	`234`	`+ mObservers->erase(it);`
`221`	`235`	`}`
`222`	`236`	`}};`
`223`		`- mObservers.at(key) = result;`
	`237`	`+ (*mObservers)[key] = result;`
`224`	`238`	`}`
`225`	`239`	`else`
`226`	`240`	`{`
`@@ -235,7 +249,7 @@ class PerCudaCtxPerThreadSingletonCreator`
`235`	`249`	`mutable std::mutex mMutex;`
`236`	`250`	`// CUDA resources are per-context and per-thread.`
`237`	`251`	`using CacheKey = std::tuple<CUcontext, std::thread::id>;`
`238`		`- std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>> mObservers;`
	`252`	`+ std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>* mObservers;`
`239`	`253`	`};`
`240`	`254`
`241`	`255`	`} // namespace`
`@@ -253,6 +267,7 @@ std::shared_ptr<cublasHandle_t> getCublasHandle()`
`253`	`267`	`{`
`254`	`268`	`TLLM_CUDA_CHECK(cublasDestroy(*handle));`
`255`	`269`	`delete handle;`
	`270`	`+ handle = nullptr;`
`256`	`271`	`});`
`257`	`272`	`return creator();`
`258`	`273`	`}`
`@@ -270,6 +285,7 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()`
`270`	`285`	`{`
`271`	`286`	`TLLM_CUDA_CHECK(cublasLtDestroy(*handle));`
`272`	`287`	`delete handle;`
	`288`	`+ handle = nullptr;`
`273`	`289`	`});`
`274`	`290`	`return creator();`
`275`	`291`	`}`