Skip to content

子线程GPU加载模型后主线程重新加载GPU模型崩溃 #6453

@Cat-myq

Description

@Cat-myq

问题描述

针对gpu加载时被阻塞的情况,做了以下改动

  1. 主线程启动子线程,尝试在GPU端加载ncnn模型;
  2. 主线程监控子线程,若超时(判定GPU加载阻塞),则降级到CPU加载模型(此分支逻辑正常,无崩溃);
  3. 若子线程未超时且GPU加载成功,主线程尝试重新在GPU端加载同一模型到主线程的ncnn::Net实例 时,会直接崩溃;
  4. 主线程创建Net对象,然后在子线程完成加载(加载成功),主线程复用也崩溃,只能重新加载,但重新加载仍崩溃;
  5. 不加载模型,只create_gpu_instance; destroy_gpu_instance; 没有问题;
  6. 平台差异:Windows 下上述逻辑完全正常,崩溃发生在Android端。

问题模型

  • 正常yolo11n模型
  • 单层relu模型
  • 单层conv模型

相关代码

class ModelLoader {
private:
    std::atomic<bool> load_finished{false};
    std::atomic<bool> load_success{false};
    int load_timeout_ms_ = 3000; // 3秒超时阈值
    std::unique_ptr<NcnnOptionManager> ncnn_option_manager = std::make_unique<NcnnOptionManager>();

    // 工具函数:写入GPU兼容标记(业务逻辑,不影响崩溃)
    void write_gpu_compat_flag(GPU_STATUS) {}
    void cardsLog(const char* msg) { __android_log_print(ANDROID_LOG_INFO, "ModelLoader", "%s", msg); }

    // 核心加载函数
    bool loadModel(std::string mdPath_p, std::string mdPath_b, ncnn::Net* _Net) {
        if (mdPath_p.empty() || mdPath_b.empty()) { return false; }
        int loadedParam = _Net->load_param(mdPath_p.c_str());
        if (loadedParam != 0) return false;
        int loadedBin = _Net->load_model(mdPath_b.c_str());
        return loadedBin == 0;
    }

    // 加载核心逻辑(区分CPU/GPU)
    LoadStatus load_model_core(const std::string& mdPath_p, 
                               const std::string& mdPath_b, 
                               ncnn::Net* net, 
                               bool cpu_force) {
        net->opt = ncnn_option_manager->configure(!cpu_force); // 非CPU强制则启用GPU
        bool ok = loadModel(mdPath_p, mdPath_b, net);
        return cpu_force ? (ok ? LoadStatus::CHANGE_SUCCESS : LoadStatus::CHANGE_FAILED) 
                         : (ok ? LoadStatus::SUCCESS : LoadStatus::FAILED);
    }

    // GPU加载试探子线程
    void gpu_test_worker(const std::string& mdPath_p, const std::string& mdPath_b) {
        load_finished.store(false, std::memory_order_release);
        load_success.store(false, std::memory_order_release);
        try {
            std::unique_ptr<NcnnOptionManager> opt_mgr(new NcnnOptionManager());
            std::unique_ptr<ncnn::Net> test_net(new ncnn::Net());
            test_net->opt = opt_mgr->configure(true); // 子线程启用GPU加载
            bool test_ok = loadModel(mdPath_p, mdPath_b, test_net.get());
            load_success.store(test_ok, std::memory_order_release);
            test_net.reset(); // 释放子线程Net
        } catch (...) {
            load_success.store(false, std::memory_order_release);
        }
        load_finished.store(true, std::memory_order_release);
    }

public:
    LoadStatus try_gpu_first(const std::string& mdPath_p, 
                            const std::string& mdPath_b, 
                            ncnn::Net* net) {
        load_finished.store(false, std::memory_order_release);
        load_success.store(false, std::memory_order_release);

        // 启动GPU试探子线程
        std::thread test_thread(&ModelLoader::gpu_test_worker, this, mdPath_p, mdPath_b);

        // 主线程超时监控
        bool timeout = false;
        const auto start = std::chrono::steady_clock::now();
        while (true) {
            const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::steady_clock::now() - start
            ).count();

            if (elapsed_ms >= load_timeout_ms_) {
                timeout = true;
                break;
            }
            if (load_finished.load(std::memory_order_acquire)) {
                timeout = false;
                break;
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }

        // 处理试探结果
        LoadStatus status = LoadStatus::FAILED;
        if (timeout) {
            // 超时降级CPU:此分支无崩溃
            write_gpu_compat_flag(GPU_STATUS::BLOCK_NO_GPU);
            if (test_thread.joinable()) test_thread.detach();
            cardsLog("阻塞,降级到CPU!");
            status = load_model_core(mdPath_p, mdPath_b, net, true);
        } else {
            // 未超时:主线程尝试GPU加载 → 此处崩溃
            cardsLog("子线程GPU加载完成,主线程开始GPU加载!");
            if (test_thread.joinable()) test_thread.join();
            const bool gpu_load_ok = load_success.load(std::memory_order_acquire);
            if (gpu_load_ok) {
                cardsLog("GPU加载成功,主线程重新GPU加载模型");
                write_gpu_compat_flag(GPU_STATUS::UNBLOCK_GPU);
                // 崩溃点:调用load_model_core并传入false(启用GPU)
                status = load_model_core(mdPath_p, mdPath_b, net, false);
            } else {
                write_gpu_compat_flag(GPU_STATUS::UNBLOCK_NO_GPU);
                status = load_model_core(mdPath_p, mdPath_b, net, true);
            }
        }
        return status;
    }
};

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions