-
Notifications
You must be signed in to change notification settings - Fork 242
Open
Description
Hello,
I am trying to build hypre on our cluster with CMake with the following options:
cmake ../src -DBUILD_SHARED_LIBS=ON \
-DHYPRE_ENABLE_PRINT_ERRORS=ON \
-DHYPRE_ENABLE_CUDA=ON \
-DCMAKE_CUDA_ARCHITECTURES='80' \
-DHYPRE_ENABLE_UNIFIED_MEMORY=ON \
-DHYPRE_BUILD_EXAMPLES=ON \
-DHYPRE_BUILD_TESTS=ON \
-DHYPRE_ENABLE_GPU_AWARE_MPI=ON \
-DHYPRE_ENABLE_HYPRE_BLAS=OFF -DHYPRE_ENABLE_HYPRE_LAPACK=OFF \
-DHYPRE_ENABLE_SUPERLU=ON -DCMAKE_PREFIX_PATH=/path/to/my/superlu
make -j8
When I run the ij test, it completes without any errors:
srun --ntasks=1 --gres=gpu:a100:1 --cpus-per-task=18 --mem=64G --time=01:00:00 ./build/test/ij
However, when I run the IJ example code (ex5),
srun --ntasks=1 --gres=gpu:a100:1 --cpus-per-task=18 --mem=64G --time=01:00:00 ./build/example/ex5
I get the following error:
terminate called after throwing an instance of 'thrust::THRUST_200500_800_NS::system::system_error'
what(): after determining tmp storage requirements for exclusive_scan: cudaErrorInvalidDevice: invalid device ordinal
I also created a small sanity-check program to further explore what went wrong:
// hypre_sanity_5x5.cu
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <iostream>
#include <mpi.h>
#include <cuda_runtime.h>
#include "HYPRE.h"
#include "HYPRE_krylov.h"
#include "HYPRE_parcsr_ls.h"
// Print the CUDA runtime's view of the current device, prefixed with `tag`:
// the current device ordinal, the number of visible devices, and the total
// and free memory reported by cudaMemGetInfo (converted to GiB).
//
// Every runtime call is checked. If one fails, the name of the failing call
// and the CUDA error string are written to stderr and nothing is printed to
// stdout, so placeholder values are never mistaken for real measurements.
static void my_cuda_state(const char* tag){
    int dev = -1, ndev = 0;
    size_t free_bytes = 0, total_bytes = 0;

    // Run the three queries in order, stopping at the first failure and
    // remembering which call produced it.
    const char* step = "cudaGetDevice";
    cudaError_t err = cudaGetDevice(&dev);
    if (err == cudaSuccess) {
        step = "cudaGetDeviceCount";
        err = cudaGetDeviceCount(&ndev);
    }
    if (err == cudaSuccess) {
        step = "cudaMemGetInfo";
        err = cudaMemGetInfo(&free_bytes, &total_bytes);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "[%s] %s failed: %s\n", tag, step, cudaGetErrorString(err));
        fflush(stderr);
        return;
    }

    printf("[%s] device=%d of %d, total=%.2f GiB, free=%.2f GiB\n",
           tag, dev, ndev,
           (double)total_bytes/(1ull<<30), (double)free_bytes/(1ull<<30));
    fflush(stdout);
}
// Minimal reproducer: start MPI, print the CUDA runtime's own device report,
// then initialize hypre and print hypre's device report for comparison.
int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);

    int rank = 0;
    int world_size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // ---------------- CUDA: report the runtime's current device ----------------
    // No device is selected explicitly here; this only queries whatever device
    // the CUDA runtime currently reports, before hypre is initialized.
    my_cuda_state("My own Device Info check:");

    // ---------------- HYPRE: initialize, print device info, finalize ----------------
    HYPRE_Initialize();
    std::cout << "HYPRE_PrintDeviceInfo:" << std::endl;
    HYPRE_PrintDeviceInfo();
    HYPRE_Finalize();

    MPI_Finalize();
    return EXIT_SUCCESS;
}
and the output is
[My own Device Info check:] device=0 of 1, total=39.39 GiB, free=38.98 GiB
HYPRE_PrintDeviceInfo:
Running on "NVIDIA A100-SXM4-40GB", major 8, minor 0, total memory 0.00 GiB
MaxSharedMemoryPerBlock 49152, MaxSharedMemoryPerBlockOptin 166912
This confuses me even more: my native CUDA device query detects the GPU and reports a valid amount of device memory, whereas HYPRE_PrintDeviceInfo reports a total memory of 0.00 GiB.
Could someone tell me what I have configured incorrectly here? Thank you very much.
zatkins-dev
Metadata
Metadata
Assignees
Labels
No labels