add error checks for all the CUDA APIs

NCAR · Jun 5, 2024 · b9bfca3 · b9bfca3
1 parent 601a3fc
commit b9bfca3
Show file tree

Hide file tree

Showing 8 changed files with 81 additions and 88 deletions.
diff --git a/include/micm/util/cuda_dense_matrix.hpp b/include/micm/util/cuda_dense_matrix.hpp
@@ -13,9 +13,6 @@
 
 #include <type_traits>
 
-#define CHECK_CUDA_ERROR(err, msg)   micm::cuda::CheckCudaError(err, __FILE__, __LINE__, msg)
-#define CHECK_CUBLAS_ERROR(err, msg) micm::cuda::CheckCublasError(err, __FILE__, __LINE__, msg)
-
 namespace micm
 {
   /**

diff --git a/include/micm/util/cuda_matrix.cuh b/include/micm/util/cuda_matrix.cuh
@@ -12,6 +12,9 @@
 #include <string>
 #include <vector>
 
+#define CHECK_CUDA_ERROR(err, msg)   micm::cuda::CheckCudaError(err, __FILE__, __LINE__, msg)
+#define CHECK_CUBLAS_ERROR(err, msg) micm::cuda::CheckCublasError(err, __FILE__, __LINE__, msg)
+
 namespace micm
 {
   namespace cuda

diff --git a/include/micm/util/cuda_sparse_matrix.hpp b/include/micm/util/cuda_sparse_matrix.hpp
@@ -13,8 +13,6 @@
 
 #include <type_traits>
 
-#define CHECK_CUDA_ERROR(err, msg) micm::cuda::CheckCudaError(err, __FILE__, __LINE__, msg)
-
 namespace micm
 {
   template<class T, class OrderingPolicy>

diff --git a/src/process/process_set.cu b/src/process/process_set.cu
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #include <micm/util/cuda_param.hpp>
+#include <micm/util/cuda_matrix.cuh>
 
 namespace micm
 {
@@ -137,23 +138,23 @@ namespace micm
       ProcessSetParam devstruct;
 
       /// Allocate memory space on the device
-      cudaMalloc(&(devstruct.number_of_reactants_), number_of_reactants_bytes);
-      cudaMalloc(&(devstruct.reactant_ids_), reactant_ids_bytes);
-      cudaMalloc(&(devstruct.number_of_products_), number_of_products_bytes);
-      cudaMalloc(&(devstruct.product_ids_), product_ids_bytes);
-      cudaMalloc(&(devstruct.yields_), yields_bytes);
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.number_of_reactants_), number_of_reactants_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.reactant_ids_), reactant_ids_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.number_of_products_), number_of_products_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.product_ids_), product_ids_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.yields_), yields_bytes), "cudaMalloc");
 
       /// Copy the data from host to device
-      cudaMemcpy(
+      CHECK_CUDA_ERROR(cudaMemcpy(
           devstruct.number_of_reactants_,
           hoststruct.number_of_reactants_,
           number_of_reactants_bytes,
-          cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.reactant_ids_, hoststruct.reactant_ids_, reactant_ids_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(
-          devstruct.number_of_products_, hoststruct.number_of_products_, number_of_products_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.product_ids_, hoststruct.product_ids_, product_ids_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.yields_, hoststruct.yields_, yields_bytes, cudaMemcpyHostToDevice);
+          cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.reactant_ids_, hoststruct.reactant_ids_, reactant_ids_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(
+          devstruct.number_of_products_, hoststruct.number_of_products_, number_of_products_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.product_ids_, hoststruct.product_ids_, product_ids_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.yields_, hoststruct.yields_, yields_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
 
       devstruct.number_of_reactants_size_ = hoststruct.number_of_reactants_size_;
       devstruct.reactant_ids_size_ = hoststruct.reactant_ids_size_;
@@ -173,11 +174,11 @@ namespace micm
       size_t jacobian_flat_ids_bytes = sizeof(size_t) * hoststruct.jacobian_flat_ids_size_;
 
       /// Allocate memory space on the device
-      cudaMalloc(&(devstruct.jacobian_flat_ids_), jacobian_flat_ids_bytes);
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.jacobian_flat_ids_), jacobian_flat_ids_bytes), "cudaMalloc");
 
       /// Copy the data from host to device
-      cudaMemcpy(
-          devstruct.jacobian_flat_ids_, hoststruct.jacobian_flat_ids_, jacobian_flat_ids_bytes, cudaMemcpyHostToDevice);
+      CHECK_CUDA_ERROR(cudaMemcpy(
+          devstruct.jacobian_flat_ids_, hoststruct.jacobian_flat_ids_, jacobian_flat_ids_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
 
       devstruct.jacobian_flat_ids_size_ = hoststruct.jacobian_flat_ids_size_;
     }
@@ -186,14 +187,14 @@ namespace micm
     ///   members of class "CudaProcessSet" on the device
     void FreeConstData(ProcessSetParam& devstruct)
     {
-      cudaFree(devstruct.number_of_reactants_);
-      cudaFree(devstruct.reactant_ids_);
-      cudaFree(devstruct.number_of_products_);
-      cudaFree(devstruct.product_ids_);
-      cudaFree(devstruct.yields_);
+      CHECK_CUDA_ERROR(cudaFree(devstruct.number_of_reactants_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.reactant_ids_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.number_of_products_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.product_ids_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.yields_), "cudaFree");
       if (devstruct.jacobian_flat_ids_ != nullptr)
       {
-        cudaFree(devstruct.jacobian_flat_ids_);
+        CHECK_CUDA_ERROR(cudaFree(devstruct.jacobian_flat_ids_), "cudaFree");
       }
     }
 

diff --git a/src/solver/linear_solver.cu b/src/solver/linear_solver.cu
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #include <micm/util/cuda_param.hpp>
+#include <micm/util/cuda_matrix.cuh>
 
 #include <chrono>
 
@@ -92,16 +93,16 @@ namespace micm
 
       /// Create a struct whose members contain the addresses in the device memory.
       LinearSolverParam devstruct;
-      cudaMalloc(&(devstruct.nLij_Lii_), nLij_Lii_bytes);
-      cudaMalloc(&(devstruct.Lij_yj_), Lij_yj_bytes);
-      cudaMalloc(&(devstruct.nUij_Uii_), nUij_Uii_bytes);
-      cudaMalloc(&(devstruct.Uij_xj_), Uij_xj_bytes);
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.nLij_Lii_), nLij_Lii_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.Lij_yj_), Lij_yj_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.nUij_Uii_), nUij_Uii_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.Uij_xj_), Uij_xj_bytes), "cudaMalloc");
 
       /// Copy the data from host to device
-      cudaMemcpy(devstruct.nLij_Lii_, hoststruct.nLij_Lii_, nLij_Lii_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.Lij_yj_, hoststruct.Lij_yj_, Lij_yj_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.nUij_Uii_, hoststruct.nUij_Uii_, nUij_Uii_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.Uij_xj_, hoststruct.Uij_xj_, Uij_xj_bytes, cudaMemcpyHostToDevice);
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.nLij_Lii_, hoststruct.nLij_Lii_, nLij_Lii_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.Lij_yj_, hoststruct.Lij_yj_, Lij_yj_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.nUij_Uii_, hoststruct.nUij_Uii_, nUij_Uii_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.Uij_xj_, hoststruct.Uij_xj_, Uij_xj_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
 
       devstruct.nLij_Lii_size_ = hoststruct.nLij_Lii_size_;
       devstruct.Lij_yj_size_ = hoststruct.Lij_yj_size_;
@@ -115,10 +116,10 @@ namespace micm
     ///   members of class "CudaLinearSolver" on the device
     void FreeConstData(LinearSolverParam& devstruct)
     {
-      cudaFree(devstruct.nLij_Lii_);
-      cudaFree(devstruct.Lij_yj_);
-      cudaFree(devstruct.nUij_Uii_);
-      cudaFree(devstruct.Uij_xj_);
+      CHECK_CUDA_ERROR(cudaFree(devstruct.nLij_Lii_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.Lij_yj_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.nUij_Uii_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.Uij_xj_), "cudaFree");
     }
 
     void SolveKernelDriver(

diff --git a/src/solver/lu_decomposition.cu b/src/solver/lu_decomposition.cu
@@ -3,9 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #include <micm/util/cuda_param.hpp>
-
-#include <chrono>
-#include <iostream>
+#include <micm/util/cuda_matrix.cuh>
 
 namespace micm
 {
@@ -121,28 +119,28 @@ namespace micm
 
       /// Create a struct whose members contain the addresses in the device memory.
       LuDecomposeParam devstruct;
-      cudaMalloc(&(devstruct.niLU_), niLU_bytes);
-      cudaMalloc(&(devstruct.do_aik_), do_aik_bytes);
-      cudaMalloc(&(devstruct.aik_), aik_bytes);
-      cudaMalloc(&(devstruct.uik_nkj_), uik_nkj_bytes);
-      cudaMalloc(&(devstruct.lij_ujk_), lij_ujk_bytes);
-      cudaMalloc(&(devstruct.do_aki_), do_aki_bytes);
-      cudaMalloc(&(devstruct.aki_), aki_bytes);
-      cudaMalloc(&(devstruct.lki_nkj_), lki_nkj_bytes);
-      cudaMalloc(&(devstruct.lkj_uji_), lkj_uji_bytes);
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.niLU_), niLU_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.do_aik_), do_aik_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.aik_), aik_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.uik_nkj_), uik_nkj_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.lij_ujk_), lij_ujk_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.do_aki_), do_aki_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.aki_), aki_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.lki_nkj_), lki_nkj_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.lkj_uji_), lkj_uji_bytes), "cudaMalloc");
       cudaMalloc(&(devstruct.uii_), uii_bytes);
 
       /// Copy the data from host to device
-      cudaMemcpy(devstruct.niLU_, hoststruct.niLU_, niLU_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.do_aik_, hoststruct.do_aik_, do_aik_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.aik_, hoststruct.aik_, aik_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.uik_nkj_, hoststruct.uik_nkj_, uik_nkj_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.lij_ujk_, hoststruct.lij_ujk_, lij_ujk_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.do_aki_, hoststruct.do_aki_, do_aki_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.aki_, hoststruct.aki_, aki_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.lki_nkj_, hoststruct.lki_nkj_, lki_nkj_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.lkj_uji_, hoststruct.lkj_uji_, lkj_uji_bytes, cudaMemcpyHostToDevice);
-      cudaMemcpy(devstruct.uii_, hoststruct.uii_, uii_bytes, cudaMemcpyHostToDevice);
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.niLU_, hoststruct.niLU_, niLU_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.do_aik_, hoststruct.do_aik_, do_aik_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.aik_, hoststruct.aik_, aik_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.uik_nkj_, hoststruct.uik_nkj_, uik_nkj_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.lij_ujk_, hoststruct.lij_ujk_, lij_ujk_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.do_aki_, hoststruct.do_aki_, do_aki_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.aki_, hoststruct.aki_, aki_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.lki_nkj_, hoststruct.lki_nkj_, lki_nkj_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.lkj_uji_, hoststruct.lkj_uji_, lkj_uji_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.uii_, hoststruct.uii_, uii_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
       devstruct.niLU_size_ = hoststruct.niLU_size_;
 
       return devstruct;
@@ -152,16 +150,16 @@ namespace micm
     ///   members of class "CudaLuDecomposition" on the device
     void FreeConstData(LuDecomposeParam& devstruct)
     {
-      cudaFree(devstruct.niLU_);
-      cudaFree(devstruct.do_aik_);
-      cudaFree(devstruct.aik_);
-      cudaFree(devstruct.uik_nkj_);
-      cudaFree(devstruct.lij_ujk_);
-      cudaFree(devstruct.do_aki_);
-      cudaFree(devstruct.aki_);
-      cudaFree(devstruct.lki_nkj_);
-      cudaFree(devstruct.lkj_uji_);
-      cudaFree(devstruct.uii_);
+      CHECK_CUDA_ERROR(cudaFree(devstruct.niLU_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.do_aik_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.aik_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.uik_nkj_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.lij_ujk_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.do_aki_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.aki_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.lki_nkj_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.lkj_uji_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.uii_), "cudaFree");
     }
 
     void DecomposeKernelDriver(

diff --git a/src/solver/rosenbrock.cu b/src/solver/rosenbrock.cu
@@ -5,11 +5,9 @@
 #include <micm/solver/rosenbrock_solver_parameters.hpp>
 #include <micm/util/cuda_param.hpp>
 #include <micm/util/internal_error.hpp>
-
+#include <micm/util/cuda_matrix.cuh>
 #include <cublas_v2.h>
 
-#include <iostream>
-
 namespace micm
 {
   namespace cuda
@@ -48,19 +46,19 @@ namespace micm
 
       /// Create a struct whose members contain the addresses in the device memory.
       CudaRosenbrockSolverParam devstruct;
-      cudaMalloc(&(devstruct.errors_input_), errors_bytes);
-      cudaMalloc(&(devstruct.errors_output_), errors_bytes);
-      cudaMalloc(&(devstruct.jacobian_diagonal_elements_), jacobian_diagonal_elements_bytes);
-      cudaMalloc(&(devstruct.absolute_tolerance_), tolerance_bytes);
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.errors_input_), errors_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.errors_output_), errors_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.jacobian_diagonal_elements_), jacobian_diagonal_elements_bytes), "cudaMalloc");
+      CHECK_CUDA_ERROR(cudaMalloc(&(devstruct.absolute_tolerance_), tolerance_bytes), "cudaMalloc");
 
       /// Copy the data from host to device
-      cudaMemcpy(
+      CHECK_CUDA_ERROR(cudaMemcpy(
           devstruct.jacobian_diagonal_elements_,
           hoststruct.jacobian_diagonal_elements_,
           jacobian_diagonal_elements_bytes,
-          cudaMemcpyHostToDevice);
+          cudaMemcpyHostToDevice), "cudaMemcpy");
 
-      cudaMemcpy(devstruct.absolute_tolerance_, hoststruct.absolute_tolerance_, tolerance_bytes, cudaMemcpyHostToDevice);
+      CHECK_CUDA_ERROR(cudaMemcpy(devstruct.absolute_tolerance_, hoststruct.absolute_tolerance_, tolerance_bytes, cudaMemcpyHostToDevice), "cudaMemcpy");
 
       devstruct.errors_size_ = hoststruct.errors_size_;
       devstruct.jacobian_diagonal_elements_size_ = hoststruct.jacobian_diagonal_elements_size_;
@@ -73,10 +71,10 @@ namespace micm
     ///   members and temporary variables of class "CudaLuDecomposition" on the device
     void FreeConstData(CudaRosenbrockSolverParam& devstruct)
     {
-      cudaFree(devstruct.errors_input_);
-      cudaFree(devstruct.errors_output_);
-      cudaFree(devstruct.jacobian_diagonal_elements_);
-      cudaFree(devstruct.absolute_tolerance_);
+      CHECK_CUDA_ERROR(cudaFree(devstruct.errors_input_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.errors_output_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.jacobian_diagonal_elements_), "cudaFree");
+      CHECK_CUDA_ERROR(cudaFree(devstruct.absolute_tolerance_), "cudaFree");
     }
 
     // Specific CUDA device function to do reduction within a warp
@@ -308,7 +306,7 @@ namespace micm
         }
         cudaDeviceSynchronize();
 
-        cudaMemcpy(&normalized_error, &devstruct.errors_output_[0], sizeof(double), cudaMemcpyDeviceToHost);
+        CHECK_CUDA_ERROR(cudaMemcpy(&normalized_error, &devstruct.errors_output_[0], sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy");
         normalized_error = std::sqrt(normalized_error / number_of_elements);
       }  // end of if-else for CUDA/CUBLAS implementation
       return std::max(normalized_error, 1.0e-10);

diff --git a/src/util/cuda_matrix.cu b/src/util/cuda_matrix.cu
@@ -4,10 +4,7 @@
  */
 #include <micm/util/cuda_matrix.cuh>
 #include <micm/util/internal_error.hpp>
-
 #include <cuda_runtime.h>
-
-#include <iostream>
 #include <vector>
 
 namespace micm