From 76d7a86d56cdf973f4ac8592898e3954e0b947f6 Mon Sep 17 00:00:00 2001
From: Qina Tan
Date: Sat, 2 Sep 2023 09:47:33 -0600
Subject: [PATCH 1/3] added struct for sparseMatrix

---
 include/micm/process/cuda_process_set.cuh |  1 +
 include/micm/process/cuda_process_set.hpp | 29 ++++++-----
 include/micm/util/cuda_param.hpp          | 22 +++++----
 src/process/process_set.cu                | 59 ++++++++++++-----------
 4 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/include/micm/process/cuda_process_set.cuh b/include/micm/process/cuda_process_set.cuh
index 59d0b5225..088d25f34 100644
--- a/include/micm/process/cuda_process_set.cuh
+++ b/include/micm/process/cuda_process_set.cuh
@@ -11,6 +11,7 @@ namespace micm

     std::chrono::nanoseconds AddJacobianTermsKernelDriver(
         CUDAMatrixParam& matrixParam,
+        CUDASparseMatrixParam& sparseMatrix,
         CUDAProcessSetParam& processSet);
   }  // namespace cuda
 }  // namespace micm
diff --git a/include/micm/process/cuda_process_set.hpp b/include/micm/process/cuda_process_set.hpp
index f82a11618..ae5984a81 100644
--- a/include/micm/process/cuda_process_set.hpp
+++ b/include/micm/process/cuda_process_set.hpp
@@ -53,12 +53,12 @@ namespace micm
       MatrixPolicy<double>& forcing) const
   {
     CUDAMatrixParam matrixParam;
-    matrixParam.rate_constants_ = rate_constants.AsVector().data();
-    matrixParam.state_variables_ = state_variables.AsVector().data();
-    matrixParam.forcing_ = forcing.AsVector().data();
-    matrixParam.n_grids_ = rate_constants.size();
-    matrixParam.n_reactions_ = rate_constants[0].size();
-    matrixParam.n_species_ = state_variables[0].size();
+    matrixParam.rate_constants = rate_constants.AsVector().data();
+    matrixParam.state_variables = state_variables.AsVector().data();
+    matrixParam.forcing = forcing.AsVector().data();
+    matrixParam.n_grids = rate_constants.size();
+    matrixParam.n_reactions = rate_constants[0].size();
+    matrixParam.n_species = state_variables[0].size();

     CUDAProcessSetParam processSet;
     processSet.number_of_reactants = number_of_reactants_.data();
@@ -83,13 +83,15 @@ namespace micm
       SparseMatrixPolicy<double>& jacobian) const
   {
     CUDAMatrixParam matrixParam;
-    matrixParam.rate_constants_ = rate_constants.AsVector().data();
-    matrixParam.state_variables_ = state_variables.AsVector().data();
-    matrixParam.jacobian_= jacobian.AsVector().data();
-    matrixParam.n_grids_ = rate_constants.size();
-    matrixParam.n_reactions_ = rate_constants[0].size();
-    matrixParam.n_species_ = state_variables[0].size();
-    matrixParam.jacobian_size_ = jacobian.AsVector().size();
+    matrixParam.rate_constants = rate_constants.AsVector().data();
+    matrixParam.state_variables = state_variables.AsVector().data();
+    matrixParam.n_grids = rate_constants.size();
+    matrixParam.n_reactions = rate_constants[0].size();
+    matrixParam.n_species = state_variables[0].size();
+
+    CUDASparseMatrixParam sparseMatrix;
+    sparseMatrix.jacobian = jacobian.AsVector().data();
+    sparseMatrix.jacobian_size = jacobian.AsVector().size();

     CUDAProcessSetParam processSet;
     processSet.number_of_reactants = number_of_reactants_.data();
@@ -103,6 +105,7 @@ namespace micm

     std::chrono::nanoseconds kernel_duration = micm::cuda::AddJacobianTermsKernelDriver(
         matrixParam,
+        sparseMatrix,
         processSet);
     return kernel_duration;  // time performance of kernel function
   }
diff --git a/include/micm/util/cuda_param.hpp b/include/micm/util/cuda_param.hpp
index e14c36428..1a50edf67 100644
--- a/include/micm/util/cuda_param.hpp
+++ b/include/micm/util/cuda_param.hpp
@@ -1,6 +1,5 @@
 #ifndef CUDA_PARAM_HPP
 #define CUDA_PARAM_HPP
-
 //member data of class CUDAProcessSet grouped in struct passing to kernel driver function
 struct CUDAProcessSetParam{
   const size_t* number_of_reactants;
@@ -16,13 +15,18 @@
 };
 //different matrix data grouped in struct passing to kernel driver function
 struct CUDAMatrixParam{
-  const double* rate_constants_;
-  const double* state_variables_;
-  double* forcing_;
-  double* jacobian_;
-  size_t n_grids_;
-  size_t n_reactions_;
-  size_t n_species_;
-  size_t jacobian_size_;
+  const double* rate_constants;
+  const double* state_variables;
+  double* forcing;
+  double* jacobian;
+  size_t n_grids;
+  size_t n_reactions;
+  size_t n_species;
+  size_t jacobian_size;
 };
+//sparseMatrix data grouped in struct passing to kernel driver function
+struct CUDASparseMatrixParam{
+  double* jacobian;
+  size_t jacobian_size;
+};
 #endif
\ No newline at end of file
diff --git a/src/process/process_set.cu b/src/process/process_set.cu
index e4c794a0a..e1a7cc49c 100644
--- a/src/process/process_set.cu
+++ b/src/process/process_set.cu
@@ -126,6 +126,7 @@ namespace micm

     std::chrono::nanoseconds AddJacobianTermsKernelDriver(
         CUDAMatrixParam& matrixParam,
+        CUDASparseMatrix& sparseMatrix,
         CUDAProcessSetParam& processSet)
     {
       // create device pointers
@@ -140,24 +141,24 @@ namespace micm
       jacobianDevice* device;

       // allocate device memory
-      cudaMalloc(&d_rate_constants, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_reactions_);
-      cudaMalloc(&d_state_variables, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_species_);
-      cudaMalloc(&d_jacobian, sizeof(double) * matrixParam.jacobian_size_);
-      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_rate_constants, sizeof(double) * matrixParam.n_grids * matrixParam.n_reactions);
+      cudaMalloc(&d_state_variables, sizeof(double) * matrixParam.n_grids * matrixParam.n_species);
+      cudaMalloc(&d_jacobian, sizeof(double) * sparseMatrix.jacobian_size);
+      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_reactant_ids, sizeof(size_t) * processSet.reactant_ids_size);
-      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_yields, sizeof(double) * processSet.yields_size);
       cudaMalloc(&d_jacobian_flat_ids, sizeof(size_t) * processSet.jacobian_flat_ids_size);
       cudaMalloc(&device, sizeof(jacobianDevice));

       // transfer data from host to device
-      cudaMemcpy(d_rate_constants, matrixParam.rate_constants_, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_state_variables, matrixParam.state_variables_, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_species_, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_jacobian, matrixParam.jacobian_, sizeof(double) * matrixParam.jacobian_size_, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_rate_constants, matrixParam.rate_constants, sizeof(double) * matrixParam.n_grids * matrixParam.n_reactions, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_state_variables, matrixParam.state_variables, sizeof(double) * matrixParam.n_grids * matrixParam.n_species, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_jacobian, sparseMatrix.jacobian, sizeof(double) * sparseMatrix.jacobian_size, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_reactant_ids, processSet.reactant_ids, sizeof(size_t) * processSet.reactant_ids_size, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_yields, processSet.yields, sizeof(double) * processSet.yields_size, cudaMemcpyHostToDevice);
       cudaMemcpy(d_jacobian_flat_ids, processSet.jacobian_flat_ids, sizeof(size_t) * processSet.jacobian_flat_ids_size, cudaMemcpyHostToDevice);
       cudaMemcpy(&(device->rate_constants), &d_rate_constants, sizeof(double*), cudaMemcpyHostToDevice);
@@ -171,10 +172,10 @@ namespace micm

       // setup kernel
-      size_t total_blocks = (matrixParam.n_grids_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      size_t total_blocks = (matrixParam.n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE;

-      size_t n_reactions = matrixParam.n_reactions_;
-      size_t n_grids = matrixParam.n_grids_;
+      size_t n_reactions = matrixParam.n_reactions;
+      size_t n_grids = matrixParam.n_grids;

       // launch kernel and measure time performance
       auto startTime = std::chrono::high_resolution_clock::now();
       AddJacobianTermsKernel<<<total_blocks, BLOCK_SIZE>>>(
@@ -185,7 +186,7 @@ namespace micm
       auto endTime = std::chrono::high_resolution_clock::now();
       auto kernel_duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);

-      cudaMemcpy(matrixParam.jacobian_, d_jacobian, sizeof(double) * matrixParam.jacobian_size_, cudaMemcpyDeviceToHost);
+      cudaMemcpy(sparseMatrix.jacobian, d_jacobian, sizeof(double) * sparseMatrix.jacobian_size, cudaMemcpyDeviceToHost);

       // clean up
       cudaFree(d_rate_constants);
       cudaFree(d_state_variables);
@@ -215,23 +216,23 @@ namespace micm
       forcingDevice* device;

       // allocate device memory
-      cudaMalloc(&d_rate_constants, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_reactions_));
-      cudaMalloc(&d_state_variables, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_));
-      cudaMalloc(&d_forcing, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_));
-      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_rate_constants, sizeof(double) * (matrixParam.n_grids * matrixParam.n_reactions));
+      cudaMalloc(&d_state_variables, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species));
+      cudaMalloc(&d_forcing, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species));
+      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_reactant_ids, sizeof(size_t) * processSet.reactant_ids_size);
-      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_product_ids, sizeof(size_t) * processSet.product_ids_size);
       cudaMalloc(&d_yields, sizeof(double) * processSet.yields_size);
       cudaMalloc(&device, sizeof(forcingDevice));

       // copy data from host memory to device memory
-      cudaMemcpy(d_rate_constants, matrixParam.rate_constants_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_reactions_), cudaMemcpyHostToDevice);
-      cudaMemcpy(d_state_variables, matrixParam.state_variables_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_), cudaMemcpyHostToDevice);
-      cudaMemcpy(d_forcing, matrixParam.forcing_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_), cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_rate_constants, matrixParam.rate_constants, sizeof(double) * (matrixParam.n_grids * matrixParam.n_reactions), cudaMemcpyHostToDevice);
+      cudaMemcpy(d_state_variables, matrixParam.state_variables, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species), cudaMemcpyHostToDevice);
+      cudaMemcpy(d_forcing, matrixParam.forcing, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species), cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_reactant_ids, processSet.reactant_ids, sizeof(size_t) * processSet.reactant_ids_size, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_product_ids, processSet.product_ids, sizeof(size_t) * processSet.product_ids_size, cudaMemcpyHostToDevice);
       cudaMemcpy(d_yields, processSet.yields, sizeof(double) * processSet.yields_size, cudaMemcpyHostToDevice);
       cudaMemcpy(&(device->rate_constants), &d_rate_constants, sizeof(double*),cudaMemcpyHostToDevice);
@@ -245,11 +246,11 @@ namespace micm

       // total thread count == number of grid cells
-      int num_block = (matrixParam.n_grids_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      int num_block = (matrixParam.n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE;

-      size_t n_grids = matrixParam.n_grids_;
-      size_t n_reactions = matrixParam.n_reactions_;
-      size_t n_species = matrixParam.n_species_;
+      size_t n_grids = matrixParam.n_grids;
+      size_t n_reactions = matrixParam.n_reactions;
+      size_t n_species = matrixParam.n_species;

       // launch kernel and measure time performance
       auto startTime = std::chrono::high_resolution_clock::now();
@@ -263,7 +264,7 @@ namespace micm
       auto kernel_duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);

       // copy data from device memory to host memory
-      cudaMemcpy(matrixParam.forcing_, d_forcing, sizeof(double) * (n_grids * n_species), cudaMemcpyDeviceToHost);
+      cudaMemcpy(matrixParam.forcing, d_forcing, sizeof(double) * (n_grids * n_species), cudaMemcpyDeviceToHost);

       // clean up
       cudaFree(d_rate_constants);

From 646a8afc198403dcea655ac489956efc1cd699c8 Mon Sep 17 00:00:00 2001
From: Qina Tan
Date: Sat, 2 Sep 2023 09:49:01 -0600
Subject: [PATCH 2/3] added struct for sparseMatrix

---
 include/micm/util/cuda_param.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/micm/util/cuda_param.hpp b/include/micm/util/cuda_param.hpp
index 1a50edf67..6ed4120d0 100644
--- a/include/micm/util/cuda_param.hpp
+++ b/include/micm/util/cuda_param.hpp
@@ -18,11 +18,9 @@
   const double* rate_constants;
   const double* state_variables;
   double* forcing;
-  double* jacobian;
   size_t n_grids;
   size_t n_reactions;
   size_t n_species;
-  size_t jacobian_size;
 };
 //sparseMatrix data grouped in struct passing to kernel driver function
 struct CUDASparseMatrixParam{

From 5d55d9553e00ecc36ed9a71c85cbb6788f2dd044 Mon Sep 17 00:00:00 2001
From: Qina Tan
Date: Sat, 2 Sep 2023 09:53:45 -0600
Subject: [PATCH 3/3] sparse matrix in struct

---
 src/process/process_set.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/process/process_set.cu b/src/process/process_set.cu
index e1a7cc49c..cfe8d2103 100644
--- a/src/process/process_set.cu
+++ b/src/process/process_set.cu
@@ -126,7 +126,7 @@ namespace micm

     std::chrono::nanoseconds AddJacobianTermsKernelDriver(
         CUDAMatrixParam& matrixParam,
-        CUDASparseMatrix& sparseMatrix,
+        CUDASparseMatrixParam& sparseMatrix,
         CUDAProcessSetParam& processSet)
     {
       // create device pointers