From 76d7a86d56cdf973f4ac8592898e3954e0b947f6 Mon Sep 17 00:00:00 2001
From: Qina Tan
Date: Sat, 2 Sep 2023 09:47:33 -0600
Subject: [PATCH 1/3] added struct for sparseMatrix

---
 include/micm/process/cuda_process_set.cuh |  1 +
 include/micm/process/cuda_process_set.hpp | 29 ++++++-----
 include/micm/util/cuda_param.hpp          | 22 +++++----
 src/process/process_set.cu                | 59 ++++++++++++-----------
 4 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/include/micm/process/cuda_process_set.cuh b/include/micm/process/cuda_process_set.cuh
index 59d0b5225..088d25f34 100644
--- a/include/micm/process/cuda_process_set.cuh
+++ b/include/micm/process/cuda_process_set.cuh
@@ -11,6 +11,7 @@ namespace micm

     std::chrono::nanoseconds AddJacobianTermsKernelDriver(
         CUDAMatrixParam& matrixParam,
+        CUDASparseMatrixParam& sparseMatrix,
         CUDAProcessSetParam& processSet);
   }  // namespace cuda
 }  // namespace micm
diff --git a/include/micm/process/cuda_process_set.hpp b/include/micm/process/cuda_process_set.hpp
index f82a11618..ae5984a81 100644
--- a/include/micm/process/cuda_process_set.hpp
+++ b/include/micm/process/cuda_process_set.hpp
@@ -53,12 +53,12 @@ namespace micm
       MatrixPolicy<double>& forcing) const
   {
     CUDAMatrixParam matrixParam;
-    matrixParam.rate_constants_ = rate_constants.AsVector().data();
-    matrixParam.state_variables_ = state_variables.AsVector().data();
-    matrixParam.forcing_ = forcing.AsVector().data();
-    matrixParam.n_grids_ = rate_constants.size();
-    matrixParam.n_reactions_ = rate_constants[0].size();
-    matrixParam.n_species_ = state_variables[0].size();
+    matrixParam.rate_constants = rate_constants.AsVector().data();
+    matrixParam.state_variables = state_variables.AsVector().data();
+    matrixParam.forcing = forcing.AsVector().data();
+    matrixParam.n_grids = rate_constants.size();
+    matrixParam.n_reactions = rate_constants[0].size();
+    matrixParam.n_species = state_variables[0].size();

     CUDAProcessSetParam processSet;
     processSet.number_of_reactants = number_of_reactants_.data();
@@ -83,13 +83,15 @@ namespace micm
       SparseMatrixPolicy<double>& jacobian) const
   {
     CUDAMatrixParam matrixParam;
-    matrixParam.rate_constants_ = rate_constants.AsVector().data();
-    matrixParam.state_variables_ = state_variables.AsVector().data();
-    matrixParam.jacobian_= jacobian.AsVector().data();
-    matrixParam.n_grids_ = rate_constants.size();
-    matrixParam.n_reactions_ = rate_constants[0].size();
-    matrixParam.n_species_ = state_variables[0].size();
-    matrixParam.jacobian_size_ = jacobian.AsVector().size();
+    matrixParam.rate_constants = rate_constants.AsVector().data();
+    matrixParam.state_variables = state_variables.AsVector().data();
+    matrixParam.n_grids = rate_constants.size();
+    matrixParam.n_reactions = rate_constants[0].size();
+    matrixParam.n_species = state_variables[0].size();
+
+    CUDASparseMatrixParam sparseMatrix;
+    sparseMatrix.jacobian = jacobian.AsVector().data();
+    sparseMatrix.jacobian_size = jacobian.AsVector().size();

     CUDAProcessSetParam processSet;
     processSet.number_of_reactants = number_of_reactants_.data();
@@ -103,6 +105,7 @@ namespace micm

     std::chrono::nanoseconds kernel_duration = micm::cuda::AddJacobianTermsKernelDriver(
         matrixParam,
+        sparseMatrix,
         processSet);
     return kernel_duration;  // time performance of kernel function
   }
diff --git a/include/micm/util/cuda_param.hpp b/include/micm/util/cuda_param.hpp
index e14c36428..1a50edf67 100644
--- a/include/micm/util/cuda_param.hpp
+++ b/include/micm/util/cuda_param.hpp
@@ -1,6 +1,5 @@
 #ifndef CUDA_PARAM_HPP
 #define CUDA_PARAM_HPP
-
 //member data of class CUDAProcessSet grouped in struct passing to kernel driver function
 struct CUDAProcessSetParam{
   const size_t* number_of_reactants;
@@ -16,13 +15,18 @@
 };
 //different matrix data grouped in struct passing to kernel driver function
 struct CUDAMatrixParam{
-  const double* rate_constants_;
-  const double* state_variables_;
-  double* forcing_;
-  double* jacobian_;
-  size_t n_grids_;
-  size_t n_reactions_;
-  size_t n_species_;
-  size_t jacobian_size_;
+  const double* rate_constants;
+  const double* state_variables;
+  double* forcing;
+  double* jacobian;
+  size_t n_grids;
+  size_t n_reactions;
+  size_t n_species;
+  size_t jacobian_size;
 };
+//sparseMatrix data grouped in struct passing to kernel driver function
+struct CUDASparseMatrixParam{
+  double* jacobian;
+  size_t jacobian_size;
+};
 #endif
\ No newline at end of file
diff --git a/src/process/process_set.cu b/src/process/process_set.cu
index e4c794a0a..e1a7cc49c 100644
--- a/src/process/process_set.cu
+++ b/src/process/process_set.cu
@@ -126,6 +126,7 @@ namespace micm

     std::chrono::nanoseconds AddJacobianTermsKernelDriver(
         CUDAMatrixParam& matrixParam,
+        CUDASparseMatrix& sparseMatrix,
         CUDAProcessSetParam& processSet)
     {
       // create device pointers
@@ -140,24 +141,24 @@ namespace micm
       jacobianDevice* device;

       // allocate device memory
-      cudaMalloc(&d_rate_constants, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_reactions_);
-      cudaMalloc(&d_state_variables, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_species_);
-      cudaMalloc(&d_jacobian, sizeof(double) * matrixParam.jacobian_size_);
-      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_rate_constants, sizeof(double) * matrixParam.n_grids * matrixParam.n_reactions);
+      cudaMalloc(&d_state_variables, sizeof(double) * matrixParam.n_grids * matrixParam.n_species);
+      cudaMalloc(&d_jacobian, sizeof(double) * sparseMatrix.jacobian_size);
+      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_reactant_ids, sizeof(size_t) * processSet.reactant_ids_size);
-      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_yields, sizeof(double) * processSet.yields_size);
       cudaMalloc(&d_jacobian_flat_ids, sizeof(size_t) * processSet.jacobian_flat_ids_size);
       cudaMalloc(&device, sizeof(jacobianDevice));

       // transfer data from host to device
-      cudaMemcpy(d_rate_constants, matrixParam.rate_constants_, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_state_variables, matrixParam.state_variables_, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_species_, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_jacobian, matrixParam.jacobian_, sizeof(double) * matrixParam.jacobian_size_, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_rate_constants, matrixParam.rate_constants, sizeof(double) * matrixParam.n_grids * matrixParam.n_reactions, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_state_variables, matrixParam.state_variables, sizeof(double) * matrixParam.n_grids * matrixParam.n_species, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_jacobian, sparseMatrix.jacobian, sizeof(double) * sparseMatrix.jacobian_size, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_reactant_ids, processSet.reactant_ids, sizeof(size_t) * processSet.reactant_ids_size, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_yields, processSet.yields, sizeof(double) * processSet.yields_size, cudaMemcpyHostToDevice);
       cudaMemcpy(d_jacobian_flat_ids, processSet.jacobian_flat_ids, sizeof(size_t) * processSet.jacobian_flat_ids_size, cudaMemcpyHostToDevice);
       cudaMemcpy(&(device->rate_constants), &d_rate_constants, sizeof(double*), cudaMemcpyHostToDevice);
@@ -171,10 +172,10 @@ namespace micm

       // setup kernel
-      size_t total_blocks = (matrixParam.n_grids_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      size_t total_blocks = (matrixParam.n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE;

-      size_t n_reactions = matrixParam.n_reactions_;
-      size_t n_grids = matrixParam.n_grids_;
+      size_t n_reactions = matrixParam.n_reactions;
+      size_t n_grids = matrixParam.n_grids;

       // launch kernel and measure time performance
       auto startTime = std::chrono::high_resolution_clock::now();
       AddJacobianTermsKernel<<<total_blocks, BLOCK_SIZE>>>(
@@ -185,7 +186,7 @@ namespace micm
       auto endTime = std::chrono::high_resolution_clock::now();
       auto kernel_duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);

-      cudaMemcpy(matrixParam.jacobian_, d_jacobian, sizeof(double) * matrixParam.jacobian_size_, cudaMemcpyDeviceToHost);
+      cudaMemcpy(sparseMatrix.jacobian, d_jacobian, sizeof(double) * sparseMatrix.jacobian_size, cudaMemcpyDeviceToHost);

       // clean up
       cudaFree(d_rate_constants);
       cudaFree(d_state_variables);
@@ -215,23 +216,23 @@ namespace micm
       forcingDevice* device;

       // allocate device memory
-      cudaMalloc(&d_rate_constants, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_reactions_));
-      cudaMalloc(&d_state_variables, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_));
-      cudaMalloc(&d_forcing, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_));
-      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_rate_constants, sizeof(double) * (matrixParam.n_grids * matrixParam.n_reactions));
+      cudaMalloc(&d_state_variables, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species));
+      cudaMalloc(&d_forcing, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species));
+      cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_reactant_ids, sizeof(size_t) * processSet.reactant_ids_size);
-      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions_);
+      cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions);
       cudaMalloc(&d_product_ids, sizeof(size_t) * processSet.product_ids_size);
       cudaMalloc(&d_yields, sizeof(double) * processSet.yields_size);
       cudaMalloc(&device, sizeof(forcingDevice));

       // copy data from host memory to device memory
-      cudaMemcpy(d_rate_constants, matrixParam.rate_constants_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_reactions_), cudaMemcpyHostToDevice);
-      cudaMemcpy(d_state_variables, matrixParam.state_variables_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_), cudaMemcpyHostToDevice);
-      cudaMemcpy(d_forcing, matrixParam.forcing_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_), cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_rate_constants, matrixParam.rate_constants, sizeof(double) * (matrixParam.n_grids * matrixParam.n_reactions), cudaMemcpyHostToDevice);
+      cudaMemcpy(d_state_variables, matrixParam.state_variables, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species), cudaMemcpyHostToDevice);
+      cudaMemcpy(d_forcing, matrixParam.forcing, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species), cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_reactant_ids, processSet.reactant_ids, sizeof(size_t) * processSet.reactant_ids_size, cudaMemcpyHostToDevice);
-      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
+      cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
       cudaMemcpy(d_product_ids, processSet.product_ids, sizeof(size_t) * processSet.product_ids_size, cudaMemcpyHostToDevice);
       cudaMemcpy(d_yields, processSet.yields, sizeof(double) * processSet.yields_size, cudaMemcpyHostToDevice);
       cudaMemcpy(&(device->rate_constants), &d_rate_constants, sizeof(double*),cudaMemcpyHostToDevice);
@@ -245,11 +246,11 @@ namespace micm

       // total thread count == number of grid cells
-      int num_block = (matrixParam.n_grids_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      int num_block = (matrixParam.n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE;

-      size_t n_grids = matrixParam.n_grids_;
-      size_t n_reactions = matrixParam.n_reactions_;
-      size_t n_species = matrixParam.n_species_;
+      size_t n_grids = matrixParam.n_grids;
+      size_t n_reactions = matrixParam.n_reactions;
+      size_t n_species = matrixParam.n_species;

       // launch kernel and measure time performance
       auto startTime = std::chrono::high_resolution_clock::now();
@@ -263,7 +264,7 @@ namespace micm
       auto kernel_duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);

       // copy data from device memory to host memory
-      cudaMemcpy(matrixParam.forcing_, d_forcing, sizeof(double) * (n_grids * n_species), cudaMemcpyDeviceToHost);
+      cudaMemcpy(matrixParam.forcing, d_forcing, sizeof(double) * (n_grids * n_species), cudaMemcpyDeviceToHost);

       // clean up
       cudaFree(d_rate_constants);

From 646a8afc198403dcea655ac489956efc1cd699c8 Mon Sep 17 00:00:00 2001
From: Qina Tan
Date: Sat, 2 Sep 2023 09:49:01 -0600
Subject: [PATCH 2/3] added struct for sparseMatrix

---
 include/micm/util/cuda_param.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/micm/util/cuda_param.hpp b/include/micm/util/cuda_param.hpp
index 1a50edf67..6ed4120d0 100644
--- a/include/micm/util/cuda_param.hpp
+++ b/include/micm/util/cuda_param.hpp
@@ -18,11 +18,9 @@
   const double* rate_constants;
   const double* state_variables;
   double* forcing;
-  double* jacobian;
   size_t n_grids;
   size_t n_reactions;
   size_t n_species;
-  size_t jacobian_size;
 };
 //sparseMatrix data grouped in struct passing to kernel driver function
 struct CUDASparseMatrixParam{

From 5d55d9553e00ecc36ed9a71c85cbb6788f2dd044 Mon Sep 17 00:00:00 2001
From: Qina Tan
Date: Sat, 2 Sep 2023 09:53:45 -0600
Subject: [PATCH 3/3] sparse matrix in struct

---
 src/process/process_set.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/process/process_set.cu b/src/process/process_set.cu
index e1a7cc49c..cfe8d2103 100644
--- a/src/process/process_set.cu
+++ b/src/process/process_set.cu
@@ -126,7 +126,7 @@ namespace micm

     std::chrono::nanoseconds AddJacobianTermsKernelDriver(
         CUDAMatrixParam& matrixParam,
-        CUDASparseMatrix& sparseMatrix,
+        CUDASparseMatrixParam& sparseMatrix,
         CUDAProcessSetParam& processSet)
     {
       // create device pointers