
Commit

merged param
qinatan committed Sep 2, 2023
2 parents d9187b1 + 5d55d95 commit b083012
Showing 4 changed files with 58 additions and 58 deletions.
1 change: 1 addition & 0 deletions include/micm/process/cuda_process_set.cuh
@@ -11,6 +11,7 @@ namespace micm

std::chrono::nanoseconds AddJacobianTermsKernelDriver(
CUDAMatrixParam& matrixParam,
CUDASparseMatrixParam& sparseMatrix,
CUDAProcessSetParam& processSet);
} // namespace cuda
} // namespace micm
29 changes: 16 additions & 13 deletions include/micm/process/cuda_process_set.hpp
@@ -53,12 +53,12 @@ namespace micm
MatrixPolicy<double>& forcing) const
{
CUDAMatrixParam matrixParam;
matrixParam.rate_constants_ = rate_constants.AsVector().data();
matrixParam.state_variables_ = state_variables.AsVector().data();
matrixParam.forcing_ = forcing.AsVector().data();
matrixParam.n_grids_ = rate_constants.size();
matrixParam.n_reactions_ = rate_constants[0].size();
matrixParam.n_species_ = state_variables[0].size();
matrixParam.rate_constants = rate_constants.AsVector().data();
matrixParam.state_variables = state_variables.AsVector().data();
matrixParam.forcing = forcing.AsVector().data();
matrixParam.n_grids = rate_constants.size();
matrixParam.n_reactions = rate_constants[0].size();
matrixParam.n_species = state_variables[0].size();

CUDAProcessSetParam processSet;
processSet.number_of_reactants = number_of_reactants_.data();
@@ -83,13 +83,15 @@
SparseMatrixPolicy<double>& jacobian) const
{
CUDAMatrixParam matrixParam;
matrixParam.rate_constants_ = rate_constants.AsVector().data();
matrixParam.state_variables_ = state_variables.AsVector().data();
matrixParam.jacobian_= jacobian.AsVector().data();
matrixParam.n_grids_ = rate_constants.size();
matrixParam.n_reactions_ = rate_constants[0].size();
matrixParam.n_species_ = state_variables[0].size();
matrixParam.jacobian_size_ = jacobian.AsVector().size();
matrixParam.rate_constants = rate_constants.AsVector().data();
matrixParam.state_variables = state_variables.AsVector().data();
matrixParam.n_grids = rate_constants.size();
matrixParam.n_reactions = rate_constants[0].size();
matrixParam.n_species = state_variables[0].size();

CUDASparseMatrixParam sparseMatrix;
sparseMatrix.jacobian = jacobian.AsVector().data();
sparseMatrix.jacobian_size = jacobian.AsVector().size();

CUDAProcessSetParam processSet;
processSet.number_of_reactants = number_of_reactants_.data();
@@ -103,6 +105,7 @@

std::chrono::nanoseconds kernel_duration = micm::cuda::AddJacobianTermsKernelDriver(
matrixParam,
sparseMatrix,
processSet);
return kernel_duration; // time performance of kernel function
}
27 changes: 11 additions & 16 deletions include/micm/util/cuda_param.hpp
@@ -1,7 +1,6 @@
#include<thrust/device_vector.h>
#ifndef CUDA_PARAM_HPP
#define CUDA_PARAM_HPP

//member data of class CUDAProcessSet grouped in struct passing to kernel driver function
struct CUDAProcessSetParam{
const size_t* number_of_reactants;
@@ -35,20 +34,16 @@
}
//different matrix data grouped in struct passing to kernel driver function
struct CUDAMatrixParam{
const double* rate_constants_;
const double* state_variables_;
double* forcing_;
double* jacobian_;
const double* A;
double* L;
double* U;

size_t n_grids_;
size_t n_reactions_;
size_t n_species_;
size_t jacobian_size_;
size_t A_size;
size_t L_size;
size_t U_size;
const double* rate_constants;
const double* state_variables;
double* forcing;
size_t n_grids;
size_t n_reactions;
size_t n_species;
};
//sparseMatrix data grouped in struct passing to kernel driver function
struct CUDASparseMatrixParam{
double* jacobian;
size_t jacobian_size;
};
#endif
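
For orientation (not part of this commit's diff): after this change, a caller fills the new CUDASparseMatrixParam alongside CUDAMatrixParam and passes both to the kernel driver. A minimal sketch using only the member names defined above, and assuming rate_constants, state_variables, jacobian, and processSet are in scope as in the cuda_process_set.hpp hunks:

    CUDAMatrixParam matrixParam;
    matrixParam.rate_constants = rate_constants.AsVector().data();   // const double*, length n_grids * n_reactions
    matrixParam.state_variables = state_variables.AsVector().data(); // const double*, length n_grids * n_species
    matrixParam.n_grids = rate_constants.size();
    matrixParam.n_reactions = rate_constants[0].size();
    matrixParam.n_species = state_variables[0].size();

    CUDASparseMatrixParam sparseMatrix;
    sparseMatrix.jacobian = jacobian.AsVector().data();      // flat storage of the sparse Jacobian
    sparseMatrix.jacobian_size = jacobian.AsVector().size(); // number of stored doubles

    std::chrono::nanoseconds kernel_duration =
        micm::cuda::AddJacobianTermsKernelDriver(matrixParam, sparseMatrix, processSet);

The dense-matrix arguments stay grouped in CUDAMatrixParam, while the sparse Jacobian travels in its own struct, matching the extra CUDASparseMatrixParam& parameter added to AddJacobianTermsKernelDriver.
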
59 changes: 30 additions & 29 deletions src/process/process_set.cu
@@ -127,6 +127,7 @@ namespace micm

std::chrono::nanoseconds AddJacobianTermsKernelDriver(
CUDAMatrixParam& matrixParam,
CUDASparseMatrixParam& sparseMatrix,
CUDAProcessSetParam& processSet)
{
// create device pointers
@@ -141,24 +142,24 @@ namespace micm
jacobianDevice* device;

// allocate device memory
cudaMalloc(&d_rate_constants, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_reactions_);
cudaMalloc(&d_state_variables, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_species_);
cudaMalloc(&d_jacobian, sizeof(double) * matrixParam.jacobian_size_);
cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_);
cudaMalloc(&d_rate_constants, sizeof(double) * matrixParam.n_grids * matrixParam.n_reactions);
cudaMalloc(&d_state_variables, sizeof(double) * matrixParam.n_grids * matrixParam.n_species);
cudaMalloc(&d_jacobian, sizeof(double) * sparseMatrix.jacobian_size);
cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions);
cudaMalloc(&d_reactant_ids, sizeof(size_t) * processSet.reactant_ids_size);
cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions_);
cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions);
cudaMalloc(&d_yields, sizeof(double) * processSet.yields_size);
cudaMalloc(&d_jacobian_flat_ids, sizeof(size_t) * processSet.jacobian_flat_ids_size);
cudaMalloc(&device, sizeof(jacobianDevice));


// transfer data from host to device
cudaMemcpy(d_rate_constants, matrixParam.rate_constants_, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
cudaMemcpy(d_state_variables, matrixParam.state_variables_, sizeof(double) * matrixParam.n_grids_ * matrixParam.n_species_, cudaMemcpyHostToDevice);
cudaMemcpy(d_jacobian, matrixParam.jacobian_, sizeof(double) * matrixParam.jacobian_size_, cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
cudaMemcpy(d_rate_constants, matrixParam.rate_constants, sizeof(double) * matrixParam.n_grids * matrixParam.n_reactions, cudaMemcpyHostToDevice);
cudaMemcpy(d_state_variables, matrixParam.state_variables, sizeof(double) * matrixParam.n_grids * matrixParam.n_species, cudaMemcpyHostToDevice);
cudaMemcpy(d_jacobian, sparseMatrix.jacobian, sizeof(double) * sparseMatrix.jacobian_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
cudaMemcpy(d_reactant_ids, processSet.reactant_ids, sizeof(size_t) * processSet.reactant_ids_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
cudaMemcpy(d_yields, processSet.yields, sizeof(double) * processSet.yields_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_jacobian_flat_ids, processSet.jacobian_flat_ids, sizeof(size_t) * processSet.jacobian_flat_ids_size, cudaMemcpyHostToDevice);
cudaMemcpy(&(device->rate_constants), &d_rate_constants, sizeof(double*), cudaMemcpyHostToDevice);
@@ -172,10 +173,10 @@


// setup kernel
size_t total_blocks = (matrixParam.n_grids_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
size_t total_blocks = (matrixParam.n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE;

size_t n_reactions = matrixParam.n_reactions_;
size_t n_grids = matrixParam.n_grids_;
size_t n_reactions = matrixParam.n_reactions;
size_t n_grids = matrixParam.n_grids;
// launch kernel and measure time performance
auto startTime = std::chrono::high_resolution_clock::now();
AddJacobianTermsKernel<<<total_blocks, BLOCK_SIZE>>>(
@@ -186,7 +187,7 @@
auto endTime = std::chrono::high_resolution_clock::now();
auto kernel_duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);

cudaMemcpy(matrixParam.jacobian_, d_jacobian, sizeof(double) * matrixParam.jacobian_size_, cudaMemcpyDeviceToHost);
cudaMemcpy(sparseMatrix.jacobian, d_jacobian, sizeof(double) * sparseMatrix.jacobian_size, cudaMemcpyDeviceToHost);
// clean up
cudaFree(d_rate_constants);
cudaFree(d_state_variables);
@@ -216,23 +217,23 @@
forcingDevice* device;

// allocate device memory
cudaMalloc(&d_rate_constants, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_reactions_));
cudaMalloc(&d_state_variables, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_));
cudaMalloc(&d_forcing, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_));
cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_);
cudaMalloc(&d_rate_constants, sizeof(double) * (matrixParam.n_grids * matrixParam.n_reactions));
cudaMalloc(&d_state_variables, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species));
cudaMalloc(&d_forcing, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species));
cudaMalloc(&d_number_of_reactants, sizeof(size_t) * matrixParam.n_reactions);
cudaMalloc(&d_reactant_ids, sizeof(size_t) * processSet.reactant_ids_size);
cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions_);
cudaMalloc(&d_number_of_products, sizeof(size_t) * matrixParam.n_reactions);
cudaMalloc(&d_product_ids, sizeof(size_t) * processSet.product_ids_size);
cudaMalloc(&d_yields, sizeof(double) * processSet.yields_size);
cudaMalloc(&device, sizeof(forcingDevice));

// copy data from host memory to device memory
cudaMemcpy(d_rate_constants, matrixParam.rate_constants_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_reactions_), cudaMemcpyHostToDevice);
cudaMemcpy(d_state_variables, matrixParam.state_variables_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_), cudaMemcpyHostToDevice);
cudaMemcpy(d_forcing, matrixParam.forcing_, sizeof(double) * (matrixParam.n_grids_ * matrixParam.n_species_), cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
cudaMemcpy(d_rate_constants, matrixParam.rate_constants, sizeof(double) * (matrixParam.n_grids * matrixParam.n_reactions), cudaMemcpyHostToDevice);
cudaMemcpy(d_state_variables, matrixParam.state_variables, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species), cudaMemcpyHostToDevice);
cudaMemcpy(d_forcing, matrixParam.forcing, sizeof(double) * (matrixParam.n_grids * matrixParam.n_species), cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_reactants, processSet.number_of_reactants, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
cudaMemcpy(d_reactant_ids, processSet.reactant_ids, sizeof(size_t) * processSet.reactant_ids_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions_, cudaMemcpyHostToDevice);
cudaMemcpy(d_number_of_products, processSet.number_of_products, sizeof(size_t) * matrixParam.n_reactions, cudaMemcpyHostToDevice);
cudaMemcpy(d_product_ids, processSet.product_ids, sizeof(size_t) * processSet.product_ids_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_yields, processSet.yields, sizeof(double) * processSet.yields_size, cudaMemcpyHostToDevice);
cudaMemcpy(&(device->rate_constants), &d_rate_constants, sizeof(double*),cudaMemcpyHostToDevice);
@@ -246,11 +247,11 @@

// total thread count == number of grid cells

int num_block = (matrixParam.n_grids_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
int num_block = (matrixParam.n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE;

size_t n_grids = matrixParam.n_grids_;
size_t n_reactions = matrixParam.n_reactions_;
size_t n_species = matrixParam.n_species_;
size_t n_grids = matrixParam.n_grids;
size_t n_reactions = matrixParam.n_reactions;
size_t n_species = matrixParam.n_species;

// launch kernel and measure time performance
auto startTime = std::chrono::high_resolution_clock::now();
@@ -264,7 +265,7 @@
auto kernel_duration = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);

// copy data from device memory to host memory
cudaMemcpy(matrixParam.forcing_, d_forcing, sizeof(double) * (n_grids * n_species), cudaMemcpyDeviceToHost);
cudaMemcpy(matrixParam.forcing, d_forcing, sizeof(double) * (n_grids * n_species), cudaMemcpyDeviceToHost);

// clean up
cudaFree(d_rate_constants);
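
A note on the launch configuration shown above (not part of the diff): total_blocks and num_block use the ceiling-division idiom (n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE, which rounds up so that every grid cell gets a thread even when n_grids is not a multiple of the block size. A self-contained sketch, with BLOCK_SIZE = 32 assumed purely for illustration (the real constant is defined elsewhere in the library):

    #include <cassert>
    #include <cstddef>

    constexpr std::size_t BLOCK_SIZE = 32;  // assumed value for this sketch

    // Same rounding-up computation as total_blocks / num_block in process_set.cu.
    constexpr std::size_t blocks_for(std::size_t n_grids)
    {
      return (n_grids + BLOCK_SIZE - 1) / BLOCK_SIZE;
    }

    int main()
    {
      assert(blocks_for(1) == 1);   // a single grid cell still needs one full block
      assert(blocks_for(32) == 1);  // an exact multiple needs no extra block
      assert(blocks_for(33) == 2);  // one leftover cell spills into a second block
      return 0;
    }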
