From 2b4762f02af2ed136134c7f0570646219753ab3e Mon Sep 17 00:00:00 2001
From: Matthew Martineau
Date: Wed, 25 Oct 2023 12:57:50 +0100
Subject: [PATCH] Fix final set of warnings ready for v2.4.0 (#274)

* Fixed missing detail in changelog and fixed warning.

* Fixed unreachable code warning.

* Tweaking changelog.

* Fixed some unused variable warnings.

* Fixed ifdef for USE_CUDAMALLOCASYNC
---
 CHANGELOG                                      | 14 +++++++++++++-
 examples/amgx_mpi_capi_cla.c                   |  4 ++--
 include/global_thread_handle.h                 |  2 +-
 src/aggregation/aggregation_amg_level.cu       |  6 +-----
 src/distributed/comms_mpi_hostbuffer_stream.cu |  8 ++++----
 src/global_thread_handle.cu                    |  6 +++---
 src/matrix.cu                                  |  1 -
 7 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index f3d3f16f..5da7a1ef 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -31,6 +31,18 @@ Changes:
 - Fixed issue with exact_coarse_solve grid sizing
 - Fixed issue with use_sum_stopping_criteria
 - Fixed SIGFPE that could occur when the initial norm is 0
+- Added a new API call AMGX_matrix_check_symmetry, that tests if a matrix is structurally or completely symmetric
+
+Tested configurations:
+
+Linux x86-64:
+-- Ubuntu 20.04, Ubuntu 22.04
+-- NVHPC 23.7, GCC 9.4.0, GCC 12.1
+-- OpenMPI 4.0.x
+-- CUDA 11.2, 11.8, 12.2
+-- A100, H100
+
+Note that while AMGX has support for building in Windows, testing on Windows is very limited.
 
 ===============================================================
 
@@ -103,4 +115,4 @@ v2.0.0 - 2017.10.17
 
 ---------------------------------------------------------------
 
-Initial open source release
\ No newline at end of file
+Initial open source release
diff --git a/examples/amgx_mpi_capi_cla.c b/examples/amgx_mpi_capi_cla.c
index edb81130..f76de616 100644
--- a/examples/amgx_mpi_capi_cla.c
+++ b/examples/amgx_mpi_capi_cla.c
@@ -166,8 +166,8 @@ int main(int argc, char **argv)
     int major, minor;
     char *ver, *date, *time;
     //input matrix and rhs/solution
-    int n, nnz, block_dimx, block_dimy, block_size, num_neighbors;
-    int *row_ptrs = NULL, *neighbors = NULL;
+    int n, nnz, block_dimx, block_dimy, block_size;
+    int *row_ptrs = NULL;
     void *col_indices = NULL;
     void *values = NULL, *diag = NULL, *dh_x = NULL, *dh_b = NULL;
     int *h_row_ptrs = NULL;
diff --git a/include/global_thread_handle.h b/include/global_thread_handle.h
index 3b71875c..fa053fe4 100644
--- a/include/global_thread_handle.h
+++ b/include/global_thread_handle.h
@@ -141,7 +141,7 @@ class MemoryPool
     //Mutex added to fix ICE threadsafe issue
    	std::mutex m_mutex2;
 
-#ifndef USE_CUDAMALLOCASYNC
+#ifdef USE_CUDAMALLOCASYNC
     cudaMemPool_t m_mem_pool;
 #endif
 
diff --git a/src/aggregation/aggregation_amg_level.cu b/src/aggregation/aggregation_amg_level.cu
index f93a2b40..e5984eb2 100644
--- a/src/aggregation/aggregation_amg_level.cu
+++ b/src/aggregation/aggregation_amg_level.cu
@@ -2386,11 +2386,7 @@ void Aggregation_AMG_Level_Base<T_Config>::consolidateCoarseGridMatrix()
     Matrix<TConfig> &A = this->getA();
     Matrix<TConfig> &Ac = this->getNextLevel( MemorySpace( ) )->getA();
 
-    int num_parts, num_fine_neighbors, my_id;
-
-    num_parts = A.manager->getComms()->get_num_partitions();
-    num_fine_neighbors = A.manager->neighbors.size();
-    my_id = A.manager->global_id();
+    int my_id = A.manager->global_id();
 
     IVector_h &destination_part = A.manager->getDestinationPartitions();
     int my_destination_part = A.manager->getMyDestinationPartition();
diff --git a/src/distributed/comms_mpi_hostbuffer_stream.cu b/src/distributed/comms_mpi_hostbuffer_stream.cu
index 02fd38c6..da877bc8 100644
--- a/src/distributed/comms_mpi_hostbuffer_stream.cu
+++ b/src/distributed/comms_mpi_hostbuffer_stream.cu
@@ -1427,25 +1427,25 @@ void CommsMPIHostBufferStream<T_Config>::recv_vector_wait_all(HZVector &a) { rec
 template <class T_Config>
 int CommsMPIHostBufferStream<T_Config>::get_num_partitions()
 {
-    int total = 0;
 #ifdef AMGX_WITH_MPI
+    int total = 0;
     MPI_Comm_size( mpi_comm, &total );
+    return total;
 #else
     FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED);
 #endif
-    return total;
 }
 
 template <class T_Config>
 int CommsMPIHostBufferStream<T_Config>::get_global_id()
 {
-    int rank = 0;
 #ifdef AMGX_WITH_MPI
+    int rank = 0;
     MPI_Comm_rank( mpi_comm, &rank);
+    return rank;
 #else
     FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED);
 #endif
-    return rank;
 }
 
 
diff --git a/src/global_thread_handle.cu b/src/global_thread_handle.cu
index 326645f5..a64f9cb4 100644
--- a/src/global_thread_handle.cu
+++ b/src/global_thread_handle.cu
@@ -83,7 +83,7 @@ MemoryPool::MemoryPool(size_t max_block_size, size_t page_size, size_t max_size)
 {
     //initializeCriticalSection(&m_mutex2);
 
-#ifndef USE_CUDAMALLOCASYNC
+#ifdef USE_CUDAMALLOCASYNC
     int device;
     cudaGetDevice(&device);
     cudaDeviceGetMemPool(&m_mem_pool, device);
@@ -846,7 +846,7 @@ cudaError_t cudaFreeHost(void *ptr)
 
 cudaError_t cudaMallocAsync(void **ptr, size_t size, cudaStream_t stream)
 {
-#ifndef USE_CUDAMALLOCASYNC
+#ifdef USE_CUDAMALLOCASYNC
 
     return ::cudaMallocAsync(ptr, size, stream);
 
@@ -961,7 +961,7 @@ cudaError_t cudaMallocAsync(void **ptr, size_t size, cudaStream_t stream)
 
 cudaError_t cudaFreeAsync(void *ptr, cudaStream_t stream)
 {
-#ifndef USE_CUDAMALLOCASYNC
+#ifdef USE_CUDAMALLOCASYNC
 
     return ::cudaFreeAsync(ptr, stream);
 
diff --git a/src/matrix.cu b/src/matrix.cu
index 9157151f..1a86e37f 100644
--- a/src/matrix.cu
+++ b/src/matrix.cu
@@ -415,7 +415,6 @@ Matrix< TemplateConfig >::print(char
     fprintf(fid, "%d %d %d\n", this->get_num_rows() * this->get_block_dimx(), this->get_num_cols() * this->get_block_dimy(), tnnz);
 
     auto trafI = [&](auto const &I, auto const &i) { return I * this->get_block_dimy() + i + 1; };
-    auto trafJ = [&](auto const &J, auto const &j) { return J * this->get_block_dimx() + j + 1; };
 
     for (i = printRowsStart; i < printRowsEnd; i++)
     {
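
For readers skimming the USE_CUDAMALLOCASYNC portion of the patch, the following standalone sketch (not part of the patch; the try_alloc/try_free helpers are hypothetical, only the CUDA runtime calls are real) illustrates the guard direction the patch establishes: the stream-ordered cudaMallocAsync/cudaFreeAsync path is compiled only when the macro is defined, with a plain cudaMalloc/cudaFree fallback otherwise.

// Hedged sketch, not AMGX code: try_alloc/try_free are illustrative helpers.
#include <cuda_runtime.h>
#include <cstdio>

static cudaError_t try_alloc(void **ptr, size_t bytes, cudaStream_t stream)
{
#ifdef USE_CUDAMALLOCASYNC
    // Stream-ordered path, compiled only when the feature macro is defined,
    // matching the corrected #ifdef in global_thread_handle.
    return ::cudaMallocAsync(ptr, bytes, stream);
#else
    (void)stream;                  // fallback ignores the stream
    return ::cudaMalloc(ptr, bytes);
#endif
}

static cudaError_t try_free(void *ptr, cudaStream_t stream)
{
#ifdef USE_CUDAMALLOCASYNC
    return ::cudaFreeAsync(ptr, stream);
#else
    (void)stream;
    return ::cudaFree(ptr);
#endif
}

int main()
{
    void *buf = nullptr;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Allocate and release 1 MiB through whichever path was compiled in.
    if (try_alloc(&buf, 1 << 20, stream) == cudaSuccess)
    {
        try_free(buf, stream);
    }

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    std::printf("done\n");
    return 0;
}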