diff --git a/.gitignore b/.gitignore index 35ede212..9514d350 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,8 @@ plugin_config.cu *.sublime-project *.sublime-workspace core/src/version.cu -ci/docker/ \ No newline at end of file +ci/docker/ +Release +RelWithTraces +Debug +install diff --git a/examples/amgx_spmv_example/amgx_spmv_distributed_internal.cu b/examples/amgx_spmv_example/amgx_spmv_distributed_internal.cu index d1bdaf29..583625e9 100644 --- a/examples/amgx_spmv_example/amgx_spmv_distributed_internal.cu +++ b/examples/amgx_spmv_example/amgx_spmv_distributed_internal.cu @@ -264,7 +264,6 @@ void registerParameters() AMG_Config::registerParameter("exception_handling", "a flag that forces internal exception processing instead of returning error codes(1:internal, 0:external)", 0, bool_flag_values); //Register System Parameters (memory pools) AMG_Config::registerParameter("device_mem_pool_size", "size of the device memory pool in bytes", 256 * 1024 * 1024); - AMG_Config::registerParameter("device_consolidation_pool_size", "size of the device memory pool for root partition in bytes", 256 * 1024 * 1024); AMG_Config::registerParameter("device_mem_pool_max_alloc_size", "maximum size of a single allocation in the device memory pool in bytes", 20 * 1024 * 1024); AMG_Config::registerParameter("device_alloc_scaling_factor", "over allocation for large buffers (in %% -- a value of X will lead to 100+X%% allocations)", 10); AMG_Config::registerParameter("device_alloc_scaling_threshold", "buffers smaller than that threshold will NOT be scaled", 16 * 1024); diff --git a/examples/amgx_spmv_example/amgx_spmv_internal.cu b/examples/amgx_spmv_example/amgx_spmv_internal.cu index 91986020..4cb4f1f7 100644 --- a/examples/amgx_spmv_example/amgx_spmv_internal.cu +++ b/examples/amgx_spmv_example/amgx_spmv_internal.cu @@ -118,7 +118,6 @@ void registerParameters() AMG_Config::registerParameter("exception_handling", "a flag that forces internal exception processing instead of returning error codes(1:internal, 0:external)", 0, bool_flag_values); //Register System Parameters (memory pools) AMG_Config::registerParameter("device_mem_pool_size", "size of the device memory pool in bytes", 256 * 1024 * 1024); - AMG_Config::registerParameter("device_consolidation_pool_size", "size of the device memory pool for root partition in bytes", 256 * 1024 * 1024); AMG_Config::registerParameter("device_mem_pool_max_alloc_size", "maximum size of a single allocation in the device memory pool in bytes", 20 * 1024 * 1024); AMG_Config::registerParameter("device_alloc_scaling_factor", "over allocation for large buffers (in %% -- a value of X will lead to 100+X%% allocations)", 10); AMG_Config::registerParameter("device_alloc_scaling_threshold", "buffers smaller than that threshold will NOT be scaled", 16 * 1024); diff --git a/include/aggregation/aggregation_amg_level.h b/include/aggregation/aggregation_amg_level.h index 3a9f3770..39125766 100644 --- a/include/aggregation/aggregation_amg_level.h +++ b/include/aggregation/aggregation_amg_level.h @@ -110,9 +110,6 @@ class Aggregation_AMG_Level_Base : public AMG_Level void restrictResidual(VVector &r, VVector &rr); void prolongateAndApplyCorrection( VVector &c, VVector &bc, VVector &x, VVector &tmp); void computeRestrictionOperator(); - void consolidateVector(VVector &x); - void unconsolidateVector(VVector &x); - protected: diff --git a/include/amg.h b/include/amg.h index 2daeb221..48c19882 100644 --- a/include/amg.h +++ b/include/amg.h @@ -161,9 +161,6 @@ class AMG void 
setD2Workspace(void *workspace) { d2_workspace = workspace; } void *getCsrWorkspace() { return csr_workspace; } void setCsrWorkspace(void *workspace) { csr_workspace = workspace; } - inline void setConsolidationLowerThreshold(IndexType consolidation_lower_threshold) { m_consolidation_lower_threshold = consolidation_lower_threshold;} - inline void setConsolidationUpperThreshold(IndexType consolidation_upper_threshold) { m_consolidation_upper_threshold = consolidation_upper_threshold;} - private: AMG_Level *fine_d; @@ -183,9 +180,6 @@ class AMG int max_levels; double coarsen_threshold; - IndexType m_amg_consolidation_flag; - IndexType m_consolidation_lower_threshold; - IndexType m_consolidation_upper_threshold; int m_sum_stopping_criteria; int m_structure_reuse_levels; int m_amg_host_levels_rows; diff --git a/include/amg_level.h b/include/amg_level.h index c4be56cc..7866732c 100644 --- a/include/amg_level.h +++ b/include/amg_level.h @@ -111,9 +111,6 @@ class AMG_Level virtual IndexType getNumCoarseVertices() = 0; virtual void prepareNextLevelMatrix(const Matrix &A, Matrix &Ac) = 0; - virtual void consolidateVector(VVector &r) = 0; - virtual void unconsolidateVector(VVector &r) = 0; - virtual void transfer_level(AMG_Level *ref_lvl) = 0; void transfer_from(AMG_Level *ref_lvl); // copy from other memoryspace @@ -179,8 +176,6 @@ class AMG_Level inline void setNextLevel( AMG_Level *level ) { next_d = level; } inline void resetNextLevel( device_memory ) { next_d = 0L; } inline void deleteNextLevel( device_memory ) { delete next_d; next_d = 0L; } - inline bool isConsolidationLevel() { return m_is_consolidation_level; } - inline void setIsConsolidationLevel(bool is_consolidation_level) { m_is_consolidation_level = is_consolidation_level; } inline bool isReuseLevel() { return m_is_reuse_level; } inline void setReuseLevel(bool is_reuse_level) { m_is_reuse_level = is_reuse_level; } @@ -225,15 +220,9 @@ class AMG_Level int level_id; IndexType m_next_level_size; bool init; //marks if the x vector needs to be initialized - bool m_is_consolidation_level; bool m_is_reuse_level; std::string m_amg_level_name; - - bool m_is_root_partition; - IndexType m_destination_part; - INDEX_TYPE m_num_parts_to_consolidate; - }; template< typename TConfig, AMGX_MemorySpace MemSpace, class CycleDispatcher > diff --git a/include/classical/classical_amg_level.h b/include/classical/classical_amg_level.h index 1adb1259..b02d0979 100644 --- a/include/classical/classical_amg_level.h +++ b/include/classical/classical_amg_level.h @@ -103,8 +103,6 @@ class Classical_AMG_Level_Base : public AMG_Level virtual void computeAOperator_1x1() = 0; virtual void computeAOperator_1x1_distributed() = 0; void prepareNextLevelMatrix(const Matrix &A, Matrix &Ac) {}; - void consolidateVector(VVector &x); - void unconsolidateVector(VVector &x); void prolongateAndApplyCorrectionRescale(VVector &ec, VVector &bf, VVector &xf, VVector &ef, VVector &Aef) { FatalError( "prolongateAndApplyCorrectionRescale is not available with classical AMG. 
Try setting scale_correction to 0", AMGX_ERR_BAD_PARAMETERS); }; diff --git a/include/distributed/comms_mpi.h b/include/distributed/comms_mpi.h index 17f1ce7e..d690e1eb 100644 --- a/include/distributed/comms_mpi.h +++ b/include/distributed/comms_mpi.h @@ -70,8 +70,6 @@ class CommsMPI : public DistributedComms CommsMPI() : DistributedComms() {}; - virtual DistributedComms *CloneSubComm(HIVector &coarse_part_to_fine_part, bool is_root_partition) = 0; - virtual ~CommsMPI() { } diff --git a/include/distributed/comms_mpi_gpudirect.h b/include/distributed/comms_mpi_gpudirect.h index 61127f85..b6968393 100644 --- a/include/distributed/comms_mpi_gpudirect.h +++ b/include/distributed/comms_mpi_gpudirect.h @@ -96,11 +96,6 @@ class CommsMPIDirect : public CommsMPIHostBufferStream return new CommsMPIDirect(*this); } - DistributedComms *CloneSubComm(HIVector &coarse_part_to_fine_part, bool is_root_partition) - { - return NULL; - } - public: void exchange_matrix_halo(Matrix_Array &halo_rows, DistributedManager_Array &halo_btl, const Matrix &m); diff --git a/include/distributed/comms_mpi_hostbuffer_stream.h b/include/distributed/comms_mpi_hostbuffer_stream.h index e9016028..3a6c1dfb 100644 --- a/include/distributed/comms_mpi_hostbuffer_stream.h +++ b/include/distributed/comms_mpi_hostbuffer_stream.h @@ -128,24 +128,6 @@ class CommsMPIHostBufferStream : public CommsMPI }; #endif - void createSubComm( HIVector &coarse_part_to_fine_part, bool is_root_partition ) - { -#ifdef AMGX_WITH_MPI - MPI_Group new_group, orig_group; - MPI_Comm new_comm; - MPI_Comm_group(mpi_comm, &orig_group); - MPI_Group_incl(orig_group, coarse_part_to_fine_part.size(), coarse_part_to_fine_part.raw(), &new_group); - MPI_Comm_create(mpi_comm, new_group, &new_comm); - - if (is_root_partition) - { - MPI_Comm_dup(new_comm, &mpi_comm); - MPI_Comm_set_errhandler(mpi_comm, glbMPIErrorHandler); - } - -#endif - } - #ifdef AMGX_WITH_MPI MPI_Comm get_mpi_comm() { @@ -157,35 +139,6 @@ class CommsMPIHostBufferStream : public CommsMPI } #endif - DistributedComms *CloneSubComm(HIVector &coarse_part_to_fine_part, bool is_root_partition) - { -#ifdef AMGX_WITH_MPI - int my_id = this->get_global_id(); - MPI_Group new_group, orig_group; - MPI_Comm new_comm; - MPI_Comm_group(mpi_comm, &orig_group); // get orig group - - if (is_root_partition) - { - MPI_Group_incl(orig_group, coarse_part_to_fine_part.size(), coarse_part_to_fine_part.raw(), &new_group); // reorder group - MPI_Comm_create(mpi_comm, new_group, &new_comm); // create comm for new group - MPI_Group_free(&new_group); - //MPI_Comm_set_errhandler(new_comm, glbMPIErrorHandler); // set handler for new group - return new CommsMPIHostBufferStream(this, &new_comm); //wrap new comm and return it - wrapper will handle resource release - } - else - { - MPI_Group_incl(orig_group, 0, coarse_part_to_fine_part.raw(), &new_group); // reorder group - NULL group - do not need to delete this - MPI_Comm_create(mpi_comm, new_group, &new_comm); // create comm for new group - empty comm, do not need to delete this - MPI_Group_free(&new_group); // but do it anyways just for the funzies - return NULL; - } - -#else - return NULL; -#endif - } - void printString(const std::string &str); CommsMPIHostBufferStream *Clone() const diff --git a/include/distributed/distributed_comms.h b/include/distributed/distributed_comms.h index cdcdcc3b..5070a995 100644 --- a/include/distributed/distributed_comms.h +++ b/include/distributed/distributed_comms.h @@ -144,9 +144,6 @@ class DistributedComms virtual void set_neighbors(int
num_neighbors) = 0; virtual DistributedComms *Clone() const = 0; - virtual DistributedComms *CloneSubComm(HIVector &coarse_part_to_fine_part, bool is_root_partition) = 0; - - virtual void createSubComm( HIVector &coarse_part_to_fine_part, bool is_root_partition ) = 0; #ifdef AMGX_WITH_MPI virtual MPI_Comm get_mpi_comm() = 0; diff --git a/include/distributed/distributed_manager.h b/include/distributed/distributed_manager.h index 720624e8..b69fac01 100644 --- a/include/distributed/distributed_manager.h +++ b/include/distributed/distributed_manager.h @@ -283,7 +283,7 @@ template class DistributedManagerBase DistributedManagerBase() : m_fine_level_comms(NULL), _num_interior_nodes(0), m_pinned_buffer(NULL), m_pinned_buffer_size(0), _num_boundary_nodes(0), _comms(NULL), has_B2L(false), neighbors(_neighbors), B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), - halo_ranges(_halo_ranges), halo_rows_ref_count(0), halo_btl_ref_count(0), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), halo_rows(NULL), halo_btl(NULL), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false) + halo_ranges(_halo_ranges), halo_rows_ref_count(0), halo_btl_ref_count(0), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), halo_rows(NULL), halo_btl(NULL), m_fixed_view_size(false) { cudaEventCreate(&comm_event); @@ -302,7 +302,7 @@ template class DistributedManagerBase neighbors(_neighbors), halo_offsets(halo_offsets_), B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), halo_ranges(_halo_ranges), halo_rows_ref_count(0), halo_btl_ref_count(0), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), halo_rows(NULL), halo_btl(NULL), - _comms(NULL), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false) + _comms(NULL), m_fixed_view_size(false) { cudaStreamCreateWithFlags(&m_int_stream, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&m_bdy_stream, cudaStreamNonBlocking); @@ -360,7 +360,7 @@ template class DistributedManagerBase std::vector > &B2L_rings_, DistributedComms **comms_) : m_fine_level_comms(NULL), A(&a), m_pinned_buffer_size(0), m_pinned_buffer(NULL), neighbors(neighbors_), halo_ranges(halo_ranges_), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), - B2L_maps(B2L_maps_), L2H_maps(_L2H_maps), B2L_rings(B2L_rings_), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false) + B2L_maps(B2L_maps_), L2H_maps(_L2H_maps), B2L_rings(B2L_rings_), m_fixed_view_size(false) { cudaStreamCreateWithFlags(&m_int_stream, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&m_bdy_stream, cudaStreamNonBlocking); @@ -379,7 +379,7 @@ template class DistributedManagerBase std::vector &L2H_maps_, DistributedComms **comms_) : m_fine_level_comms(NULL), A(&a), m_pinned_buffer_size(0), m_pinned_buffer(NULL), neighbors(neighbors_), halo_ranges(_halo_ranges), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), - B2L_maps(B2L_maps_), 
L2H_maps(L2H_maps_), B2L_rings(_B2L_rings), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false) + B2L_maps(B2L_maps_), L2H_maps(L2H_maps_), B2L_rings(_B2L_rings), m_fixed_view_size(false) { cudaStreamCreateWithFlags(&m_int_stream, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&m_bdy_stream, cudaStreamNonBlocking); @@ -405,8 +405,6 @@ template class DistributedManagerBase void cacheMapsOneRing(const VecInt_t **b2l_maps, const VecInt_t *b2l_sizes, const VecInt_t **l2h_maps, const VecInt_t *l2h_sizes); - void setAConsolidationFlags( Matrix &A); - void uploadMatrix(int n, int nnz, int block_dimx, int block_dimy, const int *row_ptrs, const int *col_indices, const void *data, const void *diag_data, Matrix &A); void updateMapsReorder(); @@ -426,7 +424,7 @@ template class DistributedManagerBase Vector &neighbors_, I64Vector_h &halo_ranges_h_, DistributedComms **comms_) : m_fine_level_comms(NULL), _comms(NULL), A(&a), m_pinned_buffer_size(0), m_pinned_buffer(NULL), neighbors(neighbors_), halo_ranges(_halo_ranges), halo_ranges_h(halo_ranges_h_), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), - B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false) + B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), m_fixed_view_size(false) { cudaStreamCreateWithFlags(&m_int_stream, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&m_bdy_stream, cudaStreamNonBlocking); @@ -444,7 +442,7 @@ template class DistributedManagerBase const VecInt_t *neighbor_bases, const VecInt_t *neighbor_sizes, int num_neighbors) : m_fine_level_comms(NULL), A(&a), m_pinned_buffer_size(0), m_pinned_buffer(NULL), neighbors(_neighbors), halo_ranges(_halo_ranges), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), - B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false) + B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), m_fixed_view_size(false) { cudaStreamCreateWithFlags(&m_int_stream, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&m_bdy_stream, cudaStreamNonBlocking); @@ -467,30 +465,6 @@ template class DistributedManagerBase delete prep; } - void initializeAfterConsolidation( - INDEX_TYPE my_id, - Matrix &A_, - Vector neighbors_, - INDEX_TYPE interior_nodes_, - INDEX_TYPE boundary_nodes_, - INDEX_TYPE total_num_rows, - Vector halo_offsets_, - std::vector &B2L_maps_, - INDEX_TYPE ring_, - bool is_root_partition_) - { - A = &A_; - this->set_global_id(my_id); - _num_interior_nodes = interior_nodes_; - _num_boundary_nodes = boundary_nodes_; - neighbors = neighbors_; - halo_offsets = halo_offsets_; - B2L_maps = B2L_maps_; - m_is_root_partition = is_root_partition_; - this->set_num_halo_rows(total_num_rows - halo_offsets[0]); - this->set_num_halo_rings(ring_); - } - virtual void reorder_matrix() = 0; virtual void reorder_matrix_owned() = 0; @@ -503,80 +477,21 @@ template class DistributedManagerBase virtual void renumber_P_R(Matrix &P, Matrix &R, 
Matrix &A) = 0; - virtual void createOneRingB2Lmaps() = 0; - virtual void createOneRingHaloRows() = 0; - void computeDestinationPartitions(INDEX_TYPE upper_threshold, float avg_size, const int num_parts, int &new_num_parts, bool &wantNeighbors); - - void computeDestinationPartitionsWithCons(int my_id, int num_parts, IVector_h &destination_part, DistributedComms *comms); - - Vector &getDestinationPartitions() - { - return m_destination_partitions; - } - Vector &getFineDestinationPartitions() - { - return m_destination_partitions; - } - void setDestinationPartitions(Vector &destination_partitions) - { - m_destination_partitions = destination_partitions; - } - void createNeighToDestPartMap(IVector_h &neigh_to_part, IVector_h &neighbors, IVector_h &destination_part, int num_neighbors); - void createConsolidatedNeighToPartMap(IVector_h &cons_neigh_to_part, IVector_h &neigh_to_part, int my_destination_part, IVector_h &destination_part, int &num_cons_neighbors); - void createNeighToConsNeigh(IVector_h &neigh_to_cons_neigh, IVector_h &cons_neigh_to_part, IVector_h &neigh_to_part, int my_destination_part, int &num_neighbors); - - void consolidateB2Lmaps(IVector_h_vector &dest_coarse_B2L_maps, IVector_h_vector &coarse_B2L_maps, IVector_h &fine_neigh_to_coarse_neigh, int num_coarse_neighbors, int num_fine_neighbors); - void consolidateB2Lmaps(IVector_d_vector &dest_coarse_B2L_maps, IVector_d_vector &coarse_B2L_maps, IVector_h &fine_neigh_to_coarse_neigh, int num_coarse_neighbors, int num_fine_neighbors); - template void consB2Lmaps(std::vector &dest_coarse_B2L_maps, std::vector &coarse_B2L_maps, IVector_h &fine_neigh_to_coarse_neigh, int num_coarse_neighbors, int num_fine_neighbors); - void computeConsolidatedOffsets(const int my_id, const int my_destination_part, const bool sis_root_partition, const int num_interior_rows, const int num_boundary_rows, IVector_h_vector &vertex_counts, const IVector_h &parts_to_consolidate, const int num_parts_to_consolidate, int &interior_offset, int &boundary_offset, int &total_interior_rows_in_merged, int &total_boundary_rows_in_merged, int &total_rows_in_merged, DistributedComms *comms); - - void createAggregatesRenumbering(IVector_d &renumbering, IVector_d_vector &B2L_maps, int size, int num_neighbors, int &num_interior_aggregates, int &num_boundary_aggregates, int num_rings); - void createAggregatesRenumbering(IVector_h &renumbering, IVector_h_vector &B2L_maps, int size, int num_neighbors, int &num_interior_aggregates, int &num_boundary_aggregates, int num_rings); - - template - void createAggRenumbering(IVector_hd &renumbering, std::vector &B2L_maps, int size, int num_neighbors, int &num_interior_aggregates, int &num_boundary_aggregates, int num_rings); - - - void consolidateB2LmapsOnRoot(int &num_consolidated_neighbors, IVector_d_vector &consolidated_B2L_maps, IVector_h &consolidated_coarse_ids, IVector_d_vector &dest_coarse_B2L_maps, IVector_h &coarse_neigh_to_fine_part, IVector_h &num_bdy_per_coarse_neigh, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int my_id, int my_destination_part, bool is_root_partition, int num_coarse_neighbors, DistributedComms *comms); - - void consolidateB2LmapsOnRoot(int &num_consolidated_neighbors, IVector_h_vector &consolidated_B2L_maps, IVector_h &consolidated_coarse_ids, IVector_h_vector &dest_coarse_B2L_maps, IVector_h &coarse_neigh_to_fine_part, IVector_h &num_bdy_per_coarse_neigh, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int my_id, int my_destination_part, bool 
is_root_partition, int num_coarse_neighbors, DistributedComms *comms); - - template - void consB2LmapsOnRoot(int &num_consolidated_neighbors, std::vector &consolidated_B2L_maps, IVector_h &consolidated_coarse_ids, std::vector &dest_coarse_B2L_maps, IVector_h &coarse_neigh_to_fine_part, IVector_h &num_bdy_per_coarse_neigh, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int my_id, int my_destination_part, bool is_root_partition, int num_coarse_neighbors, DistributedComms *comms); - - - void consolidateAndRenumberHalos(IVector_h &aggregates, const IVector_h &manager_halo_offsets, IVector_h &halo_offsets, const IVector_h &neighbors, int num_fine_neighbors, const IVector_h &consolidated_coarse_ids, int num_consolidated_neighbors, const IVector_h &destination_part, int my_destination_part, bool is_root_partition, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int num_parts, int my_id, int total_rows_in_merged, int &num_all_aggregates, DistributedComms *comms); - - void consolidateAndRenumberHalos(IVector_d &aggregates, const IVector_h &manager_halo_offsets, IVector_h &halo_offsets, const IVector_h &neighbors, int num_fine_neighbors, const IVector_h &consolidated_coarse_ids, int num_consolidated_neighbors, const IVector_h &destination_part, int my_destination_part, bool is_root_partition, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int num_parts, int my_id, int total_rows_in_merged, int &num_all_aggregates, DistributedComms *comms); - - template - void consAndRenumberHalos(IVector_hd &aggregates, const IVector_h &manager_halo_offsets, IVector_h &halo_offsets, const IVector_h &neighbors, int num_fine_neighbors, const IVector_h &consolidated_coarse_ids, int num_consolidated_neighbors, const IVector_h &destination_part, int my_destination_part, bool is_root_partition, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int num_parts, int my_id, int total_rows_in_merged, int &num_all_aggregates, DistributedComms *comms); - - void ipcExchangePtr(void *&ptr, bool is_root_partition, int num_parts_to_consolidate, IVector_h &parts_to_consolidate, int my_root_partition, int my_id, DistributedComms *comms); - - void ipcWaitForChildren(bool is_root_partition, int num_parts_to_consolidate, IVector_h &parts_to_consolidate, int my_destination_part, int my_id, DistributedComms *comms); - - void ipcWaitForRoot(bool is_root_partition, int num_parts_to_consolidate, IVector_h &parts_to_consolidate, int my_destination_part, int my_id, DistributedComms *comms); - - void remove_boundary(IVector_h &flagArray, IVector_h &B2L_maps, int size); - void remove_boundary(IVector_d &flagArray, IVector_d &B2L_maps, int size); void get_unassigned(IVector_h &flagArray, IVector_h &B2L_maps, IVector_h &partition_flags, int size, int flagArray_size /*, int rank*/); void get_unassigned(IVector_d &flagArray, IVector_d &B2L_maps, IVector_d &partition_flags, int size, int flagArray_size /*, int rank*/); void set_unassigned(IVector_h &partition_flags, IVector_h &partition_renum, IVector_h &B2L_map, IVector_h &renumbering, int size, int max_element, int renumbering_size /*, int rank*/); void set_unassigned(IVector_d &partition_flags, IVector_d &partition_renum, IVector_d &B2L_map, IVector_d &renumbering, int size, int max_element, int renumbering_size /*, int rank*/); - void exchangeSolveResultsConsolidation(int &num_iters, std::vector &res_history, AMGX_STATUS &status, bool store_res_history); - void flag_halo_ids(int size, IVector_h 
&scratch, IVector_h &halo_aggregates, VecInt_t min_index_coarse_halo, int max_index, int min_index) ; void flag_halo_ids(int size, IVector_d &scratch, IVector_d &halo_aggregates, VecInt_t min_index_coarse_halo, int max_index, int min_index) ; void read_halo_ids(int size, IVector_h &scratch, IVector_h &halo_aggregates, VecInt_t min_index_coarse_halo); @@ -644,60 +559,10 @@ template class DistributedManagerBase { return (int64_t) _base_index; } - - bool isRootPartition() const - { - return m_is_root_partition; - } - bool isGlued() const - { - return m_is_glued; - } - bool isFineLevelGlued() const - { - return m_is_fine_level_glued; - } - bool isFineLevelRootPartition() const - { - return m_is_fine_level_root_partition; - } - - bool isFineLevelConsolidated() const - { - return m_is_fine_level_consolidated; - } - void setIsFineLevelConsolidated(const bool flag) - { - m_is_fine_level_consolidated = flag; - } - void setIsFineLevelGlued(const bool flag) - { - m_is_fine_level_glued = flag; - } - void setIsFineLevelRootPartition(const bool flag) - { - m_is_fine_level_root_partition = flag; - } - void setIsRootPartition(bool flag) - { - m_is_root_partition = flag; - } - void setIsGlued(const bool flag) - { - m_is_glued = flag; - } void fineLevelUpdate() { - m_is_fine_level_root_partition = m_is_root_partition; - m_num_fine_level_parts_to_consolidate = m_num_parts_to_consolidate; - m_fine_level_parts_to_consolidate = m_parts_to_consolidate; - m_my_fine_level_destination_part = m_my_destination_part; m_fine_level_comms = _comms; m_fine_level_id = _global_id; - // other data structure used on the finest level - //fine_level_id - //get_unconsolidated_size - //getFineLevelComms } INDEX_TYPE getMyDestinationPartition() @@ -705,30 +570,11 @@ template class DistributedManagerBase return m_my_destination_part; } - INDEX_TYPE getNumPartsToConsolidate() - { - return m_num_parts_to_consolidate; - } - - void setNumPartsToConsolidate(INDEX_TYPE num_fine_parts) - { - m_num_parts_to_consolidate = num_fine_parts; - } void setMyDestinationPartition(INDEX_TYPE my_destination_part) { m_my_destination_part = my_destination_part; } - void setPartsToConsolidate(Vector &parts_to_consolidate) - { - m_parts_to_consolidate = parts_to_consolidate; - } - - Vector &getPartsToConsolidate(void) - { - return m_parts_to_consolidate; - } - void setB2Lrings(std::vector > &par_B2L_rings) { B2L_rings = par_B2L_rings; @@ -742,51 +588,6 @@ template class DistributedManagerBase { return B2L_maps; } - void setCoarseToFine(Vector &coarse_to_fine_part) - { - m_coarse_to_fine_part = coarse_to_fine_part; - } - - Vector &getCoarseToFine(void) - { - return m_coarse_to_fine_part; - } - void setFineToCoarse(Vector &fine_to_coarse_part) - { - m_fine_to_coarse_part = fine_to_coarse_part; - } - - Vector &getFineToCoarse(void) - { - return m_fine_to_coarse_part; - } - - void setConsolidationOffsets(int int_off, int int_size, int bndry_off, int bndry_size) - { - m_cons_interior_offset = int_off; - m_cons_interior_size = int_size; - m_cons_bndry_offset = bndry_off; - m_cons_bndry_size = bndry_size; - } - - void getConsolidationOffsets(int *int_off, int *int_size, int *bndry_off, int *bndry_size) - { - *int_off = m_cons_interior_offset; - *int_size = m_cons_interior_size; - *bndry_off = m_cons_bndry_offset; - *bndry_size = m_cons_bndry_size; - } - - void setConsolidationArrayOffsets(std::vector &array) - { - m_consolidationArrayOffsets = array; - } - - std::vector &getConsolidationArrayOffsets() - { - return m_consolidationArrayOffsets; - } - void 
set_fine_level_id(INDEX_TYPE id) { m_fine_level_id = id; @@ -896,9 +697,6 @@ template class DistributedManagerBase L2H_maps[i] = a.L2H_maps[i]; } - m_is_root_partition = a.isRootPartition(); - m_is_glued = a.isGlued(); - m_is_fine_level_glued = a.isFineLevelGlued(); destroyComms(); //since you have a copy you should not free the memory halo_rows = NULL; @@ -941,15 +739,6 @@ template class DistributedManagerBase B2L_maps.swap(a.B2L_maps); L2H_maps.swap(a.L2H_maps); B2L_rings.swap(a.B2L_rings); - bool tmp = m_is_root_partition; - m_is_root_partition = a.isRootPartition(); - a.setIsRootPartition(tmp); - tmp = m_is_glued; - m_is_glued = a.isGlued(); - a.setIsGlued(tmp); - tmp = m_is_fine_level_glued; - m_is_glued = a.isFineLevelGlued(); - a.setIsFineLevelGlued(tmp); } void print(char *f, char *s, int trank); @@ -1375,8 +1164,6 @@ template class DistributedManagerBase // Consolidation related DistributedComms *m_fine_level_comms; //LEVEL 0 - pointer to comms module - bool m_is_fine_level_consolidated; - bool m_use_cuda_ipc_consolidation; bool m_host_transform; int m_fine_level_id; int m_old_nnz_CONS; @@ -1676,27 +1463,7 @@ template class DistributedManagerBase INDEX_TYPE _num_halo_rows = 0; //LEVEL 0 - total number of rows in the halo section of the matrix INDEX_TYPE _num_halo_rings = 0; //LEVEL 0 - number of halo rings - bool m_is_root_partition; - bool m_is_glued; - bool m_is_fine_level_glued; INDEX_TYPE m_my_destination_part = 0; - INDEX_TYPE m_num_parts_to_consolidate = 0; - INDEX_TYPE m_cons_interior_offset = 0; - INDEX_TYPE m_cons_interior_size = 0; - INDEX_TYPE m_cons_bndry_offset = 0; - INDEX_TYPE m_cons_bndry_size = 0; - std::vector m_consolidationArrayOffsets; - Vector m_destination_partitions; - - Vector m_parts_to_consolidate; - Vector m_fine_to_coarse_part; - Vector m_coarse_to_fine_part; - - // fine level consolidation data structures (used both in classical aggregation) - bool m_is_fine_level_root_partition; - INDEX_TYPE m_num_fine_level_parts_to_consolidate; - Vector m_fine_level_parts_to_consolidate; - INDEX_TYPE m_my_fine_level_destination_part; //cached sizes for different views of the matrix (set in Matrix::set_initialized(1)) INDEX_TYPE _num_rows_interior = 0; @@ -1733,15 +1500,6 @@ template class DistributedManagerBase IVector renumbering; IVector inverse_renumbering; - // Containers to store info of the unglued matrix, we need that glue or unglue vectors - IVector renumbering_before_glue; // to glue vectors during the solve, used in glue path. - IVector inverse_renumbering_before_glue; // to unglue vectors before prolongation in fixed cycle, used in glue path. - // we need that to exchange the halo of unglued vectors (in coarse level consolidation) - Vector neighbors_before_glue; // just neighbors before glue - std::vector B2L_maps_before_glue; - std::vector > B2L_rings_before_glue; //list of boundary nodes to export to other partitions. 
- Vector halo_offsets_before_glue; - Vector halo_offsets; //offsets (and size) to halos received from different neighbors, size is halo_rings*neighbors.size()+1, first element is already the offset into the matrix Vector &neighbors; //LEVEL 0 - list of neighbors with their global index, in the order we store their halos @@ -1870,19 +1628,13 @@ class DistributedManager< TemplateConfig &P, Matrix &R, Matrix &A); - void createOneRingB2Lmaps(); void createOneRingHaloRows(); - void consolidateAndUploadAll(int n, int nnz, int block_dimx, int block_dimy, const int *row_ptrs, const int *col_indices, const void *data, const void *diag_data, Matrix &in_A) ; - - void replaceMatrixCoefficientsNoCons(int n, int nnz, const mat_value_type *data, const mat_value_type *diag_data); - void replaceMatrixCoefficientsWithCons(int n, int nnz, const mat_value_type *data, const mat_value_type *diag_data); + void replaceMatrixCoefficients(int n, int nnz, const mat_value_type *data, const mat_value_type *diag_data); void transformAndUploadVector(VVector_v &v, const void *data, int n, int block_dim); void transformVector(VVector_v &v); - void transformAndUploadVectorWithCons(VVector_v &v, const void *data, int n, int block_dim); void revertAndDownloadVector(VVector_v &v, const void *data, int n, int block_dimy); void revertVector(VVector_v &v); void revertVector(VVector_v &v_in, VVector_v &v_out); - void revertAndDownloadVectorWithCons(VVector_v &v, const void *data, int n, int block_dimy); void createRenumbering(IVector &renumbering); //constructors @@ -1990,8 +1742,6 @@ class DistributedManager< TemplateConfig IVector_h_vector; typedef std::vector IVector_d_vector; - void consolidateAndUploadAll(int n, int nnz, int block_dimx, int block_dimy, const int *row_ptrs, const int *col_indices, const void *data, const void *diag_data, Matrix &A) ; - void reorder_matrix(); void reorder_matrix_owned(); @@ -2003,18 +1753,14 @@ class DistributedManager< TemplateConfig &P, Matrix &R, Matrix &A); - void createOneRingB2Lmaps(); void createOneRingHaloRows(); - void replaceMatrixCoefficientsNoCons(int n, int nnz, const mat_value_type *data_pinned, const mat_value_type *diag_data_pinned); - void replaceMatrixCoefficientsWithCons(int n, int nnz, const mat_value_type *data_pinned, const mat_value_type *diag_data_pinned); + void replaceMatrixCoefficients(int n, int nnz, const mat_value_type *data_pinned, const mat_value_type *diag_data_pinned); void transformAndUploadVector(VVector_v &v, const void *data, int n, int block_dim); void transformVector(VVector_v &v); - void transformAndUploadVectorWithCons(VVector_v &v, const void *data, int n, int block_dim); void revertAndDownloadVector(VVector_v &v, const void *data, int n, int block_dimy); void revertVector(VVector_v &v); void revertVector(VVector_v &v_in, VVector_v &v_out); - void revertAndDownloadVectorWithCons(VVector_v &v, const void *data, int n, int block_dimy); void createRenumbering(IVector &renumbering); diff --git a/include/distributed/glue.h b/include/distributed/glue.h deleted file mode 100644 index 0d41bdaa..00000000 --- a/include/distributed/glue.h +++ /dev/null @@ -1,1228 +0,0 @@ -/* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define COARSE_CLA_CONSO 0 // used to enable / disable coarse level consolidation (used in cycles files) - -namespace amgx -{ - -/********************************************************** - * Glue (Consolidation) - *********************************************************/ -#ifdef AMGX_WITH_MPI - -//------------------------ -//---------Matrix------------- -//------------------------ -template -void compute_glue_info(Matrix &A) -{ - // Fill distributed manager fields for consolidation - // Example - // destination_part = [0 0 0 0 4 4 4 4 8 8 8 8] (input from manager->computeDestinationPartitions) - // num_parts_to_consolidate = 4 for partitions 0,4,8 - (0 otherwise) - // parts_to_consolidate (rank 0)[0 1 2 3] (rank 4)[4 5 6 7] (rank 8)[8 9 10 11] - //coarse_part_to_fine_part = [0 4 8] num_coarse_partitions = 3 - //fine_part_to_coarse_part = [0 0 0 0 1 1 1 1 2 2 2 2] - //ConsolidationArrayOffsets contains the offset of the nnz of each partition in row pointer fashion : 0, part1.NNZ, part1.NNZ+part2.NNZ ...
NNZ - typedef typename TConfig::template setMemSpace::Type TConfig_h; - typedef typename TConfig_h::template setVecPrec::Type ivec_value_type_h; - typedef typename ivec_value_type_h::VecPrec VecInt_t; - - bool is_root_partition = false; - int num_parts_to_consolidate = 0; - int num_parts = A.manager->getComms()->get_num_partitions(); - int my_id = A.manager->global_id(); - Vector parts_to_consolidate; - Vector dest_partitions = A.manager->getDestinationPartitions(); - - // compute is_root_partition and num_parts_to_consolidate - for (int i = 0; i < num_parts; i++) - { - if (dest_partitions[i] == my_id) - { - is_root_partition = true; - num_parts_to_consolidate++; - } - } - - parts_to_consolidate.resize(num_parts_to_consolidate); - // parts_to_consolidate - int count = 0; - - for (int i = 0; i < num_parts; i++) - { - if (dest_partitions[i] == my_id) - { - parts_to_consolidate[count] = i; - count++; - } - } - - A.manager->setIsRootPartition(is_root_partition); - A.manager->setNumPartsToConsolidate(num_parts_to_consolidate); - A.manager->setPartsToConsolidate(parts_to_consolidate); - // We don't really use the following in the latest version of the glue path but they are useful information - // coarse_to_fine_part, fine_to_coarse_part - Vector coarse_to_fine_part, fine_to_coarse_part(num_parts); - coarse_to_fine_part = dest_partitions; - thrust::sort(coarse_to_fine_part.begin(), coarse_to_fine_part.end()); - cudaCheckError(); - coarse_to_fine_part.erase(thrust::unique(coarse_to_fine_part.begin(), coarse_to_fine_part.end()), coarse_to_fine_part.end()); - cudaCheckError(); - thrust::lower_bound(coarse_to_fine_part.begin(), coarse_to_fine_part.end(), dest_partitions.begin(), dest_partitions.end(), fine_to_coarse_part.begin()); - cudaCheckError(); - A.manager->setCoarseToFine(coarse_to_fine_part); - A.manager->setFineToCoarse(fine_to_coarse_part); - Vector consolidationArrayOffsets; - consolidationArrayOffsets.resize(num_parts); -} - -template -MPI_Comm compute_glue_matrices_communicator(Matrix &A) -{ - // Create temporary communicators for each consolidated matrix - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (A.manager->getDestinationPartitions().size() != 0) - { - int color = A.manager->getDestinationPartitions()[rank]; - MPI_Comm new_comm; - // Split a communicator into multiple, non-overlapping communicators by color (each destination partition has its color) - // 1. Use MPI_Allgather to get the color and key from each process - // 2. Count the number of processes with the same color; create a - // communicator with that many processes. If this process has - // MPI_UNDEFINED as the color, it receives MPI_COMM_NULL instead. - // 3. Use key to order the ranks - MPI_Comm_split(MPI_COMM_WORLD, color, rank, &new_comm); - return new_comm; - } - else - { - FatalError("NO DESTINATION PARTITIONS", AMGX_ERR_CORE); - } -} - -//function object (functor) for thrust calls (it is a unary operator to add a constant) -template -class add_constant_op -{ - const T c; - public: - add_constant_op(T _c) : c(_c) {} - __host__ __device__ T operator()(const T &x) const - { - return x + c; - } -}; - -template -int create_part_offsets(MPI_Comm &mpicm, Matrix &nv_mtx) -{ - /* WARNING: Notice that part_offsets_h & part_offsets have type int64_t. - Therefore we need to use MPI_INT64_T (or MPI_LONG_LONG) in MPI_Allgather. - Also, we need the send & recv buffers to be of the same type, therefore - we will create a temporary variable n64 of the correct type below.
*/ - //create TConfig64, which is the same as TConfig, but with index type being int64_t - typedef typename TConfig::template setVecPrec::Type TConfig64; - typedef typename TConfig64::VecPrec t_VecPrec; //t_VecPrec = int64_t - int n, offset, mpist; - int nranks = 0; //nv_mtx.manager->get_num_partitions(); - - if (nv_mtx.manager != NULL) - { - //some initializations - nv_mtx.getOffsetAndSizeForView(OWNED, &offset, &n); - MPI_Comm_size(mpicm, &nranks); - nv_mtx.manager->part_offsets_h.resize(nranks + 1); - //gather the number of rows per partition on the host (on all ranks) - t_VecPrec n64 = n; - nv_mtx.manager->part_offsets_h[0] = 0; //first element is zero (the # of rows is gathered afterwards) - - if (typeid(t_VecPrec) == typeid(int64_t)) - { - mpist = MPI_Allgather(&n64, 1, MPI_INT64_T, nv_mtx.manager->part_offsets_h.raw() + 1, 1, MPI_INT64_T, mpicm); - } - else - { - FatalError("MPI_Allgather of the vector has failed - incorrect vector data type", AMGX_ERR_CORE); - } - - if (mpist != MPI_SUCCESS) - { - FatalError("MPI_Allgather of the vector has failed - detected incorrect MPI return code", AMGX_ERR_CORE); - } - - //perform a prefix sum - thrust::inclusive_scan(nv_mtx.manager->part_offsets_h.begin(), nv_mtx.manager->part_offsets_h.end(), nv_mtx.manager->part_offsets_h.begin()); - //create the corresponding array on device (this is important) - nv_mtx.manager->part_offsets.resize(nranks + 1); - thrust::copy(nv_mtx.manager->part_offsets_h.begin(), nv_mtx.manager->part_offsets_h.end(), nv_mtx.manager->part_offsets.begin()); - } - - return 0; -} - -template -int glue_matrices(Matrix &nv_mtx, MPI_Comm &nv_mtx_com, MPI_Comm &temp_com) -{ - typedef typename TConfig::IndPrec t_IndPrec; - typedef typename TConfig::MatPrec t_MatPrec; - int n, nnz, offset, l, k = 0, i; - int start, end, shift; - int mpist, root = 0; - //MPI call parameters - t_IndPrec *rc_ptr, *di_ptr; - t_IndPrec *hli_ptr, *hgi_ptr, *hgr_ptr, *i_ptr, *r_ptr; - t_MatPrec *hlv_ptr, *hgv_ptr, *v_ptr; - thrust::host_vector rc; - thrust::host_vector di; - //unpacked local matrix on the device and host - device_vector_alloc Bp; - device_vector_alloc Bi; - device_vector_alloc Bv; - thrust::host_vector hBp; - thrust::host_vector hBi; - thrust::host_vector hBv; - //Consolidated matrices on the host - thrust::host_vector hAp; - thrust::host_vector hAi; - thrust::host_vector hAv; - //Consolidated matrices on the device - device_vector_alloc Ap; - device_vector_alloc Ai; - device_vector_alloc Av; - //WARNING: this routine currently supports only matrices with block size = 1 (it can be generalized in the future, though) - //initialize the defaults - mpist = MPI_SUCCESS; - - if (nv_mtx.manager != NULL) - { - //int rank = nv_mtx.manager->global_id(); - nv_mtx.getOffsetAndSizeForView(OWNED, &offset, &n); - nv_mtx.getNnzForView(OWNED, &nnz); - - if (nv_mtx.manager->part_offsets_h.size() == 0 || nv_mtx.manager->part_offsets.size() == 0) // create part_offsets_h & part_offsets - { - create_part_offsets(nv_mtx_com, nv_mtx); // (if needed for aggregation path) - } - - Bp.resize(n + 1); - Bi.resize(nnz); - Bv.resize(nnz); - hBp.resize(n + 1); - hBi.resize(nnz); - hBv.resize(nnz); - //--- unpack the matrix --- - nv_mtx.manager->unpack_partition(thrust::raw_pointer_cast(Bp.data()), - thrust::raw_pointer_cast(Bi.data()), - thrust::raw_pointer_cast(Bv.data())); - cudaCheckError(); - //copy to host (should be able to optimize this out later on) - hBp = Bp; - hBi = Bi; - hBv = Bv; - cudaCheckError(); - - // --- Glue matrices --- - - // construct global row
pointers - // compute recvcounts and displacements for MPI_Gatherv - if (nv_mtx.manager->isRootPartition()) - { - l = nv_mtx.manager->getNumPartsToConsolidate(); // number of partitions - rc.resize(l); - di.resize(l); - - //compute recvcounts and displacements for MPI_Gatherv - for (i = 0; i < l; i++) - { - start = nv_mtx.manager->part_offsets_h[nv_mtx.manager->getPartsToConsolidate()[i]]; - end = nv_mtx.manager->part_offsets_h[nv_mtx.manager->getPartsToConsolidate()[i] + 1]; - rc[i] = end - start; - di[i] = k + 1; - k += rc[i]; - } - - hAp.resize(k + 1); // extra +1 is needed because row_offsets have one extra element at the end - } - - cudaCheckError(); - //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details) - rc_ptr = thrust::raw_pointer_cast(rc.data()); - di_ptr = thrust::raw_pointer_cast(di.data()); - hli_ptr = thrust::raw_pointer_cast(hBp.data()); - hgr_ptr = thrust::raw_pointer_cast(hAp.data()); - cudaCheckError(); - - //gather (on the host) - if (typeid(t_IndPrec) == typeid(int)) - { - mpist = MPI_Gatherv(hli_ptr + 1, n, MPI_INT, hgr_ptr, rc_ptr, di_ptr, MPI_INT, root, temp_com); - } - else - { - FatalError("MPI_Gatherv of the vector has failed - incorrect vector data type", AMGX_ERR_CORE); - } - - if (mpist != MPI_SUCCESS) - { - FatalError("MPI_Gatherv of the vector has failed - detected incorrect MPI return code", AMGX_ERR_CORE); - } - - // Adjust row pointers, construct global column indices and values (recvcounts and displacements were computed above) - if (nv_mtx.manager->isRootPartition()) - { - //adjust global row pointers and setup the recvcounts & displacements for subsequent MPI calls - for (i = 0; i < l; i++) - { - start = di[i] - 1; - end = di[i] + rc[i] - 1; - shift = hAp[start]; - thrust::transform(hAp.begin() + start + 1, hAp.begin() + end + 1, hAp.begin() + start + 1, add_constant_op(shift)); - cudaCheckError(); - di[i] = shift; - rc[i] = hAp[end] - hAp[start]; - } - - //some allocations/resizing - hAi.resize(hAp[k]); - hAv.resize(hAp[k]); - } - - //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details) - rc_ptr = thrust::raw_pointer_cast(rc.data()); - di_ptr = thrust::raw_pointer_cast(di.data()); - hli_ptr = thrust::raw_pointer_cast(hBi.data()); - hgi_ptr = thrust::raw_pointer_cast(hAi.data()); - hlv_ptr = thrust::raw_pointer_cast(hBv.data()); - hgv_ptr = thrust::raw_pointer_cast(hAv.data()); - cudaCheckError(); - - //gather (on the host) - //columns indices - if (typeid(t_IndPrec) == typeid(int)) - { - mpist = MPI_Gatherv(hli_ptr, nnz, MPI_INT, hgi_ptr, rc_ptr, di_ptr, MPI_INT, root, temp_com); - } - else - { - FatalError("MPI_Gatherv of the vector has failed - incorrect vector data type", AMGX_ERR_CORE); - } - - if (mpist != MPI_SUCCESS) - { - FatalError("MPI_Gatherv of the vector has failed - detected incorrect MPI return code", AMGX_ERR_CORE); - } - - //values - if (typeid(t_MatPrec) == typeid(float)) - { - mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_FLOAT, hgv_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, temp_com); - } - else if (typeid(t_MatPrec) == typeid(double)) - { - mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_DOUBLE, hgv_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, temp_com); - } - else - { - FatalError("MPI_Gatherv of the vector has failed - incorrect vector data type", AMGX_ERR_CORE); - } - - if (mpist != MPI_SUCCESS) - { - FatalError("MPI_Gatherv of the vector has failed - detected incorrect MPI return code", AMGX_ERR_CORE); - } - - // --- Upload matrices --- - if (nv_mtx.manager->isRootPartition()) - { - 
n = hAp.size() - 1; - nnz = hAi.size(); - Ap.resize(hAp.size()); - Ai.resize(hAi.size()); - Av.resize(hAv.size()); - thrust::copy(hAp.begin(), hAp.end(), Ap.begin()); - thrust::copy(hAi.begin(), hAi.end(), Ai.begin()); - thrust::copy(hAv.begin(), hAv.end(), Av.begin()); - cudaCheckError(); - } - else - { - n = 0; - nnz = 0; - Ap.resize(1); // warning row_pointer size is expected to be n+1. - Ap.push_back(0); - Ai.resize(0); - Av.resize(0); - cudaCheckError(); - } - - r_ptr = thrust::raw_pointer_cast(Ap.data()); - i_ptr = thrust::raw_pointer_cast(Ai.data()); - v_ptr = thrust::raw_pointer_cast(Av.data()); - cudaCheckError(); - upload_matrix_after_glue(n, nnz, r_ptr, i_ptr, v_ptr, nv_mtx); - } - else - { - /* ASSUMPTION: when manager has not been allocated you are running on a single rank */ - } - - return 0; -} - - -template -int upload_matrix_after_glue(int n, int nnz, int *r_ptr, int *i_ptr, void *v_ptr, Matrix &nv_mtx) -{ - // Using a path similar to AMGX_matrix_upload_all_global - typedef typename TConfig::IndPrec t_IndPrec; - typedef typename TConfig::MatPrec t_MatPrec; - typedef typename TConfig::template setMemSpace::Type TConfig_h; - typedef typename TConfig::template setVecPrec::Type ivec_value_type; - typedef typename TConfig_h::template setVecPrec::Type ivec_value_type_h; - typedef typename ivec_value_type_h::VecPrec VecInt_t; - typedef Vector IVector; - // some parameters - int block_dimx, block_dimy, num_ranks, n_global, start, end, val; - t_IndPrec *part_vec_ptr; - thrust::host_vector pv; - // set parameters - nv_mtx.setView(ALL); // not sure about this - n_global = nv_mtx.manager->part_offsets_h.back(); - block_dimx = nv_mtx.get_block_dimx(); - block_dimy = nv_mtx.get_block_dimy(); - //MPI_Comm* mpi_comm = nv_mtx.getResources()->getMpiComm(); - //MPI_Comm_size(*mpi_comm, &num_ranks); - num_ranks = nv_mtx.manager->getComms()->get_num_partitions(); - // WARNING We create an artificial partition vector that matches the new distribution - // example n = 8, num_ranks (i.e. num partitions) = 4 , DestinationPartitions[0,0,2,2], partvec = [0,0,0,0,2,2,2,2] - // This might be an issue for the finest level if input_partvect !=NULL - Vector dest_partitions = nv_mtx.manager->getDestinationPartitions(); - - for (int i = 0; i < num_ranks; i++) - { - val = dest_partitions[i]; - start = nv_mtx.manager->part_offsets_h[i]; - end = nv_mtx.manager->part_offsets_h[i + 1]; - - for (int j = 0; j < end - start; j++) - { - pv.push_back(val); - } - } - - part_vec_ptr = thrust::raw_pointer_cast(pv.data()); - cudaCheckError(); - // Save some glue info - bool is_root_partition = nv_mtx.manager->isRootPartition(); - int dest_part = nv_mtx.manager->getMyDestinationPartition(); - int num_parts_to_consolidate = nv_mtx.manager->getNumPartsToConsolidate(); - Vector parts_to_consolidate = nv_mtx.manager->getPartsToConsolidate(); - std::vector cao; - - for (int i = 0; i < nv_mtx.manager->part_offsets_h.size(); i++) - { - cao.push_back(nv_mtx.manager->part_offsets_h[i]); - } - - // WARNING - // renumbering contains the inverse permutation to unreorder an amgx vector - // inverse_renumbering contains the permutation to reorder an amgx vector - Vector ir = nv_mtx.manager->inverse_renumbering; - Vector r = nv_mtx.manager->renumbering; - // We need that to exchange the halo of unglued vectors (in coarse level consolidation) - Vector nei = nv_mtx.manager->neighbors; // just neighbors before glue - std::vector > b2lr = nv_mtx.manager->getB2Lrings(); //list of boundary nodes to export to other partitions.
- Vector ho = nv_mtx.manager->halo_offsets; - std::vector b2lm = nv_mtx.manager->getB2Lmaps(); - cudaCheckError(); - - // Create a fresh distributed manager - if (nv_mtx.manager != NULL ) - { - delete nv_mtx.manager; - } - - nv_mtx.manager = new DistributedManager(nv_mtx); - nv_mtx.set_initialized(0); - nv_mtx.delProps(DIAG); - // Load distributed matrix - MatrixDistribution mdist; - mdist.setPartitionVec(part_vec_ptr); - nv_mtx.manager->loadDistributedMatrix(n, nnz, block_dimx, block_dimy, r_ptr, i_ptr, (t_MatPrec *) v_ptr, num_ranks, n_global, NULL, mdist); - // Create B2L_maps for comm - nv_mtx.manager->renumberMatrixOneRing(); - // WARNING WE SHOULD GET THE NUMBER OF RINGS AND DO THE FOLLOWING ONLY IF THERE ARE 2 RINGS - // Exchange 1 ring halo rows (for d2 interp) - // if (num_import_rings == 2) - nv_mtx.manager->createOneRingHaloRows(); - nv_mtx.manager->getComms()->set_neighbors(nv_mtx.manager->num_neighbors()); - nv_mtx.setView(OWNED); - nv_mtx.set_initialized(1); - cudaCheckError(); - // restore some glue info to consolidate the vectors in the future - nv_mtx.manager->setDestinationPartitions(dest_partitions); - nv_mtx.manager->setIsRootPartition(is_root_partition); - nv_mtx.manager->setNumPartsToConsolidate(num_parts_to_consolidate); - nv_mtx.manager->setPartsToConsolidate(parts_to_consolidate); - nv_mtx.manager->setIsGlued(true); - nv_mtx.manager->setMyDestinationPartition(dest_part); - nv_mtx.manager->setConsolidationArrayOffsets(cao); // partition offsets before consolidation - // set fine level data structures, this is used to match the former distribution when we upload / download from the API - - if (nv_mtx.amg_level_index == 0) - { - // just small copies inside fineLevelUpdate. - nv_mtx.manager->fineLevelUpdate(); - } - - nv_mtx.manager->renumbering_before_glue = r; - nv_mtx.manager->inverse_renumbering_before_glue = ir; - nv_mtx.manager->neighbors_before_glue = nei; - nv_mtx.manager->halo_offsets_before_glue = ho; - nv_mtx.manager->B2L_rings_before_glue = b2lr; - nv_mtx.manager->B2L_maps_before_glue = b2lm; - cudaCheckError(); - return 0; -} - -template -int glue_vector(Matrix &nv_mtx, MPI_Comm &A_comm, Vector &nv_vec, MPI_Comm &temp_com) -{ - // glue vectors based on dest_partitions (which indicates which partitions should be merged together) - typedef typename TConfig::IndPrec t_IndPrec; - typedef typename TConfig::VecPrec t_VecPrec; - int n, l, mpist, start, end, k = 0, root = 0, rank = 0; - //MPI call parameters - t_IndPrec *rc_ptr, *di_ptr; - t_VecPrec *hv_ptr, *hg_ptr, *v_ptr; - thrust::host_vector rc; - thrust::host_vector di; - //unreordered local vector on the host - thrust::host_vector hv; - //constructed global vector on the host - thrust::host_vector hg; - //constructed global vector on the device - device_vector_alloc v; - //WARNING: this routine currently supports only vectors with block size = 1 (it can be generalized in the future, though) - //initialize the defaults - mpist = MPI_SUCCESS; - - if (nv_mtx.manager != NULL) - { - // some initializations - rank = nv_mtx.manager->global_id(); - - if (nv_mtx.manager->getComms() != NULL) - { - nv_mtx.manager->getComms()->get_mpi_comm(); - } - - n = nv_mtx.manager->getConsolidationArrayOffsets()[rank + 1] - nv_mtx.manager->getConsolidationArrayOffsets()[rank]; - - if (nv_mtx.manager->getConsolidationArrayOffsets().size() == 0) - { - std::cout << "ERROR part_offsets in glue path" << std::endl; - } - - l = nv_mtx.manager->getNumPartsToConsolidate(); // number of partitions - //some allocations/resizing -
hv.resize(nv_mtx.manager->renumbering_before_glue.size()); // host copy of nv_vec - - if (nv_mtx.manager->isRootPartition()) - { - // This works with neighbours only - hg.resize(nv_mtx.manager->getConsolidationArrayOffsets()[rank + l] - nv_mtx.manager->getConsolidationArrayOffsets()[rank]); // host copy of cvec - rc.resize(l); - di.resize(l); - } - - cudaCheckError(); - //--- unreorder the vector back (just like you did with the matrix, but only need to undo the interior-boundary reordering, because others do not apply) --- - // unreorder and copy the vector - // WARNING - // renumbering contains the inverse permutation to unreorder an amgx vector - // inverse_renumbering contains the permutation to reorder an amgx vector - thrust::copy(thrust::make_permutation_iterator(nv_vec.begin(), nv_mtx.manager->renumbering_before_glue.begin() ), - thrust::make_permutation_iterator(nv_vec.begin(), nv_mtx.manager->renumbering_before_glue.begin() + nv_mtx.manager->renumbering_before_glue.size()), - hv.begin()); - cudaCheckError(); - hv.resize(n); - - // --- construct global vector (rhs/sol) --- - //compute recvcounts and displacements for MPI_Gatherv - if (nv_mtx.manager->isRootPartition()) - { - l = nv_mtx.manager->getNumPartsToConsolidate(); // number of partitions - - //compute recvcounts and displacements for MPI_Gatherv - for (int i = 0; i < l; i++) - { - start = nv_mtx.manager->getConsolidationArrayOffsets()[nv_mtx.manager->getPartsToConsolidate()[i]]; - end = nv_mtx.manager->getConsolidationArrayOffsets()[nv_mtx.manager->getPartsToConsolidate()[i] + 1]; - rc[i] = end - start; - di[i] = k; - k += rc[i]; - } - } - - //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details) - rc_ptr = thrust::raw_pointer_cast(rc.data()); - di_ptr = thrust::raw_pointer_cast(di.data()); - hv_ptr = thrust::raw_pointer_cast(hv.data()); - hg_ptr = thrust::raw_pointer_cast(hg.data()); - cudaCheckError(); - - //gather (on the host) - if (typeid(t_VecPrec) == typeid(float)) - { - mpist = MPI_Gatherv(hv_ptr, n, MPI_FLOAT, hg_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, temp_com); - } - else if (typeid(t_VecPrec) == typeid(double)) - { - mpist = MPI_Gatherv(hv_ptr, n, MPI_DOUBLE, hg_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, temp_com); - } - else - { - FatalError("MPI_Gatherv of the vector has failed - incorrect vector data type", AMGX_ERR_CORE); - } - - if (mpist != MPI_SUCCESS) - { - FatalError("MPI_Gatherv of the vector has failed - detected incorrect MPI return code", AMGX_ERR_CORE); - } - - // clean - nv_vec.in_transfer = IDLE; - - //nv_vec.dirtybit = 0; - if (nv_vec.buffer != NULL) - { - delete nv_vec.buffer; - nv_vec.buffer = NULL; - nv_vec.buffer_size = 0; - } - - if (nv_vec.linear_buffers_size != 0) - { - amgx::memory::cudaFreeHost(&(nv_vec.linear_buffers[0])); - nv_vec.linear_buffers_size = 0; - } - - if (nv_vec.explicit_host_buffer) - { - amgx::memory::cudaFreeHost(nv_vec.explicit_host_buffer); - nv_vec.explicit_host_buffer = NULL; - nv_vec.explicit_buffer_size = 0; - cudaEventDestroy(nv_vec.mpi_event); - } - - // resize - if (nv_mtx.manager->isRootPartition()) - { - n = hg.size(); - v.resize(hg.size()); - thrust::copy(hg.begin(), hg.end(), v.begin()); - cudaCheckError(); - } - else - { - n = 0; - v.resize(0); - cudaCheckError(); - } - - // upload - v_ptr = thrust::raw_pointer_cast(v.data()); - cudaCheckError(); - upload_vector_after_glue(n, v_ptr, nv_vec, nv_mtx); - } - else - { - // ASSUMPTION: when manager has not been allocated you are running on a single rank - } - - return 0; -} - -
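(For reference: the consolidation gather in the removed glue_vector() above is the standard MPI_Gatherv recvcounts/displacements pattern, with counts derived from per-partition offset arrays. Below is a minimal, self-contained sketch of that pattern, outside of AmgX; all names are illustrative, not the AmgX API.)

```cpp
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, nranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    // part_offsets is the prefix-sum row distribution that
    // create_part_offsets() builds with MPI_Allgather + inclusive_scan;
    // here every rank simply owns 4 rows.
    std::vector<long long> part_offsets(nranks + 1, 0);
    for (int i = 0; i < nranks; ++i)
        part_offsets[i + 1] = part_offsets[i] + 4;

    const int n_loc = static_cast<int>(part_offsets[rank + 1] - part_offsets[rank]);
    std::vector<double> local(n_loc, static_cast<double>(rank)); // this rank's slice

    // recvcounts (rc) and displacements (di), as in glue_vector().
    std::vector<int> rc(nranks), di(nranks);
    for (int i = 0; i < nranks; ++i)
    {
        rc[i] = static_cast<int>(part_offsets[i + 1] - part_offsets[i]);
        di[i] = static_cast<int>(part_offsets[i]);
    }

    const int root = 0;
    std::vector<double> global(rank == root ? part_offsets[nranks] : 0);
    MPI_Gatherv(local.data(), n_loc, MPI_DOUBLE,
                global.data(), rc.data(), di.data(), MPI_DOUBLE,
                root, MPI_COMM_WORLD);

    if (rank == root)
        std::printf("root consolidated %zu entries\n", global.size());

    MPI_Finalize();
    return 0;
}
```

(Built with mpicxx and run under mpirun, the root ends up holding the concatenated slices in partition order, which is how the removed code assembled consolidated right-hand sides and solutions.)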
-template <class TConfig> -int upload_vector_after_glue(int n, void *v_ptr, Vector<TConfig> &nv_vec, Matrix<TConfig> &nv_mtx) -{ - typedef typename TConfig::VecPrec t_VecPrec; - // vector bind - nv_vec.unsetManager(); - cudaCheckError(); - - if (nv_mtx.manager != NULL) - { - nv_vec.setManager(*(nv_mtx.manager)); - } - - cudaCheckError(); - - if (nv_vec.is_transformed()) - { - nv_vec.unset_transformed(); - } - - nv_vec.set_block_dimx(1); - nv_vec.set_block_dimy(nv_mtx.get_block_dimy()); - // the dirtybit has to be set to one here to force a halo exchange before the solve and ensure correct results - // this is particularly important when the number of consolidated partitions is greater than 1 on large matrices such as drivaer9M - nv_vec.dirtybit = 1; - - if (nv_mtx.manager != NULL) - { - nv_vec.getManager()->transformAndUploadVector(nv_vec, (t_VecPrec *)v_ptr, n, nv_vec.get_block_dimy()); - } - - MPI_Barrier(MPI_COMM_WORLD); - return 0; -} - -template <class TConfig> -int unglue_vector(Matrix<TConfig> &nv_mtx, MPI_Comm &A_comm, Vector<TConfig> &nv_vec, MPI_Comm &temp_com, Vector<TConfig> &nv_vec_unglued) -{ - // unglue vectors based on dest_partitions (which lists the partitions that should be merged together) - typedef typename TConfig::IndPrec t_IndPrec; - typedef typename TConfig::VecPrec t_VecPrec; - int n_loc, l, mpist, start, end, k = 0, root = 0, rank = 0; - //MPI call parameters - t_IndPrec *sc_ptr, *di_ptr; - t_VecPrec *hv_ptr, *hg_ptr; - thrust::host_vector<t_IndPrec> sc; - thrust::host_vector<t_IndPrec> di; - //unreordered local vector on the host - thrust::host_vector<t_VecPrec> hv; - //constructed global vector on the host - thrust::host_vector<t_VecPrec> hg; - //constructed global vector on the device - device_vector_alloc<t_VecPrec> v; - //WARNING: this routine currently supports vectors only with block size =1 (it can be generalized in the future, though) - //initialize the defaults - mpist = MPI_SUCCESS; - - if (nv_mtx.manager != NULL) - { - // some initializations - rank = nv_mtx.manager->global_id(); - - if (nv_mtx.manager->getComms() != NULL) - { - nv_mtx.manager->getComms()->get_mpi_comm(); - } - - if (nv_mtx.manager->getConsolidationArrayOffsets().size() == 0) - { - printf("ERROR part_offsets\n"); - } - - n_loc = nv_mtx.manager->getConsolidationArrayOffsets()[rank + 1] - nv_mtx.manager->getConsolidationArrayOffsets()[rank]; - - l = nv_mtx.manager->getNumPartsToConsolidate(); // number of partitions - //some allocations/resizing - hv.resize(n_loc); // host copy - - if (nv_mtx.manager->isRootPartition()) - { - hg.resize(nv_vec.size()); // host copy of cvec - } - - sc.resize(l); - di.resize(l); - cudaCheckError(); - // Exchange_halo before unreordering - // Do we need that?
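// Two remarks on the scatter path that follows: the exchange below conservatively
// refreshes nv_vec's halo entries before the root unreorders and scatters it back;
// and although sc and di are sized and filled on every rank, MPI_Scatterv only
// reads sendcounts/displacements on the root, so the non-root copies are harmless.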
- nv_mtx.manager->exchange_halo(nv_vec, nv_vec.tag); - - // unreorder the vector - if (nv_mtx.manager->isRootPartition()) - { - // WARNING - // renumbering contains the inverse permutation to unreorder an amgx vector - // inverse_renumbering contains the permutation to reorder an amgx vector - thrust::copy(thrust::make_permutation_iterator(nv_vec.begin(), nv_mtx.manager->renumbering.begin() ), - thrust::make_permutation_iterator(nv_vec.begin(), nv_mtx.manager->renumbering.begin() + nv_mtx.manager->renumbering.size()), - hg.begin()); - cudaCheckError(); - hg.resize(nv_mtx.manager->getConsolidationArrayOffsets()[rank + l] - nv_mtx.manager->getConsolidationArrayOffsets()[rank]); - } - - // --- construct local vector (sol) --- - //compute sendcounts and displacements for MPI_Scatterv - for (int i = 0; i < l; i++) - { - start = nv_mtx.manager->getConsolidationArrayOffsets()[nv_mtx.manager->getPartsToConsolidate()[i]]; - end = nv_mtx.manager->getConsolidationArrayOffsets()[nv_mtx.manager->getPartsToConsolidate()[i] + 1]; - sc[i] = end - start; - di[i] = k; - k += sc[i]; - } - - //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details) - sc_ptr = thrust::raw_pointer_cast(sc.data()); - di_ptr = thrust::raw_pointer_cast(di.data()); - hv_ptr = thrust::raw_pointer_cast(hv.data()); - hg_ptr = thrust::raw_pointer_cast(hg.data()); - cudaCheckError(); - - // Scatter (on the host) - if (typeid(t_VecPrec) == typeid(float)) - { - mpist = MPI_Scatterv(hg_ptr, sc_ptr, di_ptr, MPI_FLOAT, hv_ptr, n_loc, MPI_FLOAT, root, temp_com); - } - else if (typeid(t_VecPrec) == typeid(double)) - { - mpist = MPI_Scatterv(hg_ptr, sc_ptr, di_ptr, MPI_DOUBLE, hv_ptr, n_loc, MPI_DOUBLE, root, temp_com); - } - else - { - FatalError("MPI_Scatterv of the vector has failed - incorrect vector data type", AMGX_ERR_CORE); - } - - if (mpist != MPI_SUCCESS) - { - FatalError("MPI_Scatterv of the vector has failed - detected incorrect MPI return code", AMGX_ERR_CORE); - } - - // --- Manual upload --- - // Cleaning - nv_vec_unglued.in_transfer = IDLE; - - if (nv_vec_unglued.buffer != NULL) - { - delete nv_vec_unglued.buffer; - nv_vec_unglued.buffer = NULL; - nv_vec_unglued.buffer_size = 0; - } - - if (nv_vec_unglued.linear_buffers_size != 0) - { - amgx::memory::cudaFreeHost(&(nv_vec_unglued.linear_buffers[0])); - nv_vec_unglued.linear_buffers_size = 0; - } - - if (nv_vec_unglued.explicit_host_buffer) - { - amgx::memory::cudaFreeHost(nv_vec_unglued.explicit_host_buffer); - nv_vec_unglued.explicit_host_buffer = NULL; - nv_vec_unglued.explicit_buffer_size = 0; - cudaEventDestroy(nv_vec_unglued.mpi_event); - } - - // We should avoid copies between nv_vec and hv here - nv_vec_unglued.resize( nv_mtx.manager->inverse_renumbering_before_glue.size()); - thrust::fill( nv_vec_unglued.begin(), nv_vec_unglued.end(), 0.0 ); - thrust::copy(hv.begin(), hv.end(), nv_vec_unglued.begin()); - hv.resize( nv_mtx.manager->inverse_renumbering_before_glue.size()); - cudaCheckError(); - // Manual reordering - // upload_vector_after_glue will not work here because the matrix manager was modified when the matrices were glued and no longer matches the unglued topology of this vector.
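// The permutation-iterator copy below is a gather: for each i in
// [0, inverse_renumbering_before_glue.size()) it performs
//     hv[i] = nv_vec_unglued[inverse_renumbering_before_glue[i]];
// i.e. it re-applies the stored pre-glue ordering on the fly, without materializing
// an intermediate permuted vector.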
- thrust::copy(thrust::make_permutation_iterator(nv_vec_unglued.begin(), nv_mtx.manager->inverse_renumbering_before_glue.begin() ), - thrust::make_permutation_iterator(nv_vec_unglued.begin(), nv_mtx.manager->inverse_renumbering_before_glue.begin() + nv_mtx.manager->inverse_renumbering_before_glue.size()), - hv.begin()); - cudaCheckError(); - thrust::fill( nv_vec_unglued.begin(), nv_vec_unglued.end(), 0.0 ); - thrust::copy(hv.begin(), hv.end(), nv_vec_unglued.begin()); - } - else - { - // ASSUMPTION: when manager has not been allocated you are running on a single rank - printf("Glue was called on a single rank\n"); - } - - return 0; -} - -#if 0 - -// The following code performs a halo exchange using data that doesn't match the topology of the matrix stored in its distributed manager -// Instead we use other containers stored in the distributed manager. They are suffixed by "_before_glue" -// This allows exchanging vector halos between unglued vectors from glued matrices - -template <class TConfig> -void exchange_halo_after_unglue(const Matrix<TConfig> &A, Vector<TConfig> &data, int tag, int num_ring = 1) -{ - setup_after_unglue(data, A, num_ring); //set pointers to buffer - gather_B2L_after_unglue(A, data, num_ring); //write values to buffer - exchange_halo_after_unglue(data, A, num_ring); //exchange buffers - //scatter_L2H(data); //NULL op -} -/* -template -void CommsMPIHostBufferStream::setup(DVector &b, const Matrix &m, int num_rings) { do_setup_after_unglue((b, m, num_rings);} -template -void CommsMPIHostBufferStream::exchange_halo(DVector &b, const Matrix &m, cudaEvent_t event, int tag, int num_rings) { do_exchange_halo_after_unglue((b, m, num_rings);} -template -void CommsMPIHostBufferStream::setup(FVector &b, const Matrix &m, int tag, int num_rings) { do_setup_after_unglue((b, m, num_rings);} -template -void CommsMPIHostBufferStream::exchange_halo(FVector &b, const Matrix &m, cudaEvent_t event, int tag, int num_rings) { do_exchange_halo_after_unglue((b, m, num_rings);} -*/ -template <class TConfig> -void setup_after_unglue(Vector<TConfig> &b, const Matrix<TConfig> &m, int num_rings) -{ - /* - thrust::copy( m.manager->neighbors_before_glue.begin(), m.manager->neighbors_before_glue.end(), std::ostream_iterator(std::cout, " ")); - thrust::copy( m.manager->halo_offsets_before_glue.begin(), m.manager->halo_offsets_before_glue.end(), std::ostream_iterator(std::cout, " ")); - for (int i = 0; i < m.manager->B2L_rings_before_glue.size(); ++i) - { - thrust::copy( m.manager->B2L_rings_before_glue[i].begin(), m.manager->B2L_rings_before_glue[i].end(), std::ostream_iterator(std::cout, " ")); - } - - for (int i = 0; i < m.manager->B2L_maps_before_glue.size(); ++i) - { - thrust::copy( m.manager->B2L_maps_before_glue[i].begin(), m.manager->B2L_maps_before_glue[i].end(), std::ostream_iterator(std::cout, " ")); - } - */ - if (TConfig::memSpace == AMGX_host) - { - FatalError("MPI Comms module not implemented for host", AMGX_ERR_NOT_IMPLEMENTED); - } - else - { -#ifdef AMGX_WITH_MPI - int bsize = b.get_block_size(); - int num_cols = b.get_num_cols(); - - if (bsize != 1 && num_cols != 1) - FatalError("Error: vector cannot have block size and subspace size.", - AMGX_ERR_INTERNAL); - - // set num neighbors = size of B2L_rings_before_glue - // need to do this because comms might have more neighbors than our matrix knows about - int neighbors = m.manager->B2L_rings_before_glue.size(); - m.manager->getComms()->set_neighbors(m.manager->B2L_rings_before_glue.size()); - - if (b.in_transfer & SENDING) - { - b.in_transfer = IDLE; - } - - typedef typename TConfig::template
setVecPrec<(AMGX_VecPrecision)AMGX_GET_MODE_VAL(AMGX_MatPrecision, TConfig::mode)>::Type value_type; - b.requests.resize(2 * neighbors); //first part is sends second is receives - b.statuses.resize(2 * neighbors); - - for (int i = 0; i < 2 * neighbors; i++) - { - b.requests[i] = MPI_REQUEST_NULL; - } - - int total = 0; - - for (int i = 0; i < neighbors; i++) - { - total += m.manager->B2L_rings_before_glue[i][num_rings] * bsize * num_cols; - } - - b.buffer_size = total; - - if (b.buffer == NULL) - { - b.buffer = new Vector(total); - } - else - { - if (total > b.buffer->size()) - { - b.buffer->resize(total); - } - } - - if (b.linear_buffers_size < neighbors) - { - if (b.linear_buffers_size != 0) { amgx::memory::cudaFreeHost(b.linear_buffers); } - - amgx::memory::cudaMallocHost((void **) & (b.linear_buffers), neighbors * sizeof(value_type *)); - b.linear_buffers_size = neighbors; - } - - cudaCheckError(); - total = 0; - bool linear_buffers_changed = false; - - for (int i = 0; i < neighbors; i++) - { - if (b.linear_buffers[i] != b.buffer->raw() + total) - { - linear_buffers_changed = true; - } - - b.linear_buffers[i] = b.buffer->raw() + total; - total += m.manager->B2L_rings_before_glue[i][num_rings] * bsize * num_cols; - } - - // Copy to device - if (linear_buffers_changed) - { - b.linear_buffers_ptrs.resize(neighbors); - //thrust::copy(b.linear_buffers.begin(),b.linear_buffers.end(),b.linear_buffers_ptrs.begin()); - cudaMemcpyAsync(thrust::raw_pointer_cast(&b.linear_buffers_ptrs[0]), &(b.linear_buffers[0]), neighbors * sizeof(value_type *), cudaMemcpyHostToDevice); - cudaCheckError(); - } - - int size = 0; - size = total + (m.manager->halo_offsets_before_glue[num_rings * neighbors] - m.manager->halo_offsets_before_glue[0]) * bsize * num_cols; - - if (size > 0) - { - if (b.explicit_host_buffer == NULL) - { - b.host_buffer.resize(1); - cudaEventCreateWithFlags(&b.mpi_event, cudaEventDisableTiming); - cudaCheckError(); - amgx::memory::cudaMallocHost((void **)&b.explicit_host_buffer, size * sizeof(value_type)); - cudaCheckError(); - } - else if (size > b.explicit_buffer_size) - { - amgx::memory::cudaFreeHost(b.explicit_host_buffer); - cudaCheckError(); - amgx::memory::cudaMallocHost((void **)&b.explicit_host_buffer, size * sizeof(value_type)); - cudaCheckError(); - } - - cudaCheckError(); - b.explicit_buffer_size = size; - } - -#else - FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED); -#endif - } -} -template -void gather_B2L_after_unglue(const Matrix &m, Vector &b, int num_rings = 1) -{ - if (TConfig::memSpace == AMGX_host) - { - if (m.manager->neighbors_before_glue.size() > 0) - { - FatalError("Distributed solve only supported on devices", AMGX_ERR_NOT_IMPLEMENTED); - } - } - else - { - for (int i = 0; i < m.manager->neighbors_before_glue.size(); i++) - { - int size = m.manager->B2L_rings_before_glue[i][num_rings]; - int num_blocks = min(4096, (size + 127) / 128); - - if ( size != 0) - { - if (b.get_num_cols() == 1) - { - gatherToBuffer <<< num_blocks, 128>>>(b.raw(), m.manager->B2L_maps_before_glue[i].raw(), b.linear_buffers[i], b.get_block_size(), size); - } - else - { - gatherToBufferMultivector <<< num_blocks, 128>>>(b.raw(), m.manager->B2L_maps_before_glue[i].raw(), b.linear_buffers[i], b.get_num_cols(), b.get_lda(), size); - } - - cudaCheckError(); - } - } - } -} -template -void exchange_halo_after_unglue(Vector &b, const Matrix &m, int num_rings) -{ - if (TConfig::memSpace == AMGX_host) - { - FatalError("Halo exchanges not implemented for host", 
AMGX_ERR_NOT_IMPLEMENTED); - } - else - { -#ifdef AMGX_WITH_MPI - typedef typename TConfig::VecPrec VecPrec; - cudaCheckError(); - int bsize = b.get_block_size(); - int num_cols = b.get_num_cols(); - int offset = 0; - int neighbors = m.manager->B2L_rings_before_glue.size(); - MPI_Comm mpi_comm = m.manager->getComms()->get_mpi_comm(); - - if (b.buffer_size != 0) - { - cudaMemcpy(&(b.explicit_host_buffer[0]), b.buffer->raw(), b.buffer_size * sizeof(typename TConfig::VecPrec), cudaMemcpyDeviceToHost); - } - - for (int i = 0; i < neighbors; i++) - { - int size = m.manager->B2L_rings_before_glue[i][num_rings] * bsize * num_cols; - - if (size != 0) - { - MPI_Isend(&(b.explicit_host_buffer[offset]), size * sizeof(typename TConfig::VecPrec), MPI_BYTE, m.manager->neighbors_before_glue[i], m.manager->global_id(), mpi_comm, &b.requests[i]); - } - else - { - MPI_Isend(&(b.host_buffer[0]), size * sizeof(typename TConfig::VecPrec), MPI_BYTE, m.manager->neighbors_before_glue[i], m.manager->global_id(), mpi_comm, &b.requests[i]); - } - - offset += size; - } - - b.in_transfer = RECEIVING | SENDING; - offset = 0; - - for (int i = 0; i < neighbors; i++) - { - // Count total size to receive from one neighbor - int size = 0; - - for (int j = 0; j < num_rings; j++) - { - size += m.manager->halo_offsets_before_glue[j * neighbors + i + 1] * bsize * num_cols - m.manager->halo_offsets_before_glue[j * neighbors + i] * bsize * num_cols; - } - - if (size != 0) - { - MPI_Irecv(&(b.explicit_host_buffer[b.buffer_size + offset]), size * sizeof(typename TConfig::VecPrec), MPI_BYTE, m.manager->neighbors_before_glue[i], m.manager->neighbors_before_glue[i], mpi_comm, &b.requests[neighbors + i]); - } - else - { - MPI_Irecv(&(b.host_buffer[0]), size * sizeof(typename TConfig::VecPrec), MPI_BYTE, m.manager->neighbors_before_glue[i], m.manager->neighbors_before_glue[i], mpi_comm, &b.requests[neighbors + i]); - } - - offset += size; - int required_size = m.manager->halo_offsets_before_glue[0] * bsize * num_cols + offset; - - if (required_size > b.size()) - { - // This can happen because we have 2 rings. - // required_size corresponds to "n" in the FULL view of the unconsolidated matrix. - // In the regular exchange_halo this is a fatal error since it should never happen.
- b.resize(required_size); - } - } - - MPI_Waitall(2 * neighbors, &b.requests[0], /*&b.statuses[0]*/ MPI_STATUSES_IGNORE); //I only wait to receive data; I can start working before all my buffers have been sent - b.dirtybit = 0; - b.in_transfer = IDLE; - - // copy on host ring by ring - if (num_rings == 1) - { - if (num_cols == 1) - { - if (offset != 0) - { - cudaMemcpy(b.raw() + m.manager->halo_offsets_before_glue[0]*bsize, &(b.explicit_host_buffer[b.buffer_size]), offset * sizeof(typename TConfig::VecPrec), cudaMemcpyHostToDevice); - } - } - else - { - int lda = b.get_lda(); - VecPrec *rank_start = &(b.explicit_host_buffer[b.buffer_size]); - - for (int i = 0; i < neighbors; ++i) - { - int halo_size = m.manager->halo_offsets_before_glue[i + 1] - m.manager->halo_offsets_before_glue[i]; - - for (int s = 0; s < num_cols; ++s) - { - VecPrec *halo_start = b.raw() + lda * s + m.manager->halo_offsets_before_glue[i]; - VecPrec *received_halo = rank_start + s * halo_size; - cudaMemcpy(halo_start, received_halo, halo_size * sizeof(VecPrec), cudaMemcpyHostToDevice); - } - - rank_start += num_cols * halo_size; - } - } - } - else - { - if (num_cols == 1) - { - offset = 0; - - // Copy into b, one neighbor at a time, one ring at a time - for (int i = 0 ; i < neighbors ; i++) - { - for (int j = 0; j < num_rings; j++) - { - int size = m.manager->halo_offsets_before_glue[j * neighbors + i + 1] * bsize - m.manager->halo_offsets_before_glue[j * neighbors + i] * bsize; - - if (size != 0) - { - cudaMemcpy(b.raw() + m.manager->halo_offsets_before_glue[j * neighbors + i]*bsize, &(b.explicit_host_buffer[b.buffer_size + offset]), size * sizeof(typename TConfig::VecPrec), cudaMemcpyHostToDevice); - } - - offset += size; - } - } - } - else - { - FatalError("num_rings != 1 && num_cols != 1 not supported\n", AMGX_ERR_NOT_IMPLEMENTED); - } - } - -#else - FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED); -#endif - } -} -#endif -// if 0 -#endif -//MPI -} // namespace amgx diff --git a/include/energymin/energymin_amg_level.h b/include/energymin/energymin_amg_level.h index 1e4930e4..3559b2e2 100644 --- a/include/energymin/energymin_amg_level.h +++ b/include/energymin/energymin_amg_level.h @@ -100,8 +100,6 @@ class Energymin_AMG_Level_Base : public AMG_Level virtual void computeAOperator_1x1() = 0; virtual void computeAOperator_1x1_distributed() = 0; void prepareNextLevelMatrix(const Matrix &A, Matrix &Ac) {}; - void consolidateVector(VVector &x) {}; - void unconsolidateVector(VVector &x) {}; void prolongateAndApplyCorrectionRescale(VVector &ec, VVector &bf, VVector &xf, VVector &ef, VVector &Aef); protected: diff --git a/include/solvers/solver.h b/include/solvers/solver.h index add41b5d..e143d2fb 100644 --- a/include/solvers/solver.h +++ b/include/solvers/solver.h @@ -205,8 +205,6 @@ class Solver : public AuxData // Decrement the reference counter. bool decr_ref_count() { return --m_ref_count == 0; } - void setGluedSetup(bool val) { m_skip_glued_setup = val; } - // tag used for communication int tag; @@ -285,8 +283,6 @@ class Solver : public AuxData // Timings.
float m_setup_time, m_solve_time; - bool m_skip_glued_setup; - ThreadManager *m_tmng; }; diff --git a/include/vector.h b/include/vector.h index ae66367a..0935f47d 100644 --- a/include/vector.h +++ b/include/vector.h @@ -176,11 +176,11 @@ class Vector > : publ typedef typename TConfig::IndPrec index_type; typedef cusp::array1d_format format; - Vector() : block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_host_buffer(NULL), explicit_buffer_size(0), m_unconsolidated_size(0), m_resources(0) { }; - inline Vector(unsigned int N) : thrust::host_vector(N), block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(unsigned int N, value_type v) : thrust::host_vector(N, v), block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(const Vector &a) : thrust::host_vector(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(const Vector &a) : thrust::host_vector(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} + Vector() : block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_host_buffer(NULL), explicit_buffer_size(0), m_resources(0) { }; + inline Vector(unsigned int N) : thrust::host_vector(N), block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(unsigned int N, value_type v) : thrust::host_vector(N, v), block_dimx(1), 
block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(const Vector &a) : thrust::host_vector(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(const Vector &a) : thrust::host_vector(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} ~Vector() { @@ -381,8 +381,6 @@ class Vector > : publ DistributedManager *getManager() const {return manager;} void unset_transformed() {v_is_transformed = false;} void set_transformed() {v_is_transformed = true;} - void set_unconsolidated_size(int size) {m_unconsolidated_size = size;} - int get_unconsolidated_size() { return m_unconsolidated_size;} bool is_transformed() { return v_is_transformed;} void set_is_vector_read_partitioned(bool is_read_partitioned) {v_is_read_partitioned = is_read_partitioned;} inline bool is_vector_read_partitioned() const {return v_is_read_partitioned;} @@ -395,7 +393,6 @@ class Vector > : publ volatile int cancel; //Signals to the async host-copy comms module that vector is being deallocated int delayed_send; unsigned int in_transfer; - int m_unconsolidated_size; std::vector host_buffer; value_type *explicit_host_buffer; //A separate pinned memory buffer to be used by async host-copy comms module int explicit_buffer_size; @@ -438,12 +435,12 @@ class Vector > : pu typedef typename TConfig::IndPrec index_type; typedef cusp::array1d_format format; - Vector() : block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(unsigned int N) : device_vector_alloc(N), block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(unsigned int N, int dimx, int dimy) : device_vector_alloc(N), block_dimx(dimx), block_dimy(dimy), num_rows(0), num_cols(1), lda(0), buffer(NULL), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), 
linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(unsigned int N, value_type v) : device_vector_alloc(N, v), block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(const Vector &a) : device_vector_alloc(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} - inline Vector(const Vector &a) : device_vector_alloc(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_unconsolidated_size(0), m_resources(0) {} + Vector() : block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(unsigned int N) : device_vector_alloc(N), block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(unsigned int N, int dimx, int dimy) : device_vector_alloc(N), block_dimx(dimx), block_dimy(dimy), num_rows(0), num_cols(1), lda(0), buffer(NULL), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(unsigned int N, value_type v) : device_vector_alloc(N, v), block_dimx(1), block_dimy(1), num_rows(0), num_cols(1), lda(0), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(const Vector &a) : device_vector_alloc(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), 
v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} + inline Vector(const Vector &a) : device_vector_alloc(a), block_dimx(a.get_block_dimx()), block_dimy(a.get_block_dimy()), num_rows(a.get_num_rows()), num_cols(a.get_num_cols()), lda(a.get_lda()), buffer(NULL), buffer_size(0), dirtybit(1), in_transfer(IDLE), tag(-1), delayed_send(1), cancel(0), manager(NULL), v_is_transformed(false), v_is_read_partitioned(false), host_send_recv_buffer(NULL), linear_buffers_size(0), explicit_buffer_size(0), explicit_host_buffer(NULL), m_resources(0) {} ~Vector() { @@ -667,8 +664,6 @@ class Vector > : pu DistributedManager *getManager() const {return manager;} void set_transformed() {v_is_transformed = true;} void unset_transformed() {v_is_transformed = false;} - void set_unconsolidated_size(int size) {m_unconsolidated_size = size;} - int get_unconsolidated_size() { return m_unconsolidated_size;} bool is_transformed() { return v_is_transformed;} void set_is_vector_read_partitioned(bool is_read_partitioned) {v_is_read_partitioned = is_read_partitioned;} inline bool is_vector_read_partitioned() const {return v_is_read_partitioned;} @@ -689,7 +684,6 @@ class Vector > : pu volatile int dirtybit; volatile int cancel; //Signals to the async host-copy comms module that vector is being deallocated int delayed_send; - int m_unconsolidated_size; unsigned int in_transfer; std::vector host_buffer; value_type *explicit_host_buffer; //A separate pinned memory buffer to be used by async host-copy comms module diff --git a/src/aggregation/aggregation_amg_level.cu b/src/aggregation/aggregation_amg_level.cu index 7800ab7f..faf996c4 100644 --- a/src/aggregation/aggregation_amg_level.cu +++ b/src/aggregation/aggregation_amg_level.cu @@ -475,17 +475,7 @@ void Aggregation_AMG_LevelisConsolidationLevel()) - { - max_threads = this->m_num_aggregates; - } - else - { - max_threads = this->m_num_all_aggregates; - } - + int max_threads = this->m_num_aggregates; int num_blocks = min( AMGX_GRID_MAX_SIZE, (max_threads - 1) / block_size + 1); const IndexType *R_row_offsets_ptr = this->m_R_row_offsets.raw(); const IndexType *R_column_indices_ptr = this->m_R_column_indices.raw(); @@ -501,16 +491,7 @@ void Aggregation_AMG_LevelisConsolidationLevel()) - { - max_threads = this->m_num_aggregates; - } - else - { - max_threads = this->m_num_all_aggregates; - }; + int max_threads = this->m_num_aggregates; const int num_blocks = min( AMGX_GRID_MAX_SIZE, (max_threads + block_size - 1) / block_size); @@ -938,7 +919,7 @@ void Aggregation_AMG_Level_Base::restrictResidual(VVector &r, VVector Matrix &Ac = this->getNextLevel( MemorySpace( ) )->getA(); rr.dirtybit = 1; - if (!Ac.is_matrix_singleGPU() && !this->isConsolidationLevel() && rr.delayed_send == 0) + if (!Ac.is_matrix_singleGPU() && rr.delayed_send == 0) { Matrix &Ac = this->getNextLevel( MemorySpace( ) )->getA(); //TODO problem in memoryspace transfer is here @@ -1024,17 +1005,6 @@ __global__ void export_matrix_diagonal(T *values, INDEX_TYPE bsize, INDEX_TYPE * } } -__global__ void remove_boundary(INDEX_TYPE *flags, INDEX_TYPE *maps, INDEX_TYPE size) -{ - int element = blockIdx.x * blockDim.x + threadIdx.x; - - while (element < size) - { - flags[maps[element]] = 0; //this won't be a problem, because we are overwriting the same thing - element += blockDim.x * gridDim.x; - } -} - __global__ void calc_inverse_renumbering(INDEX_TYPE *renum, INDEX_TYPE *irenum, INDEX_TYPE 
*renum_gbl, INDEX_TYPE base_index, INDEX_TYPE max_element) { int idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -1908,75 +1878,6 @@ void Aggregation_AMG_Level_Base::setNeighborAggregates() Ac.manager->halo_offsets[num_neighbors] = m_num_all_aggregates; } -//TODO: The consolidate and unconsolidate parts could be made more efficient by only sending the -// nonzero values -template -void Aggregation_AMG_Level_Base::consolidateVector(VVector &x) -{ - int my_id = this->getA().manager->global_id(); - - if (this->getA().manager->isRootPartition()) - { - // Here all partitions being consolidated should have same vector size, see TODO above - INDEX_TYPE num_parts = this->getA().manager->getNumPartsToConsolidate(); - - for (int i = 0; i < num_parts; i++) - { - int current_part = this->getA().manager->getPartsToConsolidate()[i]; - - // Vector has been set to correct size - if (current_part != my_id) - { - //printf("Root partition %d receiving %d -> %d and %d -> %d (total %d)\n", this->getA().manager->global_id(), this->getA().manager->getConsolidationArrayOffsets()[i], this->getA().manager->getConsolidationArrayOffsets()[i+1], this->getA().manager->getConsolidationArrayOffsets()[num_parts+i], this->getA().manager->getConsolidationArrayOffsets()[num_parts+i+1], (int)x.size()/x.get_block_size()); - this->getA().manager->getComms()->recv_vector(x, current_part, 10000 + current_part, x.get_block_size()*this->getA().manager->getConsolidationArrayOffsets()[i], x.get_block_size() * (this->getA().manager->getConsolidationArrayOffsets()[i + 1] - this->getA().manager->getConsolidationArrayOffsets()[i])); - this->getA().manager->getComms()->recv_vector(x, current_part, 20000 + current_part, x.get_block_size()*this->getA().manager->getConsolidationArrayOffsets()[num_parts + i], x.get_block_size() * (this->getA().manager->getConsolidationArrayOffsets()[num_parts + i + 1] - this->getA().manager->getConsolidationArrayOffsets()[num_parts + i])); - } - } - } - else - { - int my_destination_part = this->getA().manager->getMyDestinationPartition(); - int i_off, i_size, b_off, b_size; - this->getA().manager->getConsolidationOffsets(&i_off, &i_size, &b_off, &b_size); - // Here all partitions being consolidated should have same vector size, see TODO above - this->getA().manager->getComms()->send_vector_async(x, my_destination_part, 10000 + my_id, i_off * x.get_block_size(), i_size * x.get_block_size()); - this->getA().manager->getComms()->send_vector_async(x, my_destination_part, 20000 + my_id, b_off * x.get_block_size(), b_size * x.get_block_size()); - } -} - -//TODO: The consolidate and unconsolidate parts could be made more efficient by only sending the -// nonzero values -template -void Aggregation_AMG_Level_Base::unconsolidateVector(VVector &x) -{ - if (this->getA().manager->isRootPartition()) - { - INDEX_TYPE num_parts = this->getA().manager->getNumPartsToConsolidate(); - - for (int i = 0; i < num_parts; i++) - { - int current_part = this->getA().manager->getPartsToConsolidate()[i]; - - // Vector has been set to correct size - if (current_part != this->getA().manager->global_id()) - { - this->getA().manager->getComms()->send_vector_async(x, current_part, 30000 + current_part, x.get_block_size()*this->getA().manager->getConsolidationArrayOffsets()[i], x.get_block_size() * (this->getA().manager->getConsolidationArrayOffsets()[i + 1] - this->getA().manager->getConsolidationArrayOffsets()[i])); - this->getA().manager->getComms()->send_vector_async(x, current_part, 40000 + current_part, 
x.get_block_size()*this->getA().manager->getConsolidationArrayOffsets()[num_parts + i], x.get_block_size() * (this->getA().manager->getConsolidationArrayOffsets()[num_parts + i + 1] - this->getA().manager->getConsolidationArrayOffsets()[num_parts + i])); - } - } - } - else - { - int my_destination_part = this->getA().manager->getMyDestinationPartition(); - // Vector x is of unknown size - int i_off, i_size, b_off, b_size; - this->getA().manager->getConsolidationOffsets(&i_off, &i_size, &b_off, &b_size); - this->getA().manager->getComms()->recv_vector(x, my_destination_part, 30000 + this->getA().manager->global_id(), i_off * x.get_block_size(), i_size * x.get_block_size()); - this->getA().manager->getComms()->recv_vector(x, my_destination_part, 40000 + this->getA().manager->global_id(), b_off * x.get_block_size(), b_size * x.get_block_size()); - } -} - - template void Aggregation_AMG_Level_Base::createCoarseVertices() { @@ -2001,623 +1902,40 @@ void Aggregation_AMG_Level_Base::createCoarseMatrices() Matrix &A = this->getA(); Matrix &Ac = this->getNextLevel( MemorySpace( ) )->getA(); profileSubphaseFindAggregates(); - int num_parts, num_fine_neighbors, my_id; - if (!A.is_matrix_singleGPU()) + /* WARNING: do not recompute prolongation (P) and restriction (R) when you + are reusing the level structure (structure_reuse_levels > 0). + Notice that in aggregation path, prolongation P is implicit, + and is used through the aggregates array. */ + if (this->isReuseLevel() == false) { - num_parts = A.manager->getComms()->get_num_partitions(); - num_fine_neighbors = A.manager->neighbors.size(); - my_id = A.manager->global_id(); + this->setNeighborAggregates(); } - else - { - num_parts = 1; - num_fine_neighbors = 0; - my_id = 0; - } - - if (!A.is_matrix_singleGPU() && this->isConsolidationLevel()) - { - // ---------------------------------------------------- - // Consolidate multiple fine matrices into one coarse matrix - // ---------------------------------------------------- - // ---------------- - // Step 1 - // Decide which partitions should be merged together, store in destination_partitions vector - // --------------- - IVector_h &destination_part = A.manager->getDestinationPartitions(); - int my_destination_part = A.manager->getMyDestinationPartition(); - - if (my_destination_part >= num_parts) - { - FatalError("During consolidation, sending data to partition that doesn't exist", AMGX_ERR_NOT_IMPLEMENTED); - } - - // Create mapping from coarse partition indices (ranks on the coarse consolidated level) to partition indices on the fine level (ranks on the fine level) - IVector_h coarse_part_to_fine_part = destination_part; - thrust::sort(coarse_part_to_fine_part.begin(), coarse_part_to_fine_part.end()); - cudaCheckError(); - coarse_part_to_fine_part.erase(thrust::unique(coarse_part_to_fine_part.begin(), coarse_part_to_fine_part.end()), coarse_part_to_fine_part.end()); - cudaCheckError(); - //Then, the number of coarse partitions is simply the size of this vector - int num_coarse_partitions = coarse_part_to_fine_part.size(); - // Create mapping from fine partition indices to coarse partition indices, with fine partitions that are merging together having the same coarse indices - IVector_h fine_part_to_coarse_part(num_parts); - thrust::lower_bound(coarse_part_to_fine_part.begin(), coarse_part_to_fine_part.end(), destination_part.begin(), destination_part.end(), fine_part_to_coarse_part.begin()); - cudaCheckError(); - // Create mapping from this specific partition's neighbors to consolidated coarse 
neighbors, but using their fine index (aka. destination partition indices for my neighbors) - IVector_h fine_neigh_to_fine_part; - A.manager->createNeighToDestPartMap(fine_neigh_to_fine_part, A.manager->neighbors, destination_part, num_fine_neighbors); - // Create mapping from consolidated coarse neighbors to fine partition indices (even if the current partition is not going to be a root) - IVector_h coarse_neigh_to_fine_part; - int num_coarse_neighbors; - A.manager->createConsolidatedNeighToPartMap(coarse_neigh_to_fine_part, fine_neigh_to_fine_part, my_destination_part, destination_part, num_coarse_neighbors); - // Create mapping from fine neighbors to coarse neighbors, with fine neighbors this partition is merging with labeled with -1 - IVector_h fine_neigh_to_coarse_neigh; - A.manager->createNeighToConsNeigh(fine_neigh_to_coarse_neigh, coarse_neigh_to_fine_part, fine_neigh_to_fine_part, my_destination_part, num_fine_neighbors); - /* - EXAMPLE - Take the following partition graph (that describes connections between partitions, vertices are the partitions themselves), this is the same graph that is used in the setup example - number of partitions num_parts=12 - CSR row_offsets [0 4 8 13 21 25 32 36 41 46 50 57 61] - CSR col_indices [0 1 3 8 - 0 1 2 3 - 1 2 3 4 5 - 0 1 2 3 4 5 8 10 - 2 4 5 6 - 2 3 4 5 6 7 10 - 4 5 6 7 - 5 6 7 9 10 - 0 3 8 10 11 - 7 9 10 11 - 3 5 7 8 9 10 11 - 8 9 10 11] - destination_part = [0 0 0 0 4 4 4 4 8 8 8 8] - coarse_part_to_fine_part = [0 4 8] num_coarse_partitions = 3 - fine_part_to_coarse_part = [0 0 0 0 1 1 1 1 2 2 2 2] - original neighbor lists correspond to the rows of the matrix, minus the diagonal elements: (part 0)[1 3 8] (part 3)[0 1 2 4 5 8 10] (part 10)[3 5 7 8 9 11] - fine_neigh_to_fine_part (part 0)[0 0 2] (part 3)[0 0 0 0 1 2 2] (part 10)[0 1 1 2 2 2] - coarse_neigh_to_fine_part (part 0)[8] (part 3)[4 8] (part 10)[0 4] - fine_neigh_to_coarse_neigh (part 0)[-1 -1 0] (part 3)[-1 -1 -1 0 0 1 1] (part 10)[0 1 1 -1 -1 -1] - */ - // -------------------------- - // Step 2 - // Create coarse B2L_maps, by mapping fine B2L maps to coarse indices using this->m_aggregates and eliminating duplicates - // -------------------------- - std::vector coarse_B2L_maps(num_fine_neighbors); - m_num_all_aggregates = m_num_aggregates; - int num_neighbors_temp = A.manager->neighbors.size(); - int num_rings = A.manager->B2L_rings[0].size() - 1; - - if (num_rings != 1) - { - FatalError("num_rings > 1 not supported in consolidation\n", AMGX_ERR_NOT_IMPLEMENTED); - } - - IndexType max_b2l = 0; - - for (int i = 0; i < num_neighbors_temp; i++ ) { max_b2l = max_b2l > A.manager->B2L_rings[i][1] ? 
max_b2l : A.manager->B2L_rings[i][1]; } - - IVector B2L_aggregates(max_b2l); - IVector indices(max_b2l); - - //TODO: use the algorithm from setNeighborAggregates() - for (int i = 0; i < num_neighbors_temp; i++ ) - { - int size = A.manager->B2L_rings[i][1]; - thrust::fill(B2L_aggregates.begin(), B2L_aggregates.begin() + size, 0); - thrust::sequence(indices.begin(), indices.begin() + size); - //substitute coarse aggregate indices for fine boundary nodes - thrust::copy(thrust::make_permutation_iterator(this->m_aggregates.begin(), A.manager->B2L_maps[i].begin()), - thrust::make_permutation_iterator(this->m_aggregates.begin(), A.manager->B2L_maps[i].begin() + size), - B2L_aggregates.begin()); - //find the unique ones - thrust::sort_by_key(B2L_aggregates.begin(), B2L_aggregates.begin() + size, indices.begin()); - IndexType num_unique = thrust::unique_by_key(B2L_aggregates.begin(), B2L_aggregates.begin() + size, indices.begin()).first - B2L_aggregates.begin(); - coarse_B2L_maps[i].resize(num_unique); - //sort it back so we have the original ordering - thrust::sort_by_key(indices.begin(), indices.begin() + num_unique, B2L_aggregates.begin()); - thrust::copy(B2L_aggregates.begin(), B2L_aggregates.begin() + num_unique, coarse_B2L_maps[i].begin()); - } - - cudaCheckError(); - /* - * EXAMPLE - say, partition 3 has the following coarse B2L_maps: - neighbors [0 1 2 4 5 8 10] - B2L_maps[0(=0)] = [6 7 8] - B2L_maps[1(=1)] = [8 9 10] - B2L_maps[2(=2)] = [10 11 12 13] - B2L_maps[3(=4)] = [13 14 15] - B2L_maps[4(=5)] = [15 16 17] - B2L_maps[5(=8)] = [6 18 19] - B2L_maps[6(=10)] = [17 20 19] - */ - // --------------------------------------------------- - // Step 3 - // create new B2L maps for each merged destination neighbor and drop B2L maps to neighbors we are merging with - // --------------------------------------------------- - std::vector<IVector> dest_coarse_B2L_maps; - A.manager->consolidateB2Lmaps(dest_coarse_B2L_maps, coarse_B2L_maps, fine_neigh_to_coarse_neigh, num_coarse_neighbors, num_fine_neighbors); - /* - * EXAMPLE - Then, merging the coarse B2L maps on partition 3, we get: - coarse_neigh_to_fine_part [4 8] - dest_coarse_B2L_maps[0(=4)] = [13 14 15 16 17] - dest_coarse_B2L_maps[1(=8)] = [6 17 18 19 20] - */ - // ----------------------- - // Step 4 - // Create interior-boundary renumbering of aggregates according to dest_coarse_B2L_maps - // ----------------------- - // Now renumber the aggregates with all interior aggregates first, boundary aggregates second - int num_interior_aggregates; //returned by createAggregatesRenumbering - int num_boundary_aggregates; //returned by createAggregatesRenumbering - IVector renumbering; //returned by createAggregatesRenumbering - // The following call creates the renumbering array and modifies B2L_maps - A.manager->createAggregatesRenumbering(renumbering, dest_coarse_B2L_maps, this->m_num_aggregates, num_coarse_neighbors, num_interior_aggregates, num_boundary_aggregates, num_rings); - /* - * EXAMPLE - Partition 3 will get a renumbering vector of size 21, for the 21 owned aggregates: - [0 1 2 3 4 5 17 6 7 8 9 10 11 12 13 14 15 16 18 19 20] - num_interior_aggregates = 12 - num_boundary_aggregates = 9 - */ - // ------------------------------------------------- - // Step 5 - // Determine whether root partition, make list of partitions merged into one - // ------------------------------------------------ - // Check if I'm a root partition and which fine partitions (including myself) are merging into me - bool is_root_partition = false; - int num_fine_parts_to_consolidate = 0; -
IVector_h fine_parts_to_consolidate; - - for (int i = 0; i < num_parts; i++) - { - if (destination_part[i] == my_id) - { - is_root_partition = true; - num_fine_parts_to_consolidate++; - } - } - - fine_parts_to_consolidate.resize(num_fine_parts_to_consolidate); - int count = 0; - - for (int i = 0; i < num_parts; i++) - { - if (destination_part[i] == my_id) - { - fine_parts_to_consolidate[count] = i; - count++; - } - } - - //save this information as state, as this will also be required during solve for restriction/prolongation - A.manager->setIsRootPartition(is_root_partition); - A.manager->setNumPartsToConsolidate(num_fine_parts_to_consolidate); - A.manager->setPartsToConsolidate(fine_parts_to_consolidate); - /* - * EXAMPLE - isRootPartition is true for partitions 0,4,8 false for others - num_fine_parts_to_consolidate = 4 for partitions 0,4,8 - fine_parts_to_consolidate (part 0)[0 1 2 3] (part 4)[4 5 6 7] (part 8)[8 9 10 11] - */ - // ---------------------- - // Step 6 - // Compute number of interior, boundary and total nodes in the consolidated coarse matrix. Create offsets so that partitions being merged together will have their aggregate indices ordered like this: - // [num_interior(fine_parts_to_consolidate[0]] num_interior(fine_parts_to_consolidate[1]] ... num_interior(fine_parts_to_consolidate[num_fine_parts_to_consolidate] - // num_boundary(fine_parts_to_consolidate[0]] num_boundary(fine_parts_to_consolidate[1]] ... num_boundary(fine_parts_to_consolidate[num_fine_parts_to_consolidate] ] - // ---------------------- - // Gather to get number of interior/boundary aggregates of neighbors I will merge with - std::vector vertex_counts; - int interior_offset, boundary_offset, total_interior_rows_in_merged, total_boundary_rows_in_merged; - int total_rows_in_merged; - //Computes these offsets on the root, sends them back - A.manager->computeConsolidatedOffsets(my_id, my_destination_part, is_root_partition, num_interior_aggregates, num_boundary_aggregates, vertex_counts, fine_parts_to_consolidate, num_fine_parts_to_consolidate, interior_offset, boundary_offset, total_interior_rows_in_merged, total_boundary_rows_in_merged, total_rows_in_merged, A.manager->getComms()); - //Partitions save these offsets, as it will be required during solve restriction/prolongation - A.manager->setConsolidationOffsets(interior_offset, num_interior_aggregates, boundary_offset + num_interior_aggregates, num_boundary_aggregates); - /* - * EXAMPLE - For root partition 0, say we have the following interior/boundary counts (note that partition 1 has 0 boundary, as it is only connected to partitions it is merging with) - part 0 - interior: 10 boundary 3 - part 1 - interior: 18 - part 2 - interior: 10 boundary 16 - part 3 - interior: 12 boundary 9 - interior_offset for partitions 0,1,2,3: 0 10 28 38 (total_interior_rows_in_merged 50) - boundary_offset for partitions 0,1,2,3: 0 3 3 19 (total_boundary_rows_in_merged 28) - */ - // ---------------------- - // Step 7 - // Each partition renumbers its aggregates and dest_coarse_B2L_maps using offsets computed in Step 6 and permutation in Step 4 - // ---------------------- - // Kernel to renumber the aggregates - int block_size = 128; - int grid_size = std::min( 4096, ( A.manager->halo_offsets[0] + block_size - 1 ) / block_size); - renumberAggregatesKernel <<< grid_size, block_size >>>(renumbering.raw(), interior_offset, boundary_offset, this->m_aggregates.raw(), A.manager->halo_offsets[0], num_interior_aggregates, renumbering.size()); - cudaCheckError(); - - for (int i = 0; i < 
num_coarse_neighbors; i++) - { - thrust::transform(dest_coarse_B2L_maps[i].begin(), - dest_coarse_B2L_maps[i].end(), - thrust::constant_iterator<IndexType>(boundary_offset), - dest_coarse_B2L_maps[i].begin(), - thrust::plus<IndexType>()); - } - - cudaCheckError(); - /* - * EXAMPLE - Partition 3 had a renumbering vector: - [0 1 2 3 4 5 17 6 7 8 9 10 11 12 13 14 15 16 18 19 20] - which is now adjusted to account for the consolidated coarse matrices' indices: - [38 39 40 41 42 43 74 44 45 46 47 48 49 69 70 71 72 73 75 76 77] - And the dest_coarse_B2L_maps, which looked like: - dest_coarse_B2L_maps[0(=4)] = [13 14 15 16 17] - dest_coarse_B2L_maps[1(=8)] = [6 17 18 19 20] - is now: - dest_coarse_B2L_maps[0(=4)] = [69 70 71 72 73] - dest_coarse_B2L_maps[1(=8)] = [74 73 75 76 77] - */ - // ------------------------------------------------- - // Step 8 - // Send dest_coarse_B2L_maps to root partitions - // ------------------------------------------------ - // Each fine partition sends to its root the number of coarse neighbors it has, their ids, and the number of boundary nodes for each coarse neighbor - IVector_h num_bdy_per_coarse_neigh(num_coarse_neighbors); - - for (int i = 0; i < num_coarse_neighbors; i++) - { - num_bdy_per_coarse_neigh[i] = dest_coarse_B2L_maps[i].size(); - } - - IVector_h consolidated_coarse_neigh_to_fine_part; //consolidated list of coarse neighbors for the root partition, using fine partition indices - int num_consolidated_neighbors = 0; - std::vector<IVector> consolidated_B2L_maps; //concatenates dest_coarse_B2L_maps received from partitions that are merging into the same root and pointing to the same destination coarse neighbor - A.manager->consolidateB2LmapsOnRoot(num_consolidated_neighbors, consolidated_B2L_maps, consolidated_coarse_neigh_to_fine_part, dest_coarse_B2L_maps, coarse_neigh_to_fine_part, num_bdy_per_coarse_neigh, fine_parts_to_consolidate, num_fine_parts_to_consolidate, my_id, my_destination_part, is_root_partition, num_coarse_neighbors, A.manager->getComms()); - // - // Step 9 - figuring out halo aggregate IDs - // - //Now we need to update halo aggregate IDs - this is just a halo exchange on this->m_aggregates between partitions - //that are being merged together, but we need to send other halos to the root to come up with the halo renumbering - //TODO: separate transactions, send "real halo" to the root nodes (coarse neighbors) immediately - //Step 9.1: takes care of synchronizing the aggregate IDs between partitions we are merging together, and of getting consistent halo aggregate IDs for neighbors we are not merging with (which are going to be sent to the root in 9.2) - A.manager->exchange_halo(this->m_aggregates, 6666); - /* - * EXAMPLE 2 - This example is independent of the previous ones. - Say partitions 0 and 1 are merging (into 0); partition 0 is neighbors with 1,2,3 and partition 1 is neighbors with 0,3,4 - Partitions 3 and 4 are merging (into partition 3) and partition 2 is not merging with anyone. - This example details the renumbering of halo indices on partition 0 and partition 1.
- After the exchange halo, we have: - this->m_aggregates on partition 0: - [(fine interior nodes) (fine boundary nodes) (fine halo from part 1) (fine halo from part 2) (fine halo from part 3)] - [(fine interior nodes) (fine boundary nodes) (13 13 15) (12 15 17) (14 16 18)] - aggregates on partition 1: - [(fine interior nodes) (fine boundary nodes) (fine halo from part 0) (fine halo from part 3) (fine halo from part 4)] - [(fine interior nodes) (fine boundary nodes) (14 16 17) (18 19 19) (15 15 17)] - indices in (fine halo from part 0) and (fine halo from part 1) actually contain interior aggregate indices (if they are not connected to partitions 2,3 or 4), because the boundary is disappearing there. - Indices in halo regions contain remote-local indices. - - This example is used throughout consolidateAndRenumberHalos - */ - //Step 9.2 - 9.5 - IVector_h halo_offsets(num_consolidated_neighbors + 1, 0); - A.manager->consolidateAndRenumberHalos(this->m_aggregates, A.manager->halo_offsets, halo_offsets, A.manager->neighbors, num_fine_neighbors, consolidated_coarse_neigh_to_fine_part, num_consolidated_neighbors, destination_part, my_destination_part, is_root_partition, fine_parts_to_consolidate, num_fine_parts_to_consolidate, num_parts, my_id, total_rows_in_merged, this->m_num_all_aggregates, A.manager->getComms()); - - if (is_root_partition) - { - for (int i = 0; i < consolidated_B2L_maps.size(); i++) - { - thrust::sort(consolidated_B2L_maps[i].begin(), consolidated_B2L_maps[i].end()); - } - cudaCheckError(); - } + this->getA().setView(ALL); - // Step 10 do the Galerkin product - // - ViewType oldView = this->getA().currentView(); - this->getA().setView(ALL); - // If we reuse the level we keep the previous restriction operator + // Compute restriction operator + // TODO: computing the restriction operator could be merged with the selector to save some work + // If we reuse the level we keep the previous restriction operator + if (this->isReuseLevel() == false) + { + profileSubphaseComputeRestriction(); this->Profile.tic("computeR"); computeRestrictionOperator(); this->Profile.toc("computeR"); - profileSubphaseComputeCoarseA(); - this->Profile.tic("computeA"); - Ac.copyAuxData(&A); - - if (Ac.manager == NULL) - { - Ac.manager = new DistributedManager(); - } - - this->m_coarseAGenerator->computeAOperator(A, Ac, this->m_aggregates, this->m_R_row_offsets, this->m_R_column_indices, this->m_num_all_aggregates); - Ac.setColsReorderedByColor(false); - ViewType oldViewC = Ac.currentView(); - Ac.setView(FULL); - this->Profile.toc("computeA"); - - // - // Step 11, send matrices to root, consolidate - // - - if (!is_root_partition) - { - A.manager->getComms()->send_vector_async(Ac.row_offsets, my_destination_part, 1111); - A.manager->getComms()->send_vector_async(Ac.col_indices, my_destination_part, 1112); - A.manager->getComms()->send_vector_async(Ac.values, my_destination_part, 1113); - } - else - { - int total_num_rows = this->m_num_all_aggregates; - IVector new_row_offsets(total_num_rows + 1, 0); - - //if diags are inside then we won't be counting those twice when computing halo row length - if (!Ac.hasProps(DIAG)) - { - thrust::fill(new_row_offsets.begin() + halo_offsets[0], new_row_offsets.begin() + halo_offsets[num_consolidated_neighbors], 1); - cudaCheckError(); - } - - std::vector recv_row_offsets(num_fine_parts_to_consolidate); - std::vector num_nz(num_fine_parts_to_consolidate); - IVector *work_row_offsets; - std::vector index_offset_array(2 * num_fine_parts_to_consolidate + 1); - int 
interior_offset = 0; - int boundary_offset = 0; - - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - boundary_offset += vertex_counts[i][0]; - } - - int max_num_nz = 0; - - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - int current_part = fine_parts_to_consolidate[i]; - - //receive row offsets - if (current_part != my_id) - { - recv_row_offsets[i].resize(total_num_rows + 1); - A.manager->getComms()->recv_vector(recv_row_offsets[i], current_part, 1111); - work_row_offsets = &(recv_row_offsets[i]); - num_nz[i] = (*work_row_offsets)[work_row_offsets->size() - 1]; - max_num_nz = max_num_nz > num_nz[i] ? max_num_nz : num_nz[i]; - } - else - { - work_row_offsets = &(Ac.row_offsets); - num_nz[i] = Ac.get_num_nz(); - } - - //Get interior row length - thrust::transform(work_row_offsets->begin() + interior_offset + 1, - work_row_offsets->begin() + interior_offset + vertex_counts[i][0] + 1, - work_row_offsets->begin() + interior_offset, - new_row_offsets.begin() + interior_offset, - thrust::minus()); - cudaCheckError(); - //Get boundary row length - thrust::transform(work_row_offsets->begin() + boundary_offset + 1, - work_row_offsets->begin() + boundary_offset + vertex_counts[i][1] + 1, - work_row_offsets->begin() + boundary_offset, - new_row_offsets.begin() + boundary_offset, - thrust::minus()); - cudaCheckError(); - //Increment halo row length by one for every nonzero that is an edge from the halo into this partition - int size = halo_offsets[num_consolidated_neighbors] - halo_offsets[0]; - const int block_size = 128; - const int num_blocks = min( AMGX_GRID_MAX_SIZE, (size - 1) / block_size + 1); - set_halo_rowlen <<< num_blocks, block_size>>>(work_row_offsets->raw() + halo_offsets[0], new_row_offsets.raw() + halo_offsets[0], size, Ac.hasProps(DIAG)); - cudaCheckError(); - index_offset_array[i] = interior_offset; - index_offset_array[num_fine_parts_to_consolidate + i] = boundary_offset; - interior_offset += vertex_counts[i][0]; - boundary_offset += vertex_counts[i][1]; - index_offset_array[i + 1] = interior_offset; - index_offset_array[num_fine_parts_to_consolidate + i + 1] = boundary_offset; - } - - A.manager->setConsolidationArrayOffsets(index_offset_array); - //Exclusive scan row length array to get row offsets - thrust::exclusive_scan(new_row_offsets.begin(), new_row_offsets.end(), new_row_offsets.begin()); - cudaCheckError(); - //Prepare to receive column indices and values - int num_nz_consolidated = new_row_offsets[new_row_offsets.size() - 1]; - IVector recv_col_indices(max_num_nz); - IVector new_col_indices(num_nz_consolidated); - MVector recv_values((max_num_nz + 1 + Ac.hasProps(DIAG) * (halo_offsets[num_consolidated_neighbors] - 1))*Ac.get_block_size()); - MVector new_values((num_nz_consolidated + 1 + Ac.hasProps(DIAG) * (halo_offsets[num_consolidated_neighbors] - 1))*Ac.get_block_size()); - thrust::fill(new_col_indices.begin() + new_row_offsets[halo_offsets[0]], new_col_indices.end(), -1); //Set all the halo col indices to -1 - - if (!Ac.hasProps(DIAG)) { thrust::fill(new_values.begin() + num_nz_consolidated * Ac.get_block_size(), new_values.end(), types::util::get_zero()); } - - cudaCheckError(); - IVector *work_col_indices; - MVector *work_values; - interior_offset = 0; - boundary_offset = 0; - - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - int current_part = fine_parts_to_consolidate[i]; - boundary_offset += vertex_counts[i][0]; - } - - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - int current_part = 
fine_parts_to_consolidate[i]; - - if (current_part != my_id) - { - A.manager->getComms()->recv_vector(recv_col_indices, current_part, 1112, 0, num_nz[i]); - A.manager->getComms()->recv_vector(recv_values, current_part, 1113, 0, (num_nz[i] + 1 + Ac.hasProps(DIAG) * (halo_offsets[num_consolidated_neighbors] - 1))*Ac.get_block_size()); - work_col_indices = &(recv_col_indices); - work_row_offsets = &(recv_row_offsets[i]); - work_values = &(recv_values); - } - else - { - work_row_offsets = &(Ac.row_offsets); - work_col_indices = &(Ac.col_indices); - work_values = &(Ac.values); - } - - //Put interior rows in place - thrust::copy(work_col_indices->begin() + (*work_row_offsets)[interior_offset], - work_col_indices->begin() + (*work_row_offsets)[interior_offset + vertex_counts[i][0]], - new_col_indices.begin() + new_row_offsets[interior_offset]); - cudaCheckError(); - thrust::copy(work_values->begin() + (*work_row_offsets)[interior_offset]*Ac.get_block_size(), - work_values->begin() + ((*work_row_offsets)[interior_offset + vertex_counts[i][0]])*Ac.get_block_size(), - new_values.begin() + new_row_offsets[interior_offset]*Ac.get_block_size()); - cudaCheckError(); - //Put boundary rows in place - thrust::copy(work_col_indices->begin() + (*work_row_offsets)[boundary_offset], - work_col_indices->begin() + (*work_row_offsets)[boundary_offset + vertex_counts[i][1]], - new_col_indices.begin() + new_row_offsets[boundary_offset]); - cudaCheckError(); - thrust::copy(work_values->begin() + (*work_row_offsets)[boundary_offset]*Ac.get_block_size(), - work_values->begin() + ((*work_row_offsets)[boundary_offset + vertex_counts[i][1]])*Ac.get_block_size(), - new_values.begin() + new_row_offsets[boundary_offset]*Ac.get_block_size()); - cudaCheckError(); - //Process halo rows (merge) - int size = halo_offsets[num_consolidated_neighbors] - halo_offsets[0]; - const int block_size = 128; - const int num_blocks = min( AMGX_GRID_MAX_SIZE, (size - 1) / block_size + 1); - //TODO: vectorise this kernel, will be inefficient for larger block sizes - append_halo_nz <<< num_blocks, block_size>>>(work_row_offsets->raw() + halo_offsets[0], - new_row_offsets.raw() + halo_offsets[0], - work_col_indices->raw(), - new_col_indices.raw(), - work_values->raw(), - new_values.raw(), - size, Ac.hasProps(DIAG), halo_offsets[0], Ac.get_block_size()); - cudaCheckError(); - - // Diagonals - if (Ac.hasProps(DIAG)) - { - // Diagonal corresponding to interior rows - thrust::copy(work_values->begin() + (num_nz[i] + interior_offset)*Ac.get_block_size(), - work_values->begin() + (num_nz[i] + interior_offset + vertex_counts[i][0])*Ac.get_block_size(), - new_values.begin() + (new_row_offsets[halo_offsets[halo_offsets.size() - 1]] + interior_offset)*Ac.get_block_size()); - // Diagonal corresponding to boundary rows - thrust::copy(work_values->begin() + (num_nz[i] + boundary_offset)*Ac.get_block_size(), - work_values->begin() + (num_nz[i] + boundary_offset + vertex_counts[i][1])*Ac.get_block_size(), - new_values.begin() + (new_row_offsets[halo_offsets[halo_offsets.size() - 1]] + boundary_offset)*Ac.get_block_size()); - cudaCheckError(); - } - - interior_offset += vertex_counts[i][0]; - boundary_offset += vertex_counts[i][1]; - } - - Ac.set_initialized(0); - Ac.row_offsets = new_row_offsets; - Ac.col_indices = new_col_indices; - Ac.values = new_values; - } - - // Create a new distributed communicator for coarse levels that only contains active partitions - Ac.manager->setComms(A.manager->getComms()->Clone()); - 
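// [Editor's note, illustration only, not part of the patch] The merge logic above keeps
// converting between CSR row offsets and per-row lengths: lengths come out as adjacent
// differences of the offsets (the thrust::transform calls with a minus functor), and
// offsets are rebuilt with an exclusive scan. A minimal standalone sketch of that round
// trip, assuming plain int CSR data and at least one row; the helper name is hypothetical:
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/scan.h>
#include <thrust/functional.h>

void offsets_to_lengths_and_back(const thrust::device_vector<int> &row_offsets,
                                 thrust::device_vector<int> &row_lengths,
                                 thrust::device_vector<int> &rebuilt_offsets)
{
    const int n = (int)row_offsets.size() - 1;   // number of rows
    row_lengths.resize(n);
    // row_lengths[i] = row_offsets[i + 1] - row_offsets[i]
    thrust::transform(row_offsets.begin() + 1, row_offsets.end(),
                      row_offsets.begin(), row_lengths.begin(),
                      thrust::minus<int>());
    rebuilt_offsets.resize(n + 1);
    // exclusive scan turns lengths back into offsets; the last entry is the total nnz
    thrust::exclusive_scan(row_lengths.begin(), row_lengths.end(),
                           rebuilt_offsets.begin());
    rebuilt_offsets[n] = rebuilt_offsets[n - 1] + row_lengths[n - 1];
}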
Ac.manager->getComms()->createSubComm(coarse_part_to_fine_part, is_root_partition); - - // - // Step 12 - finalizing, bookkeeping - // - if (is_root_partition) - { - int my_consolidated_id = fine_part_to_coarse_part[my_id]; - - for (int i = 0; i < num_consolidated_neighbors; i++) - { - consolidated_coarse_neigh_to_fine_part[i] = fine_part_to_coarse_part[consolidated_coarse_neigh_to_fine_part[i]]; - } - - Ac.manager->initializeAfterConsolidation( - my_consolidated_id, - Ac, - consolidated_coarse_neigh_to_fine_part, - total_interior_rows_in_merged, - total_boundary_rows_in_merged, - this->m_num_all_aggregates, - halo_offsets, - consolidated_B2L_maps, - 1, - true); - Ac.manager->B2L_rings.resize(num_consolidated_neighbors + 1); - - for (int i = 0; i < num_consolidated_neighbors; i++) - { - Ac.manager->B2L_rings[i].resize(2); - Ac.manager->B2L_rings[i][0] = 0; - Ac.manager->B2L_rings[i][1] = consolidated_B2L_maps[i].size(); - } - - Ac.manager->set_initialized(Ac.row_offsets); - Ac.manager->getComms()->set_neighbors(num_consolidated_neighbors); - int new_nnz = Ac.row_offsets[Ac.row_offsets.size() - 1]; - Ac.set_num_nz(new_nnz); - Ac.set_num_cols(Ac.manager->halo_offsets[Ac.manager->halo_offsets.size() - 1]); - Ac.set_num_rows(Ac.get_num_cols()); - - if (A.hasProps(DIAG)) { Ac.addProps(DIAG); } - - Ac.computeDiagonal(); - Ac.set_initialized(1); - } - else - { - Ac.set_initialized(0); - // set size of Ac to be zero - Ac.resize(0, 0, 0, 1); - Ac.set_initialized(1); - } - - this->getA().setView(oldView); - Ac.setView(OWNED); } - else - { - /* WARNING: do not recompute prolongation (P) and restriction (R) when you - are reusing the level structure (structure_reuse_levels > 0). - Notice that in the aggregation path, prolongation P is implicit, - and is used through the aggregates array.
*/ - if (this->isReuseLevel() == false) - { - this->setNeighborAggregates(); - } - this->getA().setView(ALL); - - // Compute restriction operator - // TODO: computing the restriction operator could be merged with the selector to save some work - // If we reuse the level we keep the previous restriction operator - if (this->isReuseLevel() == false) - { - profileSubphaseComputeRestriction(); - this->Profile.tic("computeR"); - computeRestrictionOperator(); - this->Profile.toc("computeR"); - } - - profileSubphaseComputeCoarseA(); - this->Profile.tic("computeA"); - Ac.set_initialized(0); - Ac.copyAuxData(&A); - this->m_coarseAGenerator->computeAOperator(A, Ac, this->m_aggregates, this->m_R_row_offsets, this->m_R_column_indices, this->m_num_all_aggregates); - Ac.setColsReorderedByColor(false); - Ac.setView(FULL); - this->Profile.toc("computeA"); - this->prepareNextLevelMatrix(A, Ac); - A.setView(OWNED); - Ac.setView(OWNED); - } + profileSubphaseComputeCoarseA(); + this->Profile.tic("computeA"); + Ac.set_initialized(0); + Ac.copyAuxData(&A); + this->m_coarseAGenerator->computeAOperator(A, Ac, this->m_aggregates, this->m_R_row_offsets, this->m_R_column_indices, this->m_num_all_aggregates); + Ac.setColsReorderedByColor(false); + Ac.setView(FULL); + this->Profile.toc("computeA"); + this->prepareNextLevelMatrix(A, Ac); + A.setView(OWNED); + Ac.setView(OWNED); this->m_next_level_size = this->m_num_all_aggregates * Ac.get_block_dimy(); diff --git a/src/amg.cu b/src/amg.cu index d9add8a7..04583aa1 100644 --- a/src/amg.cu +++ b/src/amg.cu @@ -42,7 +42,6 @@ #include #include #include -#include #include #include @@ -70,18 +69,10 @@ AMG coarsen_threshold = cfg.getParameter("coarsen_threshold", cfg_scope); min_fine_rows = cfg.getParameter( "min_fine_rows", cfg_scope ); min_coarse_rows = cfg.getParameter( "min_coarse_rows", cfg_scope); - m_amg_consolidation_flag = cfg.getParameter("amg_consolidation_flag", cfg_scope); - m_consolidation_lower_threshold = cfg.getParameter("matrix_consolidation_lower_threshold", cfg_scope); - m_consolidation_upper_threshold = cfg.getParameter("matrix_consolidation_upper_threshold", cfg_scope); m_sum_stopping_criteria = cfg.getParameter("use_sum_stopping_criteria", cfg_scope); m_structure_reuse_levels = cfg.getParameter("structure_reuse_levels", cfg_scope); m_amg_host_levels_rows = cfg.getParameter("amg_host_levels_rows", cfg_scope); - if (m_consolidation_upper_threshold <= m_consolidation_lower_threshold) - { - FatalError("Error, matrix_consolidation_lower_threshold must be smaller than matrix_consolidation_upper_threshold", AMGX_ERR_CONFIGURATION); - } - std::string solverName, new_scope, tmp_scope; cfg.getParameter( "coarse_solver", solverName, cfg_scope, new_scope ); @@ -324,49 +315,13 @@ class AMG_Setup int num_parts = level->getA().manager->getComms()->get_num_partitions(); float avg_size = num_rows_global / num_parts; - if (avg_size < amg->m_consolidation_lower_threshold) - { - if (level->isClassicalAMGLevel()) - { - FatalError("Consolidation with classical path not supported)", AMGX_ERR_NOT_IMPLEMENTED); - } - - int new_num_parts; - bool want_neighbors = false; - level->getA().manager->computeDestinationPartitions(amg->m_consolidation_upper_threshold, - avg_size, num_parts, new_num_parts, want_neighbors); - - if (new_num_parts != num_parts) - { - level->setIsConsolidationLevel(true); - // Modify partition_rows so that non-consolidated partitions have 0 rows - // Root partitions have total number of rows to consolidate - IVector_h row_count_part(num_parts, 0); - - for (int 
i = 0; i < num_parts; i++) - { - row_count_part[level->getA().manager->getDestinationPartitions()[i]] += partition_rows[i][0]; - } - - for (int i = 0; i < num_parts; i++) - { - partition_rows[i][0] = row_count_part[i]; - } - } - } - if (!amg->m_sum_stopping_criteria) { min_partition_rows = INT_MAX; for (int i = 0; i < partition_rows.size(); i++) { - // If aggregation AMG, ignore partitions with 0 rows, since those are caused by consolidation - // If classical AMG, include all partitions - if ( level->isClassicalAMGLevel() || (!(level->isClassicalAMGLevel()) && partition_rows[i][0] != 0)) - { - min_partition_rows = std::min(partition_rows[i][0], min_partition_rows); - } + min_partition_rows = std::min(partition_rows[i][0], min_partition_rows); } } else @@ -376,12 +331,7 @@ class AMG_Setup for (int i = 0; i < partition_rows.size(); i++) { - // If aggregation AMG, ignore partitions with 0 rows, since those are caused by consolidation - // If classical AMG, include all partitions - if ( level->isClassicalAMGLevel() || (!(level->isClassicalAMGLevel()) && partition_rows[i][0] != 0)) - { - min_partition_rows += partition_rows[i][0]; - } + min_partition_rows += partition_rows[i][0]; } } } @@ -423,403 +373,7 @@ class AMG_Setup break; } - // If consolidation level and not root partition, break; - if (!level->getA().is_matrix_singleGPU() && level->isConsolidationLevel() - && !level->getA().manager->isRootPartition()) - { - amg->setCoarseSolver(NULL, MemorySpace()); - delete coarseSolver; - coarseSolver = NULL; - coarseSolverExists = false; - break; - } - - nextLevel->setup(); - // Move to the next level. - prev_level = level; - level = nextLevel; - // Increment the level counter. - amg->num_levels++; - } //end of while(true) - - return prev_level; - } - - template< typename TConfig_hd > - static - AMG_Level *setup_v2( AMG *amg, - AMG_Level *&level, - int min_rows, bool hybrid ) - { - typedef typename TConfig_hd::MemSpace MemorySpace; - typedef TemplateConfig TConfig_h; - typedef TemplateConfig TConfig_d; - typedef typename Matrix::IVector IVector_h; - typedef typename Matrix::IVector IVector_d; - typedef typename Matrix::VVector VVector_h; - typedef typename Matrix::VVector VVector_d; - typedef typename Matrix::MVector MVector_h; - typedef typename Matrix::MVector MVector_d; - typedef typename Matrix::IVector IVector_hd; - typedef typename Matrix::VVector VVector_hd; - typedef typename Matrix::MVector MVector_hd; - typedef typename MatPrecisionMap::Type ValueTypeA; - typedef typename VecPrecisionMap::Type ValueTypeB; - MemorySpace memorySpaceTag; - // The previous level. 
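// [Editor's note, illustration only, not part of the patch] With the consolidation
// special cases removed, the stopping test kept in the hunk above reduces to two plain
// reductions over the per-partition row counts: a minimum by default, or a sum when
// use_sum_stopping_criteria is set. A standalone sketch; the helper is hypothetical:
#include <vector>
#include <algorithm>
#include <climits>

int stopping_row_count(const std::vector<int> &partition_rows, bool sum_criteria)
{
    if (sum_criteria)
    {
        int total = 0;
        for (int rows : partition_rows) { total += rows; }  // global row count
        return total;
    }
    int smallest = INT_MAX;
    for (int rows : partition_rows) { smallest = std::min(smallest, rows); }
    return smallest;  // the caller compares this against min_rows / min_coarse_rows
}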
- AMG_Level *prev_level = 0L; - typedef TemplateConfig hvector_type; - typedef Vector HVector; - std::vector partition_rows(0); - HVector num_rows(1); - int64_t num_rows_global; - num_rows[0] = num_rows_global = level->getNumRows( ); - int min_partition_rows = INT_MAX, offset = 0, n = 0, num_parts = 1, num_active_parts = 0; - float avg_size; - - if (level->getA().is_matrix_distributed()) - { - num_parts = level->getA().manager->getComms()->get_num_partitions(); - level->getA( ).manager->getComms()->global_reduce(partition_rows, num_rows, - level->getA( ), level->tag * 100 + 7); - num_rows_global = 0; - - for (int i = 0; i < partition_rows.size(); i++) - { - if (partition_rows[i][0] != 0) - { - min_partition_rows = std::min(partition_rows[i][0], min_partition_rows); - num_active_parts++; - } - - num_rows_global += partition_rows[i][0]; - } - - if (min_partition_rows == INT_MAX) - { - min_partition_rows = 0; - } - } - - IVector_h row_count_part(num_parts, 0); - Solver *coarseSolver = amg->getCoarseSolver( MemorySpace() ); - bool coarseSolverExists = coarseSolver != NULL; - - // Build the remaining / all the levels on the CPU. Note: level_h is NULL if all the setup happened on the GPU. - while (true) - { - // Glue matrices of the current level - avg_size = num_rows_global / num_parts; - // Allow gluing levels other than 0 if COARSE_CLA_CONSO is true -#if COARSE_CLA_CONSO - - if (level->getA().is_matrix_distributed() && avg_size < amg->m_consolidation_lower_threshold) - { -#else - - if (level->getA().is_matrix_distributed() && avg_size < amg->m_consolidation_lower_threshold && level->getLevelIndex() == 0) - { -#endif - // Just remove level->getLevelIndex() == 0 in the previous test to allow coarse level consolidation -#ifdef AMGX_WITH_MPI - level->getA().manager->setIsGlued(false); - int new_num_parts = glue_level(amg, level, num_active_parts); - - if (new_num_parts && new_num_parts != num_active_parts) - { - if (level->getA().manager->global_id() == 0) - { - std::cout << "Level " << level->getLevelIndex() << " has been consolidated : " << num_active_parts << " --> " << new_num_parts << std::endl; - } - - // this is for coarse level consolidation - if (level->getLevelIndex() > 0) - { - level->setIsConsolidationLevel(true); - } - - level->setup(); - num_active_parts = new_num_parts; - // Modify partition_rows so that non-consolidated partitions have 0 rows - // Root partitions have total number of rows to consolidate - num_rows[0] = level->getNumRows(); - level->getA().manager->getComms()->global_reduce( partition_rows, num_rows, - level->getA(), level->tag * 100 + 33 ); - // Update some local arrays and variables - num_rows_global = 0; - - for (int i = 0; i < partition_rows.size(); i++) - { - num_rows_global += partition_rows[i][0]; - } - - for (int i = 0; i < num_parts; i++) - { - row_count_part[level->getA().manager->getDestinationPartitions()[i]] += partition_rows[i][0]; - } - - for (int i = 0; i < num_parts; i++) - { - partition_rows[i][0] = row_count_part[i]; - } - } - else - { - level->getA().manager->setIsGlued(false); - } - -#endif - } - - level->getA().getOffsetAndSizeForView(OWNED, &offset, &n); - - if (!n) - { - // no coarse solver for empty matrices?
- // maybe we can deal with this in classical amg cycle - amg->setCoarseSolver(NULL, MemorySpace()); - delete coarseSolver; - coarseSolver = NULL; - coarseSolverExists = false; - } - - //Check if you reached the coarsest level (min_partition_rows is the number of rows in this partition/rank) - //NOTE: min_rows = min_coarse_rows if async framework is disabled (min_fine_rows <= min_coarse_rows) - if (amg->num_levels >= amg->max_levels || min_partition_rows <= min_rows) - { -#if 0 //AMGX_ASYNCCPU_PROOF_OF_CONCEPT - asyncmanager::singleton()->waitall(); -#endif - - //Check if the user wishes to use DENSE_LU_SOLVER capping the matrix size, and the matrix size exceeds the maximum allowed - //NOTE: if dense_lu_max_rows=0 then either you are not using dense solver or you don't want to cap the maximum matrix size - if ((amg->m_dense_lu_max_rows != 0) && (min_partition_rows > amg->m_dense_lu_max_rows)) - { - amg->setCoarseSolver(NULL, MemorySpace()); - delete coarseSolver; - coarseSolver = NULL; - coarseSolverExists = false; - } - - //If there is no coarse solver, setup the smoother to solve the coarsest level - // If n is 0 then the matrix is consolidated so we don't setup the smoother - // We always setup the smoother on finest level - if (!coarseSolverExists) - { - level->setup_smoother(); - } - - return level; - } - - // Allocate next level or use existing one - int reuse_next_level; - AMG_Level *nextLevel; - - if (!level->getNextLevel(MemorySpace()) || (amg->m_structure_reuse_levels <= amg->num_levels && amg->m_structure_reuse_levels != -1)) - { - if (level->getNextLevel(MemorySpace())) - { - delete level->getNextLevel(MemorySpace()); - } - - reuse_next_level = 0; - level->setReuseLevel(false); - nextLevel = AMG_LevelFactory::allocate(amg, level->getSmoother()->get_thread_manager()); - level->setNextLevel( nextLevel ); - } - else - { - // reuse existing next level - reuse_next_level = 1; - level->setReuseLevel(true); - nextLevel = level->getNextLevel(MemorySpace()); - /* WARNING: we do not recompute prolongation (P) and restriction (R) when we - are reusing the level structure (structure_reuse_levels > 0), but - we do need to modify an existing coarse matrix Ac=R*A*P. - Instead of calling Ac.set_initialized(0) in every path afterwards, - we will call it here. Notice that in the if part of this statement - above when the new level is allocated it creates a new matrix which - is not initialized by default (see the matrix constructor): - AMG_Level_Factory::allocate -> Classical_AMG_LevelFactory::create -> - new Classical_AMG_Level -> new AMG_Level -> new Matrix - We are just matching this Ac.set_initialized(0) setting here.
*/ - Matrix &Ac = nextLevel->getA(); - Ac.set_initialized(0); - } - - nextLevel->setLevelIndex( amg->num_levels ); - level->getA().template setParameter("level", amg->num_levels); -#if 0 //AMGX_ASYNCCPU_PROOF_OF_CONCEPT - - if (async_global::singleton()->using_async_coloring) - { - struct task_setupsmoother : public task - { - AMG_Level *level; - bool coarseSolverExists; - - int profiler_color() {return 0x00ffff;} - std::string name() { return "setup_smoother"; } - void run() - { - // Setup smoother unless coarseSolver exists and reached coarsest level - if ( !( level->isCoarsest() && coarseSolverExists ) ) - { - level->setup_smoother(); - } - } - }; - task_setupsmoother *task_setupsmoother_ = new task_setupsmoother; - task_setupsmoother_->level = level; - task_setupsmoother_->coarseSolverExists = coarseSolverExists; - // create the aggregates (aggregation) or coarse points (classical) - level->createCoarseVertices( ); - enqueue_async(asyncmanager::singleton()->main_thread_queue(0), task_setupsmoother_); - } - else -#endif - { - // only compute aggregates if we can't reuse existing ones - if (!reuse_next_level) - { - level->createCoarseVertices( ); - } - } - - //set the amg_level_index for this matrix - nextLevel->getA().amg_level_index = amg->num_levels; - int64_t N = num_rows_global * level->getA().get_block_dimy(); - num_rows[0] = num_rows_global = level->getNumCoarseVertices(); - - // Do reduction across all partitions - if (level->getA().is_matrix_distributed()) - { - level->getA().manager->getComms()->global_reduce( partition_rows, num_rows, - level->getA(), level->tag * 100 + 8 ); - num_rows_global = 0; - - for (int i = 0; i < partition_rows.size(); i++) - { - num_rows_global += partition_rows[i][0]; - } - } - - // num_rows[0] contains the total number of rows across all partitions - int64_t nextN = num_rows_global * level->getA().get_block_dimy(); - - if (!level->getA().is_matrix_distributed()) - { - min_partition_rows = num_rows[0]; - } - else - { - // level->setIsConsolidationLevel(true); // coarse root partitions exited some time in classical - if (!amg->m_sum_stopping_criteria) - { - min_partition_rows = INT_MAX; - - for (int i = 0; i < partition_rows.size(); i++) - { - // Before we did - // If aggregation AMG, ignore partitions with 0 rows, since those are caused by consolidation - // If classical AMG, include all partitions - if (partition_rows[i][0] != 0) - { - min_partition_rows = std::min(partition_rows[i][0], min_partition_rows); - } - } - - // if we exit the previous loop with min_partition_rows == INT_MAX it means all next sizes are 0 - if (min_partition_rows == INT_MAX) - { - min_partition_rows = 0; - } - } - else - { - // use sum instead of min - min_partition_rows = 0; - - for (int i = 0; i < partition_rows.size(); i++) - { - // If aggregation AMG, ignore partitions with 0 rows, since those are caused by consolidation - // If classical AMG, include all partitions - if (partition_rows[i][0] != 0) - { - min_partition_rows += partition_rows[i][0]; - } - } - } - } - - // stop here if next level size is < min_rows - if ( nextN <= amg->coarsen_threshold * N && nextN != N && min_partition_rows >= min_rows ) - { - level->createCoarseMatrices(); - // Resize coarse vectors.
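// [Editor's note, illustration only, not part of the patch] Sizing rules for the resize
// code just below, as a sketch with hypothetical names: xc and bc take the next level's
// size, which already includes the block dimension (m_next_level_size is
// m_num_all_aggregates * block_dimy), while the residual r spans the FULL view of the
// current matrix (owned plus halo rows) scaled by block_dimy:
struct CoarseVectorSizes
{
    int xc_and_bc;  // coarse solution and coarse right-hand side
    int r;          // fine-level residual
};

CoarseVectorSizes coarse_vector_sizes(int next_level_size, int full_view_rows, int block_dimy)
{
    CoarseVectorSizes s;
    s.xc_and_bc = next_level_size;           // already scaled by block_dimy
    s.r         = full_view_rows * block_dimy;
    return s;
}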
- int nextSize = level->getNextLevelSize(); - level->getxc( ).resize( nextSize ); - level->getxc().set_block_dimy(level->getA( ).get_block_dimy()); - level->getxc().set_block_dimx(1); - level->getxc().tag = nextLevel->tag * 100 + 1; - level->getbc( ).resize( nextSize ); - level->getbc().set_block_dimy(level->getA( ).get_block_dimy()); - level->getbc().set_block_dimx(1); - level->getbc().tag = nextLevel->tag * 100 + 0; - int size, offset; - level->getA().getOffsetAndSizeForView(FULL, &offset, &size); - level->getr().resize( size * level->getA( ).get_block_dimy() ); - level->getr().set_block_dimy(level->getA( ).get_block_dimy()); - level->getr().set_block_dimx(1); - level->getr().tag = nextLevel->tag * 100 + 2; - } - else - { - // delete next level that we just created - level->deleteNextLevel( memorySpaceTag ); - } - -#if 0 //AMGX_ASYNCCPU_PROOF_OF_CONCEPT - - if (async_global::singleton()->using_async_coloring) - { - //cancel the CPU coloring task if the GPU is idle - cudaStreamSynchronize(thrust::global_thread_handle::get_stream()); - enqueue_async(asyncmanager::singleton()->global_parallel_queue, async_global::singleton()->cancel_cpu_coloring_task); - //wait for every spawning task - asyncmanager::singleton()->waitall(); - } - else -#endif - - // If n is 0 then the matrix is consolidated so we don't setup the smoother - if (!level->isCoarsest() || (!coarseSolverExists)) - { - level->setup_smoother(); - } - - if (level->isCoarsest()) - { - break; - } - - // Barrier (might be removed) - // ****************************************** - if (level->getA().is_matrix_distributed()) { level->getA().manager->getComms()->barrier(); } - - // ****************************************** nextLevel->setup(); - nextLevel->getA().setResources(level->getA().getResources()); -#if 0 //AMGX_ASYNCCPU_PROOF_OF_CONCEPT - - // color the matrix ASAP - if (!nextLevel->getA().is_matrix_setup()) - { - nextLevel->getA().setupMatrix(nextLevel->getSmoother(), *amg->m_cfg, false); - } - -#endif // Move to the next level.
prev_level = level; level = nextLevel; @@ -827,60 +381,9 @@ class AMG_Setup amg->num_levels++; } //end of while(true) -#if 0 //AMGX_ASYNCCPU_PROOF_OF_CONCEPT - cudaStreamSynchronize(thrust::global_thread_handle::threadStream[getCurrentThreadId()]); - thrust::global_thread_handle::threadStream[getCurrentThreadId()] = 0; -#endif return prev_level; } - template< typename TConfig_hd > - static - int glue_level(AMG *amg, AMG_Level *&level, int num_active_parts) - { -#ifdef AMGX_WITH_MPI - if (level->getA().manager->getComms() != NULL) - { - MPI_Comm A_com, temp_com; - int new_num_parts, n_global, num_parts, avg; - bool wantneighbors = true; - A_com = level->getA().manager->getComms()->get_mpi_comm(); - - if (level->getA().manager->part_offsets_h.size() == 0) // create part_offsets_h & part_offsets - { - create_part_offsets(A_com, level->getA()); - } - - n_global = level->getA().manager->part_offsets_h.back(); - num_parts = level->getA().manager->getComms()->get_num_partitions(); - avg = n_global / num_parts; - level->getA().manager->computeDestinationPartitions(amg->m_consolidation_upper_threshold, - avg, num_parts, new_num_parts, wantneighbors); - - if (new_num_parts != num_active_parts) - { - // Compute consolidation info - compute_glue_info(level->getA()); - // Compute a temporary split communicator to glue matrices - temp_com = compute_glue_matrices_communicator(level->getA()); - // glue_matrices does the following : unpack --> glue --> upload --> repack - glue_matrices(level->getA(), A_com, temp_com); - return new_num_parts; - } - else - { - return num_active_parts; - } - } - else - { - return 0; - } -#else - return 0; -#endif - } - template< typename TConfig0, AMGX_MemorySpace MemSpace0, AMGX_MemorySpace MemSpace1 > static void @@ -921,72 +424,7 @@ class AMG_Setup level_0->setLevelIndex( 0 ); level_0->setup(); - if (level_0->isClassicalAMGLevel() && amg->m_amg_consolidation_flag == 1 && level_0->getA().is_matrix_distributed()) - { -#ifdef AMGX_WITH_MPI - - if (amg->m_consolidation_lower_threshold == 0 ) // m_consolidation_lower_threshold is unset - { - int root = 0; - int max = 0, min = 0; - MPI_Comm comm = level_0->getA().manager->getComms()->get_mpi_comm(); - - if (level_0->getA().manager->global_id() == 0 ) - { - size_t avail, total; - cudaMemGetInfo (&avail, &total); - size_t used = level_0->bytes(); // Memory used by the finest level. - size_t hierarchy = 6 * used; // Estimation of the size of the hierarchy - size_t overhead = 1000000000; // 1GB of storage for other AMGX stuff - // The Strength factor represents how many times a matrix like the one we locally have can fit into this GPU - // This is based on the one we have on the finest level on rank 0 and considering the total hierarchy can be 6x larger - double strength = (static_cast(total - overhead)) / hierarchy; - - // The sum of memory required by coarse levels should be (approximately) smaller than or equal to 6x the memory required by the finest level. - // This assumes good load balancing - // We should check when we glue matrices that we are not going out of memory.
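// [Editor's note, illustration only, not part of the patch] The removed threshold
// heuristic around this point (its branch logic continues just below) can be read as the
// following hedged sketch. The 6x hierarchy growth and 1 GB overhead are the assumptions
// stated in the original comments; the helper name and guard are hypothetical:
#include <cstddef>

void estimate_consolidation_thresholds(size_t total_gpu_bytes,    // from cudaMemGetInfo
                                       size_t finest_level_bytes, // level_0->bytes()
                                       int finest_rows,
                                       int &lower, int &upper)
{
    const size_t overhead  = 1000000000;              // ~1 GB for other AMGX state
    const size_t hierarchy = 6 * finest_level_bytes;  // whole hierarchy ~6x the finest level
    // guard the unsigned subtraction (the original assumed total > overhead)
    const double headroom = total_gpu_bytes > overhead ? double(total_gpu_bytes - overhead) : 0.0;
    const double strength = headroom / double(hierarchy);
    upper = 1;   // defaults: effectively disable gluing
    lower = 0;

    if (strength > 1.0)
    {
        // rows-per-rank target below which gluing pays off on this GPU
        int max_rows = int(strength * finest_rows) / 6;
        if (max_rows > 0) { upper = max_rows; lower = upper - 1; }
    }
}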
- if (strength > 1.0) - { - int rows = level_0->getNumRows(); - max = (strength * rows) / 6; // We divide by 6 because we increase the size of the following coarse levels by increasing the size of the current matrix - - if (max > 0) - { - min = max - 1; - } - else - { - max = 1; - min = 0; - } - } - else - { - max = 1; - min = 0; - } - } - - MPI_Bcast( &max, 1, MPI_INT, root, comm ); - MPI_Bcast( &min, 1, MPI_INT, root, comm ); - amg->m_consolidation_lower_threshold = min; - amg->m_consolidation_upper_threshold = max; - } - - if (amg->m_consolidation_lower_threshold > 0) - { - prev_level_0 = setup_v2( amg, level_0, min_fine_rows, min_fine_rows > min_coarse_rows ); // entering in gluing path - } - else -#endif - { - prev_level_0 = setup( amg, level_0, min_fine_rows, min_fine_rows > min_coarse_rows ); // no glue because the matrix is too big - } - } - else - { - prev_level_0 = setup( amg, level_0, min_fine_rows, min_fine_rows > min_coarse_rows ); // usual path / aggregation consolidation path - } + prev_level_0 = setup( amg, level_0, min_fine_rows, min_fine_rows > min_coarse_rows ); // Move to the other memory space if needed. if ( min_fine_rows == min_coarse_rows ) diff --git a/src/amg_config.cu b/src/amg_config.cu index 0b24244e..b05a236b 100644 --- a/src/amg_config.cu +++ b/src/amg_config.cu @@ -540,7 +540,7 @@ void AMG_Config::importNamedParameter(const char *c_name, const T &c_value, cons FatalError(err.c_str(), AMGX_ERR_CONFIGURATION); } - if ( (name == "determinism_flag" || name == "block_format" || name == "separation_interior" || name == "separation_exterior" || name == "min_rows_latency_hiding" || name == "fine_level_consolidation" || name == "use_cuda_ipc_consolidation") && current_scope != "default" ) + if ( (name == "determinism_flag" || name == "block_format" || name == "separation_interior" || name == "separation_exterior" || name == "min_rows_latency_hiding") && current_scope != "default" ) { string err = "Incorrect config entry. Parameter " + name + " can only be specified with default scope."; FatalError(err.c_str(), AMGX_ERR_CONFIGURATION); @@ -1354,7 +1354,7 @@ void AMG_Config::setParameter(const string &str) FatalError(err.c_str(), AMGX_ERR_CONFIGURATION); } - if ( (name == "determinism_flag" || name == "block_format" || name == "separation_interior" || name == "separation_exterior" || name == "min_rows_latency_hiding" || name == "fine_level_consolidation" || name == "use_cuda_ipc_consolidation") && current_scope != "default" ) + if ( (name == "determinism_flag" || name == "block_format" || name == "separation_interior" || name == "separation_exterior" || name == "min_rows_latency_hiding") && current_scope != "default" ) { string err = "Incorrect config entry. 
Parameter " + name + " can only be specified with default scope."; FatalError(err.c_str(), AMGX_ERR_CONFIGURATION); diff --git a/src/amg_level.cu b/src/amg_level.cu index e8a9b4bd..f02e9fce 100644 --- a/src/amg_level.cu +++ b/src/amg_level.cu @@ -49,7 +49,7 @@ AMG_Level::~AMG_Level() } template -AMG_Level::AMG_Level(AMG_Class *amg, ThreadManager *tmng) : smoother(0), amg(amg), next_h(0), next_d(0), init(false), tag(0), is_setup(0), m_amg_level_name("AMGLevelNameNotSet"), m_is_reuse_level(false), m_is_consolidation_level(false), m_next_level_size(0) +AMG_Level::AMG_Level(AMG_Class *amg, ThreadManager *tmng) : smoother(0), amg(amg), next_h(0), next_d(0), init(false), tag(0), is_setup(0), m_amg_level_name("AMGLevelNameNotSet"), m_is_reuse_level(false), m_next_level_size(0) { Aoriginal = new Matrix(); A = Aoriginal; @@ -75,11 +75,7 @@ void AMG_Level::transfer_from(AMG_Level *ref_lvl) this->m_next_level_size = ref_lvl->m_next_level_size; this->init = ref_lvl->init; this->m_amg_level_name = ref_lvl->m_amg_level_name; - this->m_is_consolidation_level = ref_lvl->m_is_consolidation_level; this->m_is_reuse_level = ref_lvl->m_is_reuse_level; - this->m_is_root_partition = ref_lvl->m_is_root_partition; - this->m_destination_part = ref_lvl->m_destination_part; - this->m_num_parts_to_consolidate = ref_lvl->m_num_parts_to_consolidate; this->transfer_level(ref_lvl); } @@ -119,23 +115,6 @@ void AMG_Level::setup_smoother() delete this->smoother;*/ smoother->tag = this->tag * 100 + 0; ThreadManager *tmng = smoother->get_thread_manager(); -#ifdef AMGX_WITH_MPI - - if ( this->getA().is_matrix_distributed() && this->getA().manager != NULL) - { - int offset, n; - this->getA().getOffsetAndSizeForView(FULL, &offset, &n); - - //if ( this->getA().manager->isGlued() && !this->getA().manager->isRootPartition() ) { - if (!n && this->isClassicalAMGLevel()) - { - // Skip the solve in gluing path by looking at this flag - // XXXX: actually setup is skipped. 
Check if solve can/need to be skipped too - smoother->setGluedSetup(true); - } - } - -#endif // deferred execution: just push work to queue if (tmng == NULL) diff --git a/src/amgx_c.cu b/src/amgx_c.cu index 518f409b..7a13f13b 100644 --- a/src/amgx_c.cu +++ b/src/amgx_c.cu @@ -926,20 +926,9 @@ inline AMGX_RC matrix_replace_coefficients(AMGX_matrix_handle mtx, cudaSetDevice(A.getResources()->getDevice(0)); typedef typename MatPrecisionMap::Type ValueType; - if (A.manager != NULL && - (A.manager->isFineLevelConsolidated() && A.manager->getFineLevelComms()->halo_coloring != LAST || - !A.manager->isFineLevelConsolidated() && A.manager->getComms()->halo_coloring != LAST) - ) + if (A.manager != NULL && !A.is_matrix_singleGPU()) { - AMGX_CHECK_API_ERROR(AMGX_ERR_BAD_PARAMETERS, resources); - } - else if (A.manager != NULL && (A.manager->isFineLevelConsolidated() || A.manager->isFineLevelGlued())) - { - A.manager->replaceMatrixCoefficientsWithCons(n, nnz, (const ValueType *)data, (const ValueType *)diag_data); - } - else if (A.manager != NULL && !A.is_matrix_singleGPU()) - { - A.manager->replaceMatrixCoefficientsNoCons(n, nnz, (const ValueType *)data, (const ValueType *)diag_data); + A.manager->replaceMatrixCoefficients(n, nnz, (const ValueType *)data, (const ValueType *)diag_data); } else { @@ -1173,7 +1162,7 @@ inline AMGX_RC vector_download_impl(const AMGX_vector_handle vec, int block_dimy = v.get_block_dimy(); v.getManager()->getView(OWNED, n, nnz); - if (v.is_transformed() || v.getManager()->isFineLevelGlued()) + if (v.is_transformed()) { v.getManager()->revertAndDownloadVector(v, data, n, block_dimy); } @@ -1202,19 +1191,7 @@ inline AMGX_RC vector_get_size(AMGX_vector_handle vec, typedef typename VecPrecisionMap::Type ValueTypeB; VectorW wrapV(vec); VectorLetterT &v = *wrapV.wrapped(); - - //if (!wrapV.is_valid()) - // AMGX_CHECK_API_ERROR(AMGX_ERR_BAD_PARAMETERS, resources) - - if (v.getManager() != NULL && (v.getManager()->isFineLevelConsolidated() || v.getManager()->isFineLevelGlued() ) ) - { - *n = v.get_unconsolidated_size() / v.get_block_dimy(); - } - else - { - *n = v.size() / v.get_block_dimy(); - } - + *n = v.size() / v.get_block_dimy(); *block_dim = v.get_block_dimy(); return AMGX_RC_OK; } @@ -3064,14 +3041,7 @@ extern "C" { MatrixLetterT* mtx_ptr = get_mode_object_from(mtx); \ if (mtx_ptr->manager != NULL) \ { \ - if (mtx_ptr->manager->isFineLevelGlued()) \ - { \ - *n = mtx_ptr->manager->halo_offsets_before_glue[0]; \ - } \ - else \ - { \ - *n = mtx_ptr->get_num_rows(); \ - } \ + *n = mtx_ptr->get_num_rows(); \ } \ else \ { \ diff --git a/src/classical/classical_amg_level.cu b/src/classical/classical_amg_level.cu index 4c525909..4c79b25e 100644 --- a/src/classical/classical_amg_level.cu +++ b/src/classical/classical_amg_level.cu @@ -24,7 +24,6 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#define COARSE_CLA_CONSO 0 #include #include @@ -54,7 +53,6 @@ #include #include -#include namespace amgx { @@ -627,25 +625,10 @@ void Classical_AMG_Level_Base::restrictResidual(VVector &r, VVector &r { typedef typename TConfig::MemSpace MemorySpace; Matrix &Ac = this->getNextLevel( MemorySpace( ) )->getA(); -#if COARSE_CLA_CONSO - int desired_size ; - - if (this->getNextLevel(MemorySpace())->isConsolidationLevel()) - { - desired_size = std::max(P.manager->halo_offsets[P.manager->neighbors.size()], Ac.manager->halo_offsets_before_glue[Ac.manager->neighbors_before_glue.size()] * rr.get_block_size()); - } - else - { - desired_size = std::max(P.manager->halo_offsets[P.manager->neighbors.size()], Ac.manager->halo_offsets[Ac.manager->neighbors.size()] * rr.get_block_size()); - } - -#else int desired_size = std::max(P.manager->halo_offsets[P.manager->neighbors.size()], Ac.manager->halo_offsets[Ac.manager->neighbors.size()] * rr.get_block_size()); -#endif rr.resize(desired_size); } -#if 1 this->Profile.tic("restrictRes"); // Disable speculative send of rr @@ -658,7 +641,6 @@ void Classical_AMG_Level_Base::restrictResidual(VVector &r, VVector &r multiply_with_mask_restriction( R, r, rr, P); } -#endif // exchange halo residuals & add residual contribution from neighbors rr.dirtybit = 1; @@ -893,24 +875,8 @@ void Classical_AMG_Level_Base::prolongateAndApplyCorrection(VVector &e // get coarse matrix typedef typename TConfig::MemSpace MemorySpace; Matrix &Ac = this->getNextLevel( MemorySpace( ) )->getA(); -#if COARSE_CLA_CONSO - int e_size; - - if (this->getNextLevel(MemorySpace())->isConsolidationLevel()) - { - e_size = std::max(P.manager->halo_offsets[P.manager->neighbors.size()], Ac.manager->halo_offsets_before_glue[Ac.manager->neighbors_before_glue.size()]) * e.get_block_size(); - } - else - { - e_size = std::max(P.manager->halo_offsets[P.manager->neighbors.size()], Ac.manager->halo_offsets[Ac.manager->neighbors.size()]) * e.get_block_size(); - } - - if (e.size() < e_size) { e.resize(e_size); } - -#else int e_size = std::max(P.manager->halo_offsets[P.manager->neighbors.size()], Ac.manager->halo_offsets[Ac.manager->neighbors.size()]) * e.get_block_size(); e.resize(e_size); -#endif } if (P.is_matrix_singleGPU()) @@ -970,39 +936,6 @@ void Classical_AMG_Level_Base::computeAOperator_distributed() } } - -template -void Classical_AMG_Level_Base::consolidateVector(VVector &x) -{ -#ifdef AMGX_WITH_MPI -#if COARSE_CLA_CONSO - typedef typename TConfig::MemSpace MemorySpace; - Matrix &A = this->getA(); - Matrix &Ac = this->getNextLevel( MemorySpace( ) )->getA(); - MPI_Comm comm, temp_com; - comm = Ac.manager->getComms()->get_mpi_comm(); - temp_com = compute_glue_matrices_communicator(Ac); - glue_vector(Ac, comm, x, temp_com); -#endif -#endif -} - -template -void Classical_AMG_Level_Base::unconsolidateVector(VVector &x) -{ -#ifdef AMGX_WITH_MPI -#if COARSE_CLA_CONSO - typedef typename TConfig::MemSpace MemorySpace; - Matrix &A = this->getA(); - Matrix &Ac = this->getNextLevel( MemorySpace( ) )->getA(); - MPI_Comm comm, temp_com; - comm = Ac.manager->getComms()->get_mpi_comm(); - temp_com = compute_glue_matrices_communicator(Ac); - unglue_vector(Ac, comm, x, temp_com, x); -#endif -#endif -} - /**************************************** * Explict instantiations ***************************************/ diff --git a/src/core.cu b/src/core.cu index 4558b364..62bffbe0 100644 --- a/src/core.cu +++ b/src/core.cu @@ -335,17 +335,8 @@ inline void registerParameters() //Register Determinism and Exception Handling 
Parameters AMG_Config::registerParameter("determinism_flag", "a flag that forces the various aggregators and matrix coloring algorithms to be deterministic (0:non-deterministic, 1:deterministic) <0>", 0, bool_flag_values); AMG_Config::registerParameter("exception_handling", "a flag that forces internal exception processing instead of returning error codes(1:internal, 0:external)", 0, bool_flag_values); - //Register System Parameters (fine level consolidation) - AMG_Config::registerParameter("fine_level_consolidation", "flag that controls whether or not fine level is consolidated", 0, bool_flag_values); - AMG_Config::registerParameter("use_cuda_ipc_consolidation", "flag that controls whether or not to use cudaIpc for fine level consolidation", 0, bool_flag_values); - //Register System Parameters (amg level consolidation) - AMG_Config::registerParameter("amg_consolidation_flag", "flag that controls whether or not to use amg level consolidation", 0); - // matrix_consolidation_lower_threshold and matrix_consolidation_upper_threshold are obsolete - AMG_Config::registerParameter("matrix_consolidation_lower_threshold", "Average number of rows at which matrices from different processes must be merged", 0); - AMG_Config::registerParameter("matrix_consolidation_upper_threshold", "Average number of rows that merged matrices from different processes should have", 1000); //Register System Parameters (memory pools) AMG_Config::registerParameter("device_mem_pool_size", "size of the device memory pool in bytes", 256 * 1024 * 1024); - AMG_Config::registerParameter("device_consolidation_pool_size", "size of the device memory pool for root partition in bytes", 256 * 1024 * 1024); AMG_Config::registerParameter("device_mem_pool_max_alloc_size", "maximum size of a single allocation in the device memory pool in bytes", 20 * 1024 * 1024); AMG_Config::registerParameter("device_alloc_scaling_factor", "over allocation for large buffers (in %% -- a value of X will lead to 100+X%% allocations)", 10); AMG_Config::registerParameter("device_alloc_scaling_threshold", "buffers smaller than that threshold will NOT be scaled", 16 * 1024); diff --git a/src/cycles/fixed_cycle.cu b/src/cycles/fixed_cycle.cu index 616169be..f83b0ec6 100644 --- a/src/cycles/fixed_cycle.cu +++ b/src/cycles/fixed_cycle.cu @@ -38,7 +38,6 @@ #include #include #include -#include #include @@ -154,65 +153,33 @@ void FixedCycle::cycle( AMG_Class *amg, AMG_LevelProfile.toc("ComputeResidual"); //apply restriction - // in classical the current level is consolidated while in aggregation this is the next one. - // Hence, in classical, given a level L, if we want to consolidate L+1 vectors (ie coarse vectors of L) we have to look at L+1 flags. - bool consolidation_flag = false; - bool isRootPartition_flag = false; - - if (level->isClassicalAMGLevel() && !A.is_matrix_singleGPU()) // In classical consolidation we want to use A.is_matrix_distributed(), which might be an issue when n=1 - { - consolidation_flag = level->getNextLevel(MemorySpace())->isConsolidationLevel(); - isRootPartition_flag = level->getNextLevel(MemorySpace())->getA().manager->isRootPartition(); - } - else if (!level->isClassicalAMGLevel() && !A.is_matrix_singleGPU()) - { - consolidation_flag = level->isConsolidationLevel(); - isRootPartition_flag = A.manager->isRootPartition(); - } - level->Profile.tic("restrictRes"); level->restrictResidual(r, bc); level->Profile.toc("restrictRes"); - // we have to be very careful when replacing !A.is_matrix_singleGPU() by A.is_matrix_distributed().
- // In classical consolidation we want to use A.is_matrix_distributed() in order to consolidateVector / unconsolidateVector - if (!A.is_matrix_singleGPU() && consolidation_flag) - { - level->consolidateVector(bc); - level->consolidateVector(xc); - } + //mark the next level guess for initialization + level->setNextInitCycle( ); + static const AMGX_VecPrecision vecPrec = T_Config::vecPrec; + static const AMGX_MatPrecision matPrec = T_Config::matPrec; + static const AMGX_IndPrecision indPrec = T_Config::indPrec; - // This should work - if ( !( !A.is_matrix_singleGPU() && consolidation_flag && !isRootPartition_flag)) + //WARNING: coarse solver might be called inside generateNextCycles routine + if ( level->isNextCoarsest( )) { - //mark the next level guess for initialization - level->setNextInitCycle( ); - static const AMGX_VecPrecision vecPrec = T_Config::vecPrec; - static const AMGX_MatPrecision matPrec = T_Config::matPrec; - static const AMGX_IndPrecision indPrec = T_Config::indPrec; - - //WARNING: coarse solver might be called inside generateNextCycles routine - if ( level->isNextCoarsest( )) - { - //if the next level is the coarsest then don't dispatch an entire cycle, instead just launch a single Vfixed cycle. - //std::cout << "launching coarsest" << std::endl; - level->generateNextCycles( amg, bc, xc, V_CycleDispatcher( ) ); - } - else - { - //solve the next level using the cycle that was passed in - level->generateNextCycles( amg, bc, xc, CycleDispatcher( ) ); - } + //if the next level is the coarsest then don't dispatch an entire cycle, instead just launch a single Vfixed cycle. + //std::cout << "launching coarsest" << std::endl; + level->generateNextCycles( amg, bc, xc, V_CycleDispatcher( ) ); } - - if (!A.is_matrix_singleGPU() && consolidation_flag) + else { - level->unconsolidateVector(xc); + //solve the next level using the cycle that was passed in + level->generateNextCycles( amg, bc, xc, CycleDispatcher( ) ); } //prolongate correction level->prolongateAndApplyCorrection(xc, bc, x, r); level->Profile.toc("proCorr"); + //post smooth *smoothing_direction = 1; level->Profile.tic("Smoother"); diff --git a/src/distributed/distributed_io.cu b/src/distributed/distributed_io.cu index 0d58954f..e4d95135 100644 --- a/src/distributed/distributed_io.cu +++ b/src/distributed/distributed_io.cu @@ -225,11 +225,6 @@ void DistributedRead FatalError("Only integer number of partitions per rank is supported", AMGX_ERR_IO); } - if (read_partitions != partitions) - { - msg << "Found " << read_partitions << " performing consolidation\n"; - } - int partsPerRank = read_partitions / partitions; partSize.resize(partitions); thrust::fill(partSize.begin(), partSize.end(), 0); @@ -511,15 +506,6 @@ AMGX_ERROR DistributedRead -void getConsolidationFlags( const Mat *A, int *consolidate_flag, int *cuda_ipc_flag) -{ - AMG_Config *rsrc_cfg = A->getResources()->getResourcesConfig(); - std::string scope; - rsrc_cfg->getParameter("fine_level_consolidation", *consolidate_flag, "default", scope); - rsrc_cfg->getParameter("use_cuda_ipc_consolidation", *cuda_ipc_flag, "default", scope); -} - template AMGX_ERROR DistributedRead >::distributedRead(const char *fnamec, Matrix &A, Vector &b, Vector &x, int allocated_halo_depth, int part, int partitions, IVector_h &partSize, IVector_h &partitionVec, unsigned int props) { @@ -559,23 +545,7 @@ AMGX_ERROR DistributedRead* Ad = new Matrix(); //Ad->setResources(A.getResources()); Matrix *Ad; - int consolidate_flag, cuda_ipc_flag; - getConsolidationFlags( &A, &consolidate_flag, 
&cuda_ipc_flag); - - if (consolidate_flag != 0 && partitions > 1 && A.get_allow_boundary_separation()) - { - Ad = new Matrix; - Ad->setResources(resources); - } - else - { - Ad = &A; - } - - if (isClassical && consolidate_flag) - { - FatalError("Fine level consolidation not supported in CLASSICAL", AMGX_ERR_BAD_PARAMETERS); - } + Ad = &A; // Reset distributed manager if (A.manager != NULL ) @@ -631,73 +601,7 @@ AMGX_ERROR DistributedReadcacheMapsOneRing((int const **) btl_maps, (const int *)btl_sizes, (int const **)lth_maps, (const int *)lth_sizes); A.setManagerExternal(); A.manager->createComms(A.getResources()); - A.manager->setAConsolidationFlags(A); - - if (A.manager->isFineLevelConsolidated()) - { - A.addProps(CSR); - A.setColsReorderedByColor(false); - A.delProps(COO); - A.delProps(DIAG); - A.setColsReorderedByColor(false); - - if (Ah.hasProps(DIAG)) - { - A.addProps(DIAG); - } - - int nnz = Ah.get_num_nz(); - typedef typename MatPrecisionMap::Type ValueType; - int *row_ptrs = NULL, *col_indices = NULL; - void *values, *diag_data = NULL; - // Use local column indices now - col_indices = Ad->col_indices.raw(); - //row_ptrs = Ad->row_offsets.raw(); - // values = Ad->values.raw(); - //col_indices = Ad->col_indices.raw(); - // row offsets are still global and not reordered - row_ptrs = Ah.row_offsets.raw(); - values = Ah.values.raw(); - // Do pinning of some buffers since fine level consolidation crashes when only one GPU is used - int sizeof_m_val = sizeof(ValueType); - cudaHostRegister(values, nnz * block_size * sizeof_m_val, cudaHostRegisterMapped); - cudaCheckError(); - - if (Ah.hasProps(DIAG)) - { - //diag_data = (Ad->values.raw() + nnz*block_size); - diag_data = (Ah.values.raw() + nnz * block_size); - cudaHostRegister((void *)diag_data, num_rows * block_size * sizeof_m_val, cudaHostRegisterMapped); - cudaCheckError(); - } - - /* - cudaHostRegister(col_indices,nnz*sizeof(int),cudaHostRegisterMapped); - cudaCheckError(); - */ - cudaHostRegister(row_ptrs, (num_rows + 1)*sizeof(int), cudaHostRegisterMapped); - cudaCheckError(); - cudaSetDevice(A.getResources()->getDevice(0)); - A.manager->consolidateAndUploadAll(num_rows, nnz, block_dimx, block_dimy, row_ptrs, col_indices, values, diag_data, A); - A.set_initialized(1); - cudaSetDevice(A.getResources()->getDevice(0)); - - if (diag_data != NULL) - { - cudaHostUnregister(diag_data); - } - - cudaHostUnregister(values); - cudaHostUnregister(row_ptrs); - //cudaHostUnregister(col_indices); - cudaCheckError(); - delete Ad; - } - else - { - A.manager->createComms(A.getResources()); - A.manager->updateMapsReorder(); - } // End consolidation check + A.manager->updateMapsReorder(); free_maps_one_ring(num_neighbors, neighbors, btl_sizes, btl_maps, lth_sizes, lth_maps); A.set_is_matrix_read_partitioned(true); diff --git a/src/distributed/distributed_manager.cu b/src/distributed/distributed_manager.cu index b0e1d9e9..64a790d6 100644 --- a/src/distributed/distributed_manager.cu +++ b/src/distributed/distributed_manager.cu @@ -279,144 +279,6 @@ void poisson7pt_set_col_values(const index_type *__restrict__ row_offsets, index } } -template -__global__ -void set_halo_cols_values(int *row_offsets, int *col_indices, mat_value_type *values, int n, int total_rows, int bsize) -{ - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - while (tid < (total_rows - n) ) - { - int offset = row_offsets[n + tid]; - col_indices[offset] = n + tid; -#pragma unroll - - for (int i = 0; i < bsize; i++) - { - values[offset * bsize + i] = types::util::get_one(); // This is
arbitrary - } - - tid += gridDim.x * blockDim.x; - } -} - -template -__global__ -void zero_copy_row_lengths_ids_offsets(int *d_old_row_offsets, int *root_row_offsets, int *d_row_ids, int n, int total_num_halos, mat_value_type *diag) -{ - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - while (tid < n + total_num_halos) - { - int new_row_id = d_row_ids[tid]; - - if (tid < n) - { - int start = d_old_row_offsets[tid]; - int row_length = d_old_row_offsets[tid + 1] - start; // zero-copy - - if (diag != NULL) // will insert the diagonal - { - row_length++; - } - - root_row_offsets[new_row_id] = row_length; - } - - tid += gridDim.x * blockDim.x; - } -} - -template< typename mat_value_type> -__global__ -void ipc_consolidation_upload_matrix(int num_rows, int *row_ids, const int *old_row_offsets, int *new_row_offsets, const int *h_old_col_indices, int *new_col_indices, const mat_value_type *h_old_values, mat_value_type *new_values, const mat_value_type *h_old_diag, int bsize) -{ - int row = blockIdx.x * blockDim.x + threadIdx.x; - - while (row < num_rows) - { - int new_row = row_ids[row]; - int src_base = old_row_offsets[row]; - int dst_base = new_row_offsets[new_row]; - - // Insert the diagonal at the beginning of each row - if (h_old_diag != NULL) - { - new_col_indices[dst_base] = new_row; -#pragma unroll - - for (int j = 0; j < bsize; j++) - { - new_values[dst_base * bsize + j] = h_old_diag[row * bsize + j]; - } - - // Increment dst_base by one - dst_base++; - } - - int end = old_row_offsets[row + 1] - src_base; - - for (int i = 0; i < end; i++) - { - int old_col = h_old_col_indices[src_base + i]; - int new_col = row_ids[old_col]; - new_col_indices[dst_base + i] = new_col; -#pragma unroll - - for (int j = 0; j < bsize; j++) - { - new_values[ (dst_base + i)*bsize + j ] = h_old_values[ (src_base + i) * bsize + j ]; - } - } - - row += gridDim.x * blockDim.x; - } -} - - -template< typename mat_value_type> -__global__ -void ipc_consolidation_replace_values(int num_rows, int *row_ids, const int *old_row_offsets, int *new_row_offsets, const mat_value_type *h_old_values, mat_value_type *new_values, const mat_value_type *h_old_diag, int bsize) -{ - int row = blockIdx.x * blockDim.x + threadIdx.x; - - while (row < num_rows) - { - int new_row = row_ids[row]; - int src_base = old_row_offsets[row]; - int dst_base = new_row_offsets[new_row]; - - // Insert the diagonal at the beginning of each row - if (h_old_diag != NULL) - { -#pragma unroll - - for (int j = 0; j < bsize; j++) - { - new_values[dst_base * bsize + j] = h_old_diag[row * bsize + j]; - } - - // Increment dst_base by one - dst_base++; - } - - int end = old_row_offsets[row + 1] - src_base; - - for (int i = 0; i < end; i++) - { -#pragma unroll - - for (int j = 0; j < bsize; j++) - { - new_values[ (dst_base + i)*bsize + j ] = h_old_values[ (src_base + i) * bsize + j ]; - } - } - - row += gridDim.x * blockDim.x; - } -} - - __global__ void flag_halo_ids_kernel(INDEX_TYPE *flags, INDEX_TYPE *ids, INDEX_TYPE offset, INDEX_TYPE size, INDEX_TYPE upper) { int idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -639,7 +501,6 @@ __global__ void renumber_P_col_indices(INDEX_TYPE *__restrict__ col_indices, con } } - template __global__ void reorder_R_matrix(const INDEX_TYPE *old_rows, const INDEX_TYPE *old_cols, const T *old_vals, const INDEX_TYPE *rows, INDEX_TYPE *cols, T *vals, const INDEX_TYPE *renumbering, INDEX_TYPE bsize, INDEX_TYPE num_rows, INDEX_TYPE num_owned_rows) { @@ -665,8 +526,6 @@ __global__ void reorder_R_matrix(const INDEX_TYPE *old_rows, const 
INDEX_TYPE *o } } - - template __global__ void reorder_whole_matrix(INDEX_TYPE *old_rows, INDEX_TYPE *old_cols, T *old_vals, INDEX_TYPE *rows, INDEX_TYPE *cols, T *vals, INDEX_TYPE *renumbering, INDEX_TYPE bsize, INDEX_TYPE num_rows, INDEX_TYPE insert_diagonal) { @@ -802,38 +661,6 @@ __global__ void calc_rowlen_reorder(INDEX_TYPE *row_offsets, INDEX_TYPE *row_len } } - -template < class TConfig > -void DistributedManagerBase::remove_boundary(IVector_d &flagArray, IVector_d &B2L_map, int size) -{ - int num_blocks = min(4096, (size + 127) / 128); - remove_boundary_kernel <<< num_blocks, 128>>>(flagArray.raw(), B2L_map.raw(), size); - cudaCheckError(); -} - -template < class TConfig > -void DistributedManagerBase::get_unassigned(IVector_d &flagArray, IVector_d &B2L_map, IVector_d &partition_flags, int size, int global_size /*, int rank*/) -{ - int num_blocks = min(4096, (size + 191) / 192); - get_unassigned_kernel <<< num_blocks, 192>>>(flagArray.raw(), - B2L_map.raw(), - partition_flags.raw(), size, global_size /*, rank*/); - cudaCheckError(); -} - -template < class TConfig > -void DistributedManagerBase::set_unassigned(IVector_d &partition_flags, IVector_d &partition_renum, IVector_d &B2L_map, IVector_d &renumbering, int size, int max_element, int global_size /*, int rank*/) -{ - int num_blocks = min(4096, (size + 191) / 192); - set_unassigned_kernel <<< num_blocks, 192>>>(partition_flags.raw(), - partition_renum.raw(), - B2L_map.raw(), - renumbering.raw(), - size, max_element, global_size /*,rank*/); - cudaCheckError(); -} - - template inline void DistributedManagerBase::set_initialized(IVector &row_offsets) { @@ -877,98 +704,11 @@ inline void DistributedManagerBase::set_initialized(IVector &row_offset } } -template -void DistributedManagerBase::createAggregatesRenumbering(IVector_h &renumbering, IVector_h_vector &B2L_maps, int size, int num_neighbors, int &num_interior_aggregates, int &num_boundary_aggregates, int num_rings) -{ - createAggRenumbering(renumbering, B2L_maps, size, num_neighbors, num_interior_aggregates, num_boundary_aggregates, num_rings); -} - -template -void DistributedManagerBase::createAggregatesRenumbering(IVector_d &renumbering, IVector_d_vector &B2L_maps, int size, int num_neighbors, int &num_interior_aggregates, int &num_boundary_aggregates, int num_rings) -{ - createAggRenumbering(renumbering, B2L_maps, size, num_neighbors, num_interior_aggregates, num_boundary_aggregates, num_rings); -} - -template -template -void DistributedManagerBase::createAggRenumbering(IVector_hd &renumbering, std::vector &B2L_maps, int size, int num_neighbors, int &num_interior_aggregates, int &num_boundary_aggregates, int num_rings) -{ - if (num_rings != 1) - { - FatalError("num_rings > 1 not supported in consolidation", AMGX_ERR_NOT_IMPLEMENTED); - } - - //int num_neighbors = this->neighbors.size(); - if (num_neighbors == 0) - { - num_boundary_aggregates = 0; - num_interior_aggregates = size; - return; - } - - //initial size to size+1 so we have the total size after a scan - int global_size = size; - renumbering.resize(size + 1); - // - // Step 1 - in the main matrix, separate interior and boundary nodes (1/0 in flagArray), renumber interior ones with an exclusive scan - // - IVector_hd flagArray(size + 1); - thrust::fill(flagArray.begin(), flagArray.begin() + size + 1, 1); - cudaCheckError(); - - //sets 1 for interior nodes, 0 for boundary node - for (int i = 0; i < num_neighbors; i++ ) - { - int size = B2L_maps[i].size(); - remove_boundary(flagArray, B2L_maps[i], size); - } - - //gets 
the renumbering of interior nodes - thrust::exclusive_scan(flagArray.begin(), flagArray.begin() + size + 1, renumbering.begin()); - cudaCheckError(); - // - // Step 2 - Renumber nodes that are in the boundary, stepping through each B2L map, and renumbering ones that have not been renumbered yet - // - //what is the biggest B2L size - INDEX_TYPE max_size = 0; - - for (int i = 0; i < num_neighbors; i++) - { - max_size = max_size > B2L_maps[i].size() ? max_size : B2L_maps[i].size(); - } - - //allocate work vectors (should be pretty small) - IVector_hd partition_flags(max_size); - IVector_hd partition_renum(max_size); - //the number of renumbered nodes so far - int max_element = renumbering[size]; - num_interior_aggregates = max_element; - num_boundary_aggregates = size - max_element; - renumbering.resize(size); - - for (int i = 0; i < num_neighbors; i++) - { - //find nodes that are part of the current boundary and they haven't been renumbered yet - thrust::fill(partition_flags.begin(), partition_flags.begin() + max_size, 0); - int size = B2L_maps[i].size(); - get_unassigned(flagArray, B2L_maps[i], partition_flags, size, global_size/*,0*/); - //calculate the local renumbering (within this boundary region) of these nodes - thrust::exclusive_scan(partition_flags.begin(), partition_flags.begin() + max_size, partition_renum.begin()); - //apply renumbering to the big numbering table - set_unassigned(partition_flags, partition_renum, B2L_maps[i], renumbering, size, max_element, global_size/*,0*/); - //update the number of renumbered nodes - max_element += partition_renum[max_size - 1] + partition_flags[max_size - 1]; - } - - cudaCheckError(); -} - - template inline DistributedManagerBase::DistributedManagerBase(Matrix &a) : m_fine_level_comms(NULL), A(&a), m_pinned_buffer_size(0), m_pinned_buffer(NULL), _num_interior_nodes(0), _num_boundary_nodes(0), _comms(NULL), has_B2L(false), neighbors(_neighbors), B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), - halo_rows_ref_count(0), halo_btl_ref_count(0), halo_ranges(_halo_ranges), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), halo_rows(NULL), halo_btl(NULL), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false) + halo_rows_ref_count(0), halo_btl_ref_count(0), halo_ranges(_halo_ranges), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h), halo_rows(NULL), halo_btl(NULL), m_fixed_view_size(false) { cudaEventCreate(&comm_event); cudaStreamCreateWithFlags(&m_int_stream, cudaStreamNonBlocking); @@ -1451,7 +1191,7 @@ void DistributedManagerinitialize_B2L_maps_offsets(*(this->A), 1); delete prep; - //Use the exchanged halo row matrices and the boundary/halo index lists to renumber and consolidate the matrix + //Use the exchanged halo row matrices and the boundary/halo index lists to renumber the matrix // Step 5: renumber all owned rows and columns this->reorder_matrix_owned(); // Step 6: renumber local_to_global_map @@ -1556,43 +1296,6 @@ void DistributedManager -void DistributedManager >::createOneRingB2Lmaps() -{ - // Input: - // a matrix with N rows, whose column indices are local indices from 0 to N+M-1, - // where M is a number of 1-ring halo vertices - // The matrix also contains array "local_to_global_map" of size M, which stores the global index of each halo index - // Ex: assuming a column has 
index N+K, where 0 <= K < M, then its global id is local_to_global_map[K]
- // The matrix also contains part_offsets_h and part_offsets array, which stores where each partition begins
- // Output:
- // This function creates all the necessary data to do 1-ring exchanges
- // i.e. list of 1-ring neighbors, B2L_maps for 1-ring, halo_offsets for 1-ring,
- // Also, the function reorders the halo indices, such that 1-ring indices are in the order
- // of neighbors, and therefore, exchange_halo doesn't have to be changed (i.e. L2H = identity)
- // What it does:
- // Based on the global indices of its halo vertices, count the number of neighbors
- // For each neighbor, receive the halo indices that will be needed by neighbor
- // From those, create B2L_maps[0], which contains for all neighbors
- // This function assumes that:
- // part_offset is defined
-
- // B2L_maps
- int my_id = this->global_id();
- int num_parts = this->get_num_partitions();
- this->set_base_index(this->part_offsets_h[my_id]);
- this->set_index_range(this->part_offsets_h[my_id + 1] - this->part_offsets_h[my_id]);
- DistributedArranger *prep = new DistributedArranger;
- // This function creates the array neighbors, which contains a list of partitions to which data
- // needs to be sent and/or received
- prep->create_neighbors_v2(*(this->A));
- // Here change the manager if some partitions have no neighbors
- this->getComms()->set_neighbors(this->neighbors.size());
- prep->create_B2L_one_ring(*(this->A));
- delete prep;
-}
-
 template
 void DistributedManager >::createOneRingHaloRows()
 {
@@ -1622,7 +1325,7 @@ inline DistributedManagerBase::DistributedManagerBase( INDEX_TYPE num_import_rings, int num_neighbors, const VecInt_t *neighbors_) : m_fine_level_comms(NULL), A(&a), m_pinned_buffer_size(0), m_pinned_buffer(NULL), _num_interior_nodes(0), _num_boundary_nodes(0), _comms(NULL), has_B2L(false), neighbors(_neighbors), halo_rows_ref_count(0), halo_rows(NULL), halo_btl_ref_count(0), halo_btl(NULL), halo_ranges(_halo_ranges), halo_ranges_h(_halo_ranges_h), part_offsets(_part_offsets), part_offsets_h(_part_offsets_h),
- B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), m_is_root_partition(false), m_is_glued(false), m_is_fine_level_glued(false), m_is_fine_level_consolidated(false), m_is_fine_level_root_partition(false), m_use_cuda_ipc_consolidation(false), m_fixed_view_size(false)
+ B2L_maps(_B2L_maps), L2H_maps(_L2H_maps), B2L_rings(_B2L_rings), m_fixed_view_size(false)
 {
 cudaStreamCreateWithFlags(&m_int_stream, cudaStreamNonBlocking);
 cudaStreamCreateWithFlags(&m_bdy_stream, cudaStreamNonBlocking);
@@ -1720,32 +1423,10 @@ inline void DistributedManagerBase::cacheMapsOneRing(const VecInt_t **b
 }
 }

-template
-void DistributedManagerBase::setAConsolidationFlags( Matrix &in_A)
-{
- this->A = &in_A;
- AMG_Config *rsrc_cfg = this->A->getResources()->getResourcesConfig();
- std::string scope;
- int consolidate_flag, cuda_ipc_flag;
- rsrc_cfg->getParameter("fine_level_consolidation", consolidate_flag, "default", scope);
- rsrc_cfg->getParameter("use_cuda_ipc_consolidation", cuda_ipc_flag, "default", scope);
- this->m_is_fine_level_consolidated = (consolidate_flag != 0);
- this->m_use_cuda_ipc_consolidation = (cuda_ipc_flag != 0);
-}
-
 template
 void DistributedManagerBase::uploadMatrix(int n, int nnz, int block_dimx, int block_dimy, const int *row_ptrs, const int *col_indices, const void *data, const void *diag, Matrix &in_A)
 {
- this->setAConsolidationFlags(in_A);
-
- if (this->m_is_fine_level_consolidated)
- {
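- // With fine_level_consolidation=1 in the resources config (read by setAConsolidationFlags
- // above), the matrix pieces of several ranks are merged onto one root rank at upload
- // time; otherwise each rank uploads and reorders only its own partition.
-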
this->A->manager->consolidateAndUploadAll(n, nnz, block_dimx, block_dimy, row_ptrs, col_indices, data, diag, *(this->A)); - } - else - { - this->A->manager->initializeUploadReorderAll(n, nnz, block_dimx, block_dimy, row_ptrs, col_indices, data, diag, *(this->A)); - } + this->A->manager->initializeUploadReorderAll(n, nnz, block_dimx, block_dimy, row_ptrs, col_indices, data, diag, *(this->A)); } template @@ -2199,7 +1880,7 @@ void DistributedManagerBase::updateMapsReorder() prep->create_B2L_from_maps( (*(this->A)), my_id, this->num_halo_rings(), neighbors, B2L_maps, L2H_maps, B2L_rings, comms_, &halo_rows, &halo_btl); DistributedManagerBaseInit(my_id, 0, this->A->get_num_rows(), *(this->A), comms_, NULL, NULL); - //Use the exchanged halo row matrices and the boundary/halo index lists to renumber and consolidate the matrix + //Use the exchanged halo row matrices and the boundary/halo index lists to renumber the matrix this->reorder_matrix(); prep->initialize_B2L_maps_offsets(*(this->A), this->num_halo_rings()); delete prep; @@ -3136,1576 +2817,82 @@ void DistributedManagerBase::createNeighToDestPartMap(IVector_h &neigh_ } template -void DistributedManagerBase::createConsolidatedNeighToPartMap(IVector_h &cons_neigh_to_part, IVector_h &neigh_to_part, int my_destination_part, IVector_h &destination_part, int &num_cons_neighbors) +void DistributedManagerBase::read_halo_ids(int size, IVector_d &scratch, IVector_d &halo_aggregates, VecInt_t min_index_coarse_halo) { - // input: non-initialized cons_neigh_to_part - // fine_neigh_to_part - // my_destination_part - // output: cons_neigh_to_part - // num_cons_neighbors - cons_neigh_to_part = neigh_to_part; - thrust::sort(cons_neigh_to_part.begin(), cons_neigh_to_part.end()); - cudaCheckError(); - cons_neigh_to_part.erase(thrust::unique(cons_neigh_to_part.begin(), cons_neigh_to_part.end()), cons_neigh_to_part.end()); - // Remove if fine_neigh maps to same coarse partition - cons_neigh_to_part.erase(thrust::remove_if(cons_neigh_to_part.begin(), cons_neigh_to_part.end(), is_my_part(my_destination_part)), cons_neigh_to_part.end()); - num_cons_neighbors = cons_neigh_to_part.size(); + int block_size = 128; + const int num_blocks = min( AMGX_GRID_MAX_SIZE, (size - 1) / block_size + 1); + read_halo_ids_kernel <<< num_blocks, block_size>>>(scratch.raw(), halo_aggregates.raw(), min_index_coarse_halo, size); cudaCheckError(); } +template +void DistributedManagerBase::read_halo_ids(int size, IVector_h &scratch, IVector_h &halo_aggregates, VecInt_t min_index_coarse_halo) +{ + FatalError("read_halo_ids not implemented on host yet", AMGX_ERR_NOT_IMPLEMENTED); +} + template -void DistributedManagerBase::createNeighToConsNeigh(IVector_h &neigh_to_cons_neigh, IVector_h &cons_neigh_to_part, IVector_h &neigh_to_part, int my_destination_part, int &num_neighbors) +void DistributedManagerBase::flag_halo_ids(int size, IVector_d &scratch, IVector_d &halo_aggregates, VecInt_t min_index_coarse_halo, int max_index, int min_index) { - neigh_to_cons_neigh.resize(num_neighbors); - thrust::lower_bound(cons_neigh_to_part.begin(), cons_neigh_to_part.end(), neigh_to_part.begin(), neigh_to_part.end(), neigh_to_cons_neigh.begin()); + int block_size = 128; + const int num_blocks = min( AMGX_GRID_MAX_SIZE, (size - 1) / block_size + 1); + flag_halo_ids_kernel <<< num_blocks, block_size>>>(scratch.raw(), halo_aggregates.raw(), min_index_coarse_halo, size, max_index - min_index + 1); cudaCheckError(); - - // Flagging fine neighbors that go to same partition (haven't been found in previous step) 
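- // Worked example (hypothetical values): with destination_part = [0 0 2 3 3], a rank
- // whose neighbors map to neigh_to_part = [0 3 3] and whose my_destination_part = 0
- // gets cons_neigh_to_part = [3]; the lower_bound above then yields
- // neigh_to_cons_neigh = [0 0 0], and the loop below rewrites entry 0 to -1 because
- // that neighbor merges into our own consolidated partition.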
- for (int i = 0; i < num_neighbors; i++)
- {
- if ( neigh_to_part[i] == my_destination_part)
- {
- neigh_to_cons_neigh[i] = -1;
- }
- }
-}

 template
-template
-void DistributedManagerBase::consB2Lmaps(std::vector &dest_coarse_B2L_maps, std::vector &coarse_B2L_maps, IVector_h &fine_neigh_to_coarse_neigh, int num_coarse_neighbors, int num_fine_neighbors)
+void DistributedManagerBase::flag_halo_ids(int size, IVector_h &scratch, IVector_h &halo_aggregates, VecInt_t min_index_coarse_halo, int max_index, int min_index)
 {
- //Merge B2L fine maps per coarse destination
- dest_coarse_B2L_maps.resize(num_coarse_neighbors);
- std::vector dest_coarse_B2L_maps_scratch_sizes(num_coarse_neighbors, 0);
- int my_id = this->global_id();
-
- // Loop over the fine neighbors, to compute size of each dest_coarse_B2L_maps
- for (int i = 0; i < num_fine_neighbors; i++)
- {
- int k = fine_neigh_to_coarse_neigh[i];
+ FatalError("flag_halo_ids not implemented on host yet", AMGX_ERR_NOT_IMPLEMENTED);
+}

- if (k != -1)
- {
- dest_coarse_B2L_maps_scratch_sizes[k] += coarse_B2L_maps[i].size();
- }
- }
+template
+void DistributedManager >::replaceMatrixCoefficients(int n, int nnz, const mat_value_type *data_pinned, const mat_value_type *diag_pinned)
+{
+ //matrix parameters
+ //int num_nnz = this->A->get_num_nz();
+ int num_rows = this->halo_offsets[0];
+ int total_rows = num_rows + this->num_halo_rows();
+ int block_size = this->A->get_block_size();
+ mat_value_type *data_hd = NULL;
+ mat_value_type *diag_hd = NULL;
+ int data_alloc = 0;
+ int diag_alloc = 0;
+ //cuda parameters
+ int num_blocks = min(4096, (num_rows + 127) / 128);

- // Now fill dest_coarse_B2L_maps
- for (int k = 0; k < num_coarse_neighbors; k++)
- {
- dest_coarse_B2L_maps[k].resize( dest_coarse_B2L_maps_scratch_sizes[k] );
- // Reset sizes to 0 (will use as offset in next loop);
- dest_coarse_B2L_maps_scratch_sizes[k] = 0;
- }
+ /* WARNING: the number of non-zero elements (nnz) in the array data_pinned and A->values (num_nnz) might be different at this point.
+ 1. If the matrix has CSR property and therefore diagonal is included in the matrix, these values will be the same.
+ 2. If the matrix has DIAG property and therefore diagonal is originally stored separately, and later appended to the array
+ of values, and subsequently inserted into the matrix, then num_nnz = nnz + n. We have to account for this fact when replacing the
+ coefficients (and use nnz not num_nnz).
+ obs.: see calls to computeDiagonal (matrix.cu), AMGX_matrix_upload and AMGX_replace_coefficients (amgx_c.cu), and
+ uploadMatrix and replaceMatrixCoefficients[No|With]Cons (distributed_manager.cu) for details.
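+ For example (hypothetical sizes): a 4-row matrix uploaded with a separate diagonal and
+ nnz = 10 off-diagonal entries ends up with num_nnz = nnz + n = 14 values in A->values,
+ while data_pinned still holds only the nnz = 10 entries being replaced.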
*/ - for (int i = 0; i < num_fine_neighbors; i++) + /* check early exit */ + if (this->neighbors.size() == 0 || this->renumbering.size() == 0) { - int k = fine_neigh_to_coarse_neigh[i]; - - if (k != -1) - { - int offset = dest_coarse_B2L_maps_scratch_sizes[k]; - thrust::copy(coarse_B2L_maps[i].begin(), coarse_B2L_maps[i].end(), dest_coarse_B2L_maps[k].begin() + offset); - dest_coarse_B2L_maps_scratch_sizes[k] += coarse_B2L_maps[i].size(); - } + return; } cudaCheckError(); - int max_size = 0; - - for (int i = 0; i < num_coarse_neighbors; i++) - { - int size = dest_coarse_B2L_maps[i].size(); - - if (size > max_size) { max_size = size; } - } + /* allocate if data and diag if they are not pinned */ + data_hd = (mat_value_type *) this->getDevicePointerForData((void *)data_pinned, nnz * block_size * sizeof(mat_value_type), &data_alloc); - // Remove duplicates (aggregates in boundary that go to same merged partition) - for (int i = 0; i < num_coarse_neighbors; i++) + if (diag_pinned != NULL) { - int size = dest_coarse_B2L_maps[i].size(); - thrust::sort(dest_coarse_B2L_maps[i].begin(), dest_coarse_B2L_maps[i].begin() + size); - index_type num_unique = thrust::unique(dest_coarse_B2L_maps[i].begin(), dest_coarse_B2L_maps[i].begin() + size) - dest_coarse_B2L_maps[i].begin(); - dest_coarse_B2L_maps[i].erase(dest_coarse_B2L_maps[i].begin() + num_unique, dest_coarse_B2L_maps[i].end()); + diag_hd = (mat_value_type *) this->getDevicePointerForData((void *)diag_pinned, num_rows * block_size * sizeof(mat_value_type), &diag_alloc); } - cudaCheckError(); -} - -template -void DistributedManagerBase::computeConsolidatedOffsets(const int my_id, const int my_destination_part, const bool is_root_partition, const int num_interior_rows, const int num_boundary_rows, IVector_h_vector &vertex_counts, const IVector_h &parts_to_consolidate, const int num_parts_to_consolidate, int &interior_offset, int &boundary_offset, int &total_interior_rows_in_merged, int &total_boundary_rows_in_merged, int &total_rows_in_merged, DistributedComms *comms) -{ - IVector_h my_offsets(4); - IVector_h my_sizes(2); - my_sizes[0] = num_interior_rows; - my_sizes[1] = num_boundary_rows; - - if (!is_root_partition) + /* replace the values (reordering them if needed) */ + if (insertDiagonals && diag_pinned != NULL) { - //Send number of interior and boundary nodes to root - comms->send_vector_async(my_sizes, my_destination_part, 777); - comms->recv_vector(my_offsets, my_destination_part, 778); - comms->send_vector_wait_all(my_sizes); + replace_values_matrix <32> <<< num_blocks, 512>>>(data_hd, diag_hd, this->old_row_offsets.raw(), this->A->row_offsets.raw(), this->A->values.raw(), this->renumbering.raw(), block_size, num_rows); } else { - vertex_counts.resize(num_parts_to_consolidate); - IVector_h child_sizes(2); - IVector_h offsets_interior(num_parts_to_consolidate); - IVector_h offsets_boundary(num_parts_to_consolidate); - int count_int = 0; - int count_bdy = 0; - - for (int i = 0; i < num_parts_to_consolidate; i++) - { - if (parts_to_consolidate[i] == my_id) - { - child_sizes = my_sizes; - } - else - { - comms->recv_vector(child_sizes, parts_to_consolidate[i], 777); - } - - //Do a simple cumulative sum to determine total number of interior/boundary rows and their offsets on a per contributing partition basis - offsets_interior[i] = count_int; - offsets_boundary[i] = count_bdy; - count_int += child_sizes[0]; - count_bdy += child_sizes[1]; - //Save them - vertex_counts[i].resize(2); - vertex_counts[i][0] = child_sizes[0]; - vertex_counts[i][1] = 
child_sizes[1];
- }
-
- for (int i = 0; i < num_parts_to_consolidate; i++)
- {
- //Send back to contributing partitions
- IVector_h offsets_to_send(4);
- offsets_to_send[0] = offsets_interior[i];
- offsets_to_send[1] = offsets_boundary[i];
- offsets_to_send[2] = count_int;
- offsets_to_send[3] = count_bdy;
-
- if (parts_to_consolidate[i] == my_id)
- {
- my_offsets = offsets_to_send;
- }
- else
- {
- comms->send_vector(offsets_to_send, parts_to_consolidate[i], 778); // cannot make async, rewriting internal buffer
- }
- }
- }
-
- interior_offset = my_offsets[0];
- boundary_offset = my_offsets[1] + my_offsets[2] - num_interior_rows;
- total_interior_rows_in_merged = my_offsets[2];
- total_boundary_rows_in_merged = my_offsets[3];
- total_rows_in_merged = my_offsets[2] + my_offsets[3];
-}
-
-
-
-
 template
-template
-void DistributedManagerBase::consB2LmapsOnRoot(int &num_consolidated_neighbors, std::vector &consolidated_B2L_maps, IVector_h &consolidated_coarse_ids, std::vector &dest_coarse_B2L_maps, IVector_h &coarse_neigh_to_fine_part, IVector_h &num_bdy_per_coarse_neigh, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int my_id, int my_destination_part, bool is_root_partition, int num_coarse_neighbors, DistributedComms *comms)
-{
- // TODO: it is possible to optimize exchanges, for example fuse recv_vector in recreating coarse neighbours
-
- // output: num_consolidated_neighbors, consolidated_B2L_maps, consolidated_coarse_ids
- // input: dest_coarse_B2L_maps, is_root_partition, my_id, my_destination_part, num_fine_parts_to_consolidate, num_coarse_neighbors, coarse_neigh_to_fine_part, num_bdy_per_coarse_neigh
- if (my_destination_part != my_id)
- {
- //if not root, send coarse neighbor list using fine indices and the corresponding boundary lists
- IVector_h num_coarse_neigh(1);
- num_coarse_neigh[0] = num_coarse_neighbors;
- comms->send_vector_async(num_coarse_neigh, my_destination_part, 1111);
- comms->send_vector_async(coarse_neigh_to_fine_part, my_destination_part, 2222);
- comms->send_vector_async(num_bdy_per_coarse_neigh, my_destination_part, 3333);
-
- for (int i = 0; i < num_coarse_neighbors; i++)
- {
- comms->send_vector_async(dest_coarse_B2L_maps[i], my_destination_part, 4444 + i) ;
- }
-
- comms->send_vector_wait_all(num_coarse_neigh);
- comms->send_vector_wait_all(coarse_neigh_to_fine_part);
- comms->send_vector_wait_all(num_bdy_per_coarse_neigh);
-
- for (int i = 0; i < num_coarse_neighbors; i++)
- {
- comms->send_vector_wait_all(dest_coarse_B2L_maps[i]) ;
- }
- }
-
- if (is_root_partition)
- {
- IVector_h num_coarse_ids_from_part(fine_parts_to_consolidate);
- IVector_h_vector coarse_ids_from_part(num_fine_parts_to_consolidate);
- IVector_h_vector num_coarse_neigh_bdys_from_part(num_fine_parts_to_consolidate);
- //If root, receive sizes, and resize receive buffers
- int total_num_coarse_ids = 0;
-
- for (int i = 0; i < num_fine_parts_to_consolidate; i++)
- {
- int current_part = fine_parts_to_consolidate[i];
- IVector_h temp(1);
-
- if (current_part != my_id)
- {
- comms->recv_vector(temp, current_part, 1111);
- }
- else
- {
- temp[0] = num_coarse_neighbors;
- }
-
- num_coarse_ids_from_part[i] = temp[0];
- coarse_ids_from_part[i].resize(temp[0]);
- num_coarse_neigh_bdys_from_part[i].resize(temp[0]);
- total_num_coarse_ids += temp[0];
- }
-
- //Create a neighbor list for the consolidated coarse matrix, by merging coarse neighbor lists from partitions that are being merged
- consolidated_coarse_ids.resize(total_num_coarse_ids);
- int count = 0;
-
- for (int i
= 0; i < num_fine_parts_to_consolidate; i++) - { - int current_part = fine_parts_to_consolidate[i]; - - // Get from each partition the coarse partition ids in their B2L maps - if (current_part != my_id) - { - comms->recv_vector(coarse_ids_from_part[i], current_part, 2222); - comms->recv_vector(num_coarse_neigh_bdys_from_part[i], current_part, 3333); - } - else - { - coarse_ids_from_part[i] = coarse_neigh_to_fine_part; - num_coarse_neigh_bdys_from_part[i] = num_bdy_per_coarse_neigh; - } - - thrust::copy(coarse_ids_from_part[i].begin(), coarse_ids_from_part[i].end(), consolidated_coarse_ids.begin() + count); - count += num_coarse_ids_from_part[i]; - } - - cudaCheckError(); - //eliminate duplicates - thrust::sort(consolidated_coarse_ids.begin(), consolidated_coarse_ids.end()); - cudaCheckError(); - consolidated_coarse_ids.erase(thrust::unique(consolidated_coarse_ids.begin(), consolidated_coarse_ids.end()), consolidated_coarse_ids.end()); - cudaCheckError(); - num_consolidated_neighbors = consolidated_coarse_ids.size(); - IVector_h_vector coarse_ids_from_part_to_consolidated_neighbor(num_fine_parts_to_consolidate);; - - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - coarse_ids_from_part_to_consolidated_neighbor[i].resize(num_coarse_ids_from_part[i]); - thrust::lower_bound(consolidated_coarse_ids.begin(), consolidated_coarse_ids.end(), coarse_ids_from_part[i].begin(), coarse_ids_from_part[i].end(), coarse_ids_from_part_to_consolidated_neighbor[i].begin()); - } - - cudaCheckError(); - // Map each coarse partition to new coarse ID - consolidated_B2L_maps.resize(num_consolidated_neighbors); - IVector_h consolidated_B2L_maps_sizes(num_consolidated_neighbors); - // Offset in the consolidated_B2L_maps - IVector_h_vector coarse_ids_offsets(num_fine_parts_to_consolidate); - - for (int i = 0; i < num_consolidated_neighbors; i++) - { - consolidated_B2L_maps_sizes[i] = 0; - } - - // Compute the size of each consolidated_B2L_maps and offsets into it, where we will receive the parts coming from partitions that are getting merged into this one - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - coarse_ids_offsets[i].resize(num_coarse_ids_from_part[i]); - - for (int j = 0; j < num_coarse_ids_from_part[i]; j++) - { - int coarse_id = coarse_ids_from_part[i][j]; - int k = num_coarse_neigh_bdys_from_part[i][j]; - coarse_ids_offsets[i][j] = consolidated_B2L_maps_sizes[ coarse_ids_from_part_to_consolidated_neighbor[i][j] ]; - consolidated_B2L_maps_sizes[ coarse_ids_from_part_to_consolidated_neighbor[i][j] ] += k; - } - } - - for (int i = 0; i < num_consolidated_neighbors; i++) - { - consolidated_B2L_maps[i].resize(consolidated_B2L_maps_sizes[i]); - } - - // Receive the B2L maps from each child partition, concatenate them (gets sorted outside) - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - int current_part = fine_parts_to_consolidate[i]; - - for (int j = 0; j < num_coarse_ids_from_part[i]; j++) - { - int my_coarse_neigh = coarse_ids_from_part_to_consolidated_neighbor[i][j]; - int offset = coarse_ids_offsets[i][j]; - - if (current_part != my_id) - { - comms->recv_vector( consolidated_B2L_maps[my_coarse_neigh], current_part, 4444 + j, offset, num_coarse_neigh_bdys_from_part[i][j]); //Need to do proper tagging here, otherwise messages from the same source would get mixed up - } - else - { - thrust::copy(dest_coarse_B2L_maps[j].begin(), dest_coarse_B2L_maps[j].end(), consolidated_B2L_maps[my_coarse_neigh].begin() + offset); - } - } - } - - cudaCheckError(); - } -} - - 
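-/* The consolidation helpers above and below keep repeating one idiom: concatenate the
- per-source lists destined for the same consolidated neighbor at precomputed offsets,
- then sort and strip duplicates. A minimal, self-contained sketch of that idiom, using
- plain std:: containers and hypothetical data rather than AMGX's IVector_h and comms
- machinery:
-
- #include <algorithm>
- #include <vector>
-
- std::vector<int> merge_boundary_lists(const std::vector<std::vector<int> > &pieces)
- {
- std::vector<int> merged;
-
- for (size_t i = 0; i < pieces.size(); ++i)
- {
- merged.insert(merged.end(), pieces[i].begin(), pieces[i].end());
- }
-
- std::sort(merged.begin(), merged.end());
- merged.erase(std::unique(merged.begin(), merged.end()), merged.end());
- return merged;
- }
-*/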
-template -void DistributedManagerBase::consolidateAndRenumberHalos(IVector_h &aggregates, const IVector_h &manager_halo_offsets, IVector_h &halo_offsets, const IVector_h &neighbors, int num_fine_neighbors, const IVector_h &consolidated_coarse_ids, int num_consolidated_neighbors, const IVector_h &destination_part, int my_destination_part, bool is_root_partition, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int num_parts, int my_id, int total_rows_in_merged, int &num_all_aggregates, DistributedComms *comms) -{ - consAndRenumberHalos(aggregates, manager_halo_offsets, halo_offsets, neighbors, num_fine_neighbors, consolidated_coarse_ids, num_consolidated_neighbors, destination_part, my_destination_part, is_root_partition, fine_parts_to_consolidate, num_fine_parts_to_consolidate, num_parts, my_id, total_rows_in_merged, num_all_aggregates, comms); -} - -template -void DistributedManagerBase::consolidateAndRenumberHalos(IVector_d &aggregates, const IVector_h &manager_halo_offsets, IVector_h &halo_offsets, const IVector_h &neighbors, int num_fine_neighbors, const IVector_h &consolidated_coarse_ids, int num_consolidated_neighbors, const IVector_h &destination_part, int my_destination_part, bool is_root_partition, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int num_parts, int my_id, int total_rows_in_merged, int &num_all_aggregates, DistributedComms *comms) -{ - consAndRenumberHalos(aggregates, manager_halo_offsets, halo_offsets, neighbors, num_fine_neighbors, consolidated_coarse_ids, num_consolidated_neighbors, destination_part, my_destination_part, is_root_partition, fine_parts_to_consolidate, num_fine_parts_to_consolidate, num_parts, my_id, total_rows_in_merged, num_all_aggregates, comms); -} - -template -template -void DistributedManagerBase::consAndRenumberHalos(IVector_hd &aggregates, const IVector_h &manager_halo_offsets, IVector_h &halo_offsets, const IVector_h &neighbors, int num_fine_neighbors, const IVector_h &consolidated_coarse_neigh_to_fine_part, int num_consolidated_neighbors, const IVector_h &destination_part, int my_destination_part, bool is_root_partition, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int num_parts, int my_id, int total_rows_in_merged, int &num_all_aggregates, DistributedComms *comms) -{ - /* - * EXAMPLE 2 - This example is independent from the previous ones. - Say partition 0 and 1 are merging (into 0) partition 0 is neighbors with 1,2,3 and partition 1 is neighbors with 0,3,4 - Partitions 3 and 4 are merging (into partition 3) and partition 2 is not merging with anyone. - This example details the renumbering of halo indices on partition 0 and partition 1. 
- aggregates on partition 0:
- [(fine interior nodes) (fine boundary nodes) (fine halo from part 1) (fine halo from part 2) (fine halo from part 3)]
- [(fine interior nodes) (fine boundary nodes) (13 13 15) (12 15 17) (14 16 18)]
- aggregates on partition 1:
- [(fine interior nodes) (fine boundary nodes) (fine halo from part 0) (fine halo from part 3) (fine halo from part 4)]
- [(fine interior nodes) (fine boundary nodes) (14 16 17) (18 19 19) (15 15 17)]
-
- manager_halo_offsets on partition 0:
- [22 25 28 31]
- manager_halo_offsets on partition 1:
- [20 23 26 29]
- halo_offsets on both partitions are uninitialised: [0 0 0] and [0 0]
- neighbors on partition 0: [1 2 3] partition 1: [0 3 4]
- num_fine_neighbors partition 0: 3 partition 1: 3
- consolidated_coarse_neigh_to_fine_part partition 0: [2 3] partition 1: [3]
- num_consolidated_neighbors partition 0: 2 partition 1: 1
- destination_part [0 0 2 3 3]
- my_destination_part partition 0: 0 partition 1: 0
- is_root_partition partition 0: true partition 1: false
- fine_parts_to_consolidate partition 0: [0 1]
- num_fine_parts_to_consolidate partition 0: 2
- num_parts 5
- my_id partition 0: 0 partition 1: 1
- total_rows_in_merged partition 0 and 1: 24 (=sum of the two below)
- num_all_aggregates partition 0: 13 partition 1: 11 - will be updated with the number of halo aggregates
- */
- //
- // Step 9.2 - come up with nonmerge lists
- //
- int num_fine_nonmerge_neighbors;// = fine_nonmerge_neighbors.size();
- //Number of neighbors we are not merging with
- num_fine_nonmerge_neighbors = 0;
-
- for (int i = 0 ; i < num_fine_neighbors; i++)
- {
- if (destination_part[neighbors[i]] != my_destination_part)
- {
- num_fine_nonmerge_neighbors++;
- }
- }
-
- IVector_h halo_sizes(num_fine_nonmerge_neighbors);
- IVector_h fine_nonmerge_neigh_to_cons_fine_part(num_fine_nonmerge_neighbors);
- IVector_h fine_nonmerge_neighbor_to_fine_neighbor(num_fine_nonmerge_neighbors);
- num_fine_nonmerge_neighbors = 0;
-
- for (int i = 0 ; i < num_fine_neighbors; i++)
- {
- if (destination_part[neighbors[i]] != my_destination_part)
- {
- halo_sizes[num_fine_nonmerge_neighbors] = manager_halo_offsets[i + 1] - manager_halo_offsets[i];
- fine_nonmerge_neighbor_to_fine_neighbor[num_fine_nonmerge_neighbors] = i;
- fine_nonmerge_neigh_to_cons_fine_part[num_fine_nonmerge_neighbors] = destination_part[neighbors[i]];
- num_fine_nonmerge_neighbors++;
- }
- }
-
- /*
- * EXAMPLE 2
- num_fine_nonmerge_neighbors partition 0: 2 partition 1: 2
- fine_nonmerge_neighbor_to_fine_neighbor partition 0: [1 2] partition 1: [1 2]
- fine_nonmerge_neigh_to_cons_fine_part partition 0: [2 3] partition 1: [3 3]
- halo_sizes partition 0: [3 3] partition 1: [3 3]
- */
-
- //Send them to root along with the halo parts of the aggregates vector
- if (!is_root_partition)
- {
- IVector_h num_fine_nonmerge_neigh(1);
- num_fine_nonmerge_neigh[0] = num_fine_nonmerge_neighbors;
- // TODO: async? might be faster.
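- // Note on tags: the three blocking sends below all target my_destination_part back to
- // back, so they use distinct tags (1111/2222/3333) for the root to match each message
- // to the right buffer; the per-neighbor halo pieces that follow are disambiguated the
- // same way (tag 4444 + i).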
- comms->send_vector(num_fine_nonmerge_neigh, my_destination_part, 1111);
- comms->send_vector(halo_sizes, my_destination_part, 2222);
- comms->send_vector(fine_nonmerge_neigh_to_cons_fine_part, my_destination_part, 3333);
-
- // Here check l2h_identity flag and act accordingly
- for (int i = 0; i < num_fine_nonmerge_neighbors; i++)
- {
- comms->send_vector_async(aggregates, my_destination_part, 4444 + i, manager_halo_offsets[fine_nonmerge_neighbor_to_fine_neighbor[i]], halo_sizes[i]) ;
- }
-
- //comms->send_vector_wait_all(num_fine_nonmerge_neigh);
- //comms->send_vector_wait_all(halo_sizes);
- //comms->send_vector_wait_all(fine_nonmerge_neigh_to_cons_fine_part);
- comms->send_vector_wait_all(aggregates);
- /*
- * EXAMPLE 2
- Partition 1 sends to partition 0:
- num_fine_nonmerge_neigh 2
- halo_sizes [3 3]
- fine_nonmerge_neigh_to_cons_fine_part [3 3]
- for loop: sends two pieces: [(18 19 19)] [(15 15 17)]
- */
- }
-
- if (is_root_partition)
- {
- //
- // Step 9.3 Root receives this info, creates metadata
- //
- std::vector num_fine_nonmerge_neigh_array(num_fine_parts_to_consolidate);
- IVector_h_vector halo_sizes_array(num_fine_parts_to_consolidate);
- IVector_h_vector fine_nonmerge_neigh_to_cons_fine_part_array(num_fine_parts_to_consolidate);
- std::vector > fine_halo_aggregates_to_root_array(num_fine_parts_to_consolidate);
- std::vector min_index_coarse_halo(num_consolidated_neighbors, 0x7FFFFFFF);
- std::vector max_index_coarse_halo(num_consolidated_neighbors, 0);
- std::vector fine_part_to_consolidated_neighbor(num_parts, -1);
-
- for (int i = 0; i < num_consolidated_neighbors; i++)
- {
- fine_part_to_consolidated_neighbor[consolidated_coarse_neigh_to_fine_part[i]] = i;
- }
-
- /*
- * EXAMPLE 2
- everything from here on is for partition 0, since that is the root partition
- fine_part_to_consolidated_neighbor [-1 -1 0 1 -1]
- */
- for (int i = 0; i < num_fine_parts_to_consolidate; i++)
- {
- int current_part = fine_parts_to_consolidate[i];
- IVector_h temp(1);
-
- if (current_part != my_id)
- {
- comms->recv_vector(temp, current_part, 1111);
- }
- else
- {
- temp[0] = num_fine_nonmerge_neighbors;
- }
-
- num_fine_nonmerge_neigh_array[i] = temp[0];
- halo_sizes_array[i].resize(temp[0]);
- fine_nonmerge_neigh_to_cons_fine_part_array[i].resize(temp[0]);
- fine_halo_aggregates_to_root_array[i].resize(temp[0]);
-
- if (current_part != my_id)
- {
- comms->recv_vector(halo_sizes_array[i], current_part, 2222);
- }
- else
- {
- halo_sizes_array[i] = halo_sizes;
- }
-
- if (current_part != my_id)
- {
- comms->recv_vector(fine_nonmerge_neigh_to_cons_fine_part_array[i], current_part, 3333);
- }
- else
- {
- fine_nonmerge_neigh_to_cons_fine_part_array[i] = fine_nonmerge_neigh_to_cons_fine_part;
- }
-
- //Receive the halo regions
- for (int j = 0; j < temp[0]; j++)
- {
- fine_halo_aggregates_to_root_array[i][j].resize(halo_sizes_array[i][j]);
-
- if (current_part != my_id)
- {
- comms->recv_vector(fine_halo_aggregates_to_root_array[i][j], current_part, 4444 + j);
- }
- else
- {
- //HERE
- thrust::copy(aggregates.begin() + manager_halo_offsets[fine_nonmerge_neighbor_to_fine_neighbor[j]],
- aggregates.begin() + manager_halo_offsets[fine_nonmerge_neighbor_to_fine_neighbor[j]] + halo_sizes[j],
- fine_halo_aggregates_to_root_array[i][j].begin()); //TODO: not do this copying around on the root
- }
-
-#define MIN(a,b) a<b?a:b;
-#define MAX(a,b) a>b?a:b;
- //Find minimum and maximum halo indices as not to allocate too much scratch space later
- int min_index = thrust::reduce(fine_halo_aggregates_to_root_array[i][j].begin(),
fine_halo_aggregates_to_root_array[i][j].end(), int(0x7FFFFFFF), thrust::minimum<int>());
- int max_index = thrust::reduce(fine_halo_aggregates_to_root_array[i][j].begin(), fine_halo_aggregates_to_root_array[i][j].end(), int(0), thrust::maximum<int>());
- min_index_coarse_halo[fine_part_to_consolidated_neighbor[fine_nonmerge_neigh_to_cons_fine_part_array[i][j]]] = MIN((int)min_index_coarse_halo[fine_part_to_consolidated_neighbor[fine_nonmerge_neigh_to_cons_fine_part_array[i][j]]], min_index);
- max_index_coarse_halo[fine_part_to_consolidated_neighbor[fine_nonmerge_neigh_to_cons_fine_part_array[i][j]]] = MAX((int)max_index_coarse_halo[fine_part_to_consolidated_neighbor[fine_nonmerge_neigh_to_cons_fine_part_array[i][j]]], max_index);
- }
- }
-
- cudaCheckError();
- /*
- * EXAMPLE 2
- num_fine_nonmerge_neigh_array = [2 2]
- halo_sizes_array = [[3 3][3 3]]
- fine_nonmerge_neigh_to_cons_fine_part_array[][] = [[2 3][3 3]]
- fine_halo_aggregates_to_root_array[from][to][fine halo vertex] [[[12 15 17][14 16 18]]
- [[18 19 19][15 15 17]]]
- min_index_coarse_halo[12 14]
- max_index_coarse_halo[17 19]
- */
- halo_offsets[0] = total_rows_in_merged;
- //Now we have all the halo nodes, let's renumber them.
- int min_index = thrust::reduce(min_index_coarse_halo.begin(), min_index_coarse_halo.end(), int(0x7FFFFFFF), thrust::minimum<int>());
- int max_index = thrust::reduce(max_index_coarse_halo.begin(), max_index_coarse_halo.end(), int(0), thrust::maximum<int>());
- cudaCheckError();
- //
- // Step 9.4 compute halo indices on root nodes
- //
- int scratch_size;
-
- if (num_consolidated_neighbors == 0)
- {
- scratch_size = 1;
- }
- else
- {
- scratch_size = max_index - min_index + 2;
- }
-
- IVector scratch(scratch_size);
-
- for (int i = 0; i < num_consolidated_neighbors; i++)
- {
- thrust::fill(scratch.begin(), scratch.end(), 0);
- int dest_part = consolidated_coarse_neigh_to_fine_part[i];
-
- //Flag halo indices that occur for a specific coarse neighbor
- for (int j = 0; j < num_fine_parts_to_consolidate; j++)
- {
- for (int k = 0; k < num_fine_nonmerge_neigh_array[j]; k++)
- {
- if (fine_nonmerge_neigh_to_cons_fine_part_array[j][k] == dest_part)
- {
- int size = halo_sizes_array[j][k];
- this->flag_halo_ids(size, scratch, fine_halo_aggregates_to_root_array[j][k], min_index_coarse_halo[i], max_index, min_index);
- }
- }
- }
-
- thrust::exclusive_scan(scratch.begin(), scratch.end(), scratch.begin(), halo_offsets[i]); //renumber them with the proper offset into our halo
- halo_offsets[i + 1] = scratch[scratch.size() - 1];
-
- //now read them back
- for (int j = 0; j < num_fine_parts_to_consolidate; j++)
- {
- for (int k = 0; k < num_fine_nonmerge_neigh_array[j]; k++)
- {
- if (fine_nonmerge_neigh_to_cons_fine_part_array[j][k] == dest_part)
- {
- int size = halo_sizes_array[j][k];
- int block_size = 128;
- const int num_blocks = min( AMGX_GRID_MAX_SIZE, (size - 1) / block_size + 1);
- this->read_halo_ids(size, scratch, fine_halo_aggregates_to_root_array[j][k], min_index_coarse_halo[i]);
- //and send them back to contributing partitions
- cudaDeviceSynchronize(); //TODO: don't need to synchronize when using GPUDirect
- int current_part = fine_parts_to_consolidate[j];
- int tag = 4444 + dest_part;
-
- if (current_part != my_id)
- {
- comms->send_vector_async(fine_halo_aggregates_to_root_array[j][k], current_part, tag); //!!!!: we are sending them back not in sequential order, need tags!!!!
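- // The tag is 4444 + dest_part rather than 4444 + a loop index: the pieces go back
- // out of sequential order, so the non-root receive below matches on
- // 4444 + fine_nonmerge_neigh_to_cons_fine_part[i] instead of on arrival order.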
- } - else - { - thrust::copy(fine_halo_aggregates_to_root_array[j][k].begin(), fine_halo_aggregates_to_root_array[j][k].end(), aggregates.begin() + manager_halo_offsets[fine_nonmerge_neighbor_to_fine_neighbor[k]]); - } - } - } - } - - /* - * EXAMPLE 2 - the array that is sent back in pieces: - fine_halo_aggregates_to_root_array[from][to][fine halo vertex] [[[24 25 26][27 29 31]] - [[31 32 32][28 28 30]]] - halo_offsets = [24 27 33] - */ - } // Loop over consolidated neighbors - - cudaCheckError(); - - // Wait for sends to have completed (this is to prevent fine_halo_aggregates_to_root_array to be destroyed before send has finished) - for (int i = 0; i < num_consolidated_neighbors; i++) - { - int dest_part = consolidated_coarse_neigh_to_fine_part[i]; - - for (int j = 0; j < num_fine_parts_to_consolidate; j++) - { - for (int k = 0; k < num_fine_nonmerge_neigh_array[j]; k++) - { - if (fine_nonmerge_neigh_to_cons_fine_part_array[j][k] == dest_part) - { - int current_part = fine_parts_to_consolidate[j]; - - if (current_part != my_id) - { - comms->send_vector_wait_all(fine_halo_aggregates_to_root_array[j][k]); - } - } - } - } - } // Loop over consolidated neighbors - - //Send total number of rows in the aggregated matrix - for (int i = 0; i < num_fine_parts_to_consolidate; i++) - { - int current_part = fine_parts_to_consolidate[i]; - IVector_h total_rows(1); - total_rows[0] = halo_offsets[num_consolidated_neighbors]; - - if (current_part != my_id) - { - comms->send_vector(total_rows, current_part, 5555); - } - else - { - num_all_aggregates = total_rows[0]; - } - } - } // If is root partition - - if (!is_root_partition) - { - for (int i = 0; i < num_fine_nonmerge_neighbors; i++) - { - int tag = 4444 + fine_nonmerge_neigh_to_cons_fine_part[i]; - comms->recv_vector(aggregates, my_destination_part, tag, manager_halo_offsets[fine_nonmerge_neighbor_to_fine_neighbor[i]], halo_sizes[i]); - } - - IVector_h total_rows(1); - comms->recv_vector(total_rows, my_destination_part, 5555); - num_all_aggregates = total_rows[0]; - } - - /* - * EXAMPLE 2 - num_all_aggregates = 33 (both partitions 0 and 1 - */ -} - -template -void DistributedManagerBase::ipcExchangePtr(void *&ptr, bool is_root_partition, int num_parts_to_consolidate, IVector_h &parts_to_consolidate, int my_root_partition, int my_id, DistributedComms *comms) -{ - cudaIpcMemHandle_t handle; - - if (is_root_partition) - { - cudaIpcGetMemHandle( (cudaIpcMemHandle_t *) &handle, ptr ) ; - - for (int i = 0; i < num_parts_to_consolidate; i++) - { - int current_part = parts_to_consolidate[i]; - - if (current_part != my_id) - { - comms->send_raw_data(&handle, sizeof(handle), current_part, 456); - } - } - } - else - { - comms->recv_raw_data(&handle, sizeof(handle), my_root_partition, 456); - cudaError_t err = cudaIpcOpenMemHandle( (void **) &ptr, handle, cudaIpcMemLazyEnablePeerAccess); - } -} - -template -void DistributedManagerBase::ipcWaitForChildren(bool is_root_partition, int num_parts_to_consolidate, IVector_h &parts_to_consolidate, int my_destination_part, int my_id, DistributedComms *comms) -{ - cudaEvent_t event; - cudaIpcEventHandle_t event_handle; - cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess); - cudaIpcGetEventHandle( &event_handle, event); - // Each rank record the event - cudaEventRecord(event); - - if (is_root_partition) - { - std::vector child_events(num_parts_to_consolidate); - std::vector child_event_handles(num_parts_to_consolidate); - - // Root partition receives event_handles from child and stores in 
child_event_handles - for (int i = 0; i < num_parts_to_consolidate; i++) - { - int current_part = parts_to_consolidate[i]; - - if (current_part != my_id) - { - comms->recv_raw_data(&(child_event_handles[i]), sizeof(cudaIpcEventHandle_t), current_part, 987 + current_part); - cudaIpcOpenEventHandle(&child_events[i], child_event_handles[i]); - } - } - - for (int i = 0; i < num_parts_to_consolidate; i++) - { - if (parts_to_consolidate[i] != my_id) - { - cudaEventSynchronize(child_events[i]); - } - } - } - else - { - comms->send_raw_data(&event_handle, sizeof(cudaIpcEventHandle_t), my_destination_part, 987 + my_id); - } -} - -template -void DistributedManagerBase::ipcWaitForRoot(bool is_root_partition, int num_parts_to_consolidate, IVector_h &parts_to_consolidate, int my_destination_part, int my_id, DistributedComms *comms) -{ - cudaEvent_t event; - cudaIpcEventHandle_t event_handle; - cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess); - - if (is_root_partition) - { - cudaIpcGetEventHandle( &event_handle, event); - // Root records the event - cudaEventRecord(event); - - // Root partition sends event_handles to child - for (int i = 0; i < num_parts_to_consolidate; i++) - { - int current_part = parts_to_consolidate[i]; - - if (current_part != my_id) - { - comms->send_raw_data(&event_handle, sizeof(event_handle), current_part, 988 + current_part); - } - } - } - else - { - comms->recv_raw_data(&event_handle, sizeof(event_handle), my_destination_part, 988 + my_id); - cudaIpcOpenEventHandle(&event, event_handle); - cudaEventSynchronize(event); - } -} - - - -template -void DistributedManagerBase::read_halo_ids(int size, IVector_d &scratch, IVector_d &halo_aggregates, VecInt_t min_index_coarse_halo) -{ - int block_size = 128; - const int num_blocks = min( AMGX_GRID_MAX_SIZE, (size - 1) / block_size + 1); - read_halo_ids_kernel <<< num_blocks, block_size>>>(scratch.raw(), halo_aggregates.raw(), min_index_coarse_halo, size); - cudaCheckError(); -} - -template -void DistributedManagerBase::read_halo_ids(int size, IVector_h &scratch, IVector_h &halo_aggregates, VecInt_t min_index_coarse_halo) -{ - FatalError("read_halo_ids not implemented on host yet", AMGX_ERR_NOT_IMPLEMENTED); -} - - -template -void DistributedManagerBase::flag_halo_ids(int size, IVector_d &scratch, IVector_d &halo_aggregates, VecInt_t min_index_coarse_halo, int max_index, int min_index) -{ - int block_size = 128; - const int num_blocks = min( AMGX_GRID_MAX_SIZE, (size - 1) / block_size + 1); - flag_halo_ids_kernel <<< num_blocks, block_size>>>(scratch.raw(), halo_aggregates.raw(), min_index_coarse_halo, size, max_index - min_index + 1); - cudaCheckError(); -} - -template -void DistributedManagerBase::flag_halo_ids(int size, IVector_h &scratch, IVector_h &halo_aggregates, VecInt_t min_index_coarse_halo, int max_index, int min_index) -{ - FatalError("flag_halo_ids not implemented on host yet", AMGX_ERR_NOT_IMPLEMENTED); -} - -template -void DistributedManager >::consolidateAndUploadAll(int n, int nnz, int block_dimx, int block_dimy, const int *row_ptrs, const int *col_indices, const void *data, const void *diag, Matrix &A) -{ - FatalError("Fine level consolidation not implemented on host yet", AMGX_ERR_NOT_IMPLEMENTED); -} - -template -void DistributedManagerBase::exchangeSolveResultsConsolidation(int &num_iters, std::vector &res_history, AMGX_STATUS &status, bool store_res_history) -{ - int bsize = this->A->get_block_size(); - PODVector_h res_history_tmp; - - if (!m_is_fine_level_consolidated) - { - return; - } - 
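- // On the root, num_iters, the final status and (optionally) the residual history are
- // forwarded to every consolidated child partition; res_history is flattened row-major
- // into res_history_tmp (entry [i * bsize + j] = component j of iteration i) so the
- // whole history can travel as one vector.
-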
else - { - int my_id = this->getFineLevelComms()->get_global_id(); - IVector_h my_num_iters(1); - - if (m_is_fine_level_root_partition) - { - my_num_iters[0] = num_iters; - - if (store_res_history) - { - // Pack the res_history vector into array - res_history_tmp.resize( (num_iters + 1)*bsize); - - for (int i = 0; i < num_iters + 1; i++) - { - for (int j = 0; j < bsize; j++) - { - res_history_tmp[i * bsize + j] = res_history[i][j]; - } - } - } - - for (int i = 0; i < m_num_fine_level_parts_to_consolidate; i++) - { - int current_part = m_fine_level_parts_to_consolidate[i]; - - if (my_id != current_part) - { - getFineLevelComms()->send_vector_async(my_num_iters, current_part, 245); - - if (store_res_history) - { - getFineLevelComms()->send_vector_async(res_history_tmp, current_part, 246); - } - } - } - - for (int i = 0; i < m_num_fine_level_parts_to_consolidate; i++) - { - int current_part = m_fine_level_parts_to_consolidate[i]; - - if (my_id != current_part) - { - getFineLevelComms()->send_raw_data(&status, sizeof(status), current_part, 247); - } - } - - getFineLevelComms()->send_vector_wait_all(my_num_iters); - - if (store_res_history) - { - getFineLevelComms()->send_vector_wait_all(res_history_tmp); - } - } - else - { - // Store num_iters - getFineLevelComms()->recv_vector(my_num_iters, m_my_fine_level_destination_part, 245); - num_iters = my_num_iters[0]; - - if (store_res_history) - { - // Fill res_history vector - res_history.resize(num_iters + 1); - res_history_tmp.resize( (num_iters + 1)*bsize); - getFineLevelComms()->recv_vector(res_history_tmp, m_my_fine_level_destination_part, 246); - - for (int i = 0; i < num_iters + 1; i++) - { - res_history[i].resize(bsize); - - for (int j = 0; j < bsize; j++) - { - res_history[i][j] = res_history_tmp[i * bsize + j]; - } - } - } - - getFineLevelComms()->recv_raw_data(&status, sizeof(status), m_my_fine_level_destination_part, 247); - } - } -} - -template -void DistributedManager >::consolidateAndUploadAll(int n, int nnz, int block_dimx, int block_dimy, const int *row_ptrs, const int *col_indices, const void *data, const void *diag, Matrix &in_A) -{ - this->A = &in_A; - this->createComms(this->A->getResources()); //refresh comms - DistributedComms *comms = this->getComms(); - int my_id = comms->get_global_id(); - int num_parts = comms->get_num_partitions(); - int num_rings = this->num_halo_rings(); - int num_neighbors = this->neighbors.size(); - // All partitions have to call this, otherwise it fails - // Step 1: Figure out which partition should be consolidated together based on their host_name and their PCI-E slot ID - IVector_h destination_part(num_parts); - this->computeDestinationPartitionsWithCons(my_id, num_parts, destination_part, comms); - int my_destination_part = destination_part[my_id]; - // Check if I'm root partition and how many msgs I will receive - bool is_root_partition = false; - int num_parts_to_consolidate = 0; - - for (int i = 0; i < num_parts; i++) - { - if (destination_part[i] == my_id) - { - is_root_partition = true; - num_parts_to_consolidate++; - } - } - - if (my_destination_part >= num_parts) - { - FatalError("During consolidation, sending data to partition that doesn't exist", AMGX_ERR_NOT_IMPLEMENTED); - } - - // Create cons_part_to_part map - IVector_h cons_part_to_part = destination_part; - thrust::sort(cons_part_to_part.begin(), cons_part_to_part.end()); - cudaCheckError(); - cons_part_to_part.erase(thrust::unique(cons_part_to_part.begin(), cons_part_to_part.end()), cons_part_to_part.end()); - cudaCheckError(); - 
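- // Worked example (hypothetical values): destination_part = [0 0 2 3 3] sorts and
- // dedups to cons_part_to_part = [0 2 3], so num_cons_partitions = 3 below, and
- // ranks 0, 2 and 3 act as consolidation roots.
-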
int num_cons_partitions = cons_part_to_part.size(); - - // If number of consolidated partitions is the same as number of partitions, simply call uploadAll - if (num_cons_partitions == num_parts) - { - this->initializeUploadReorderAll(n, nnz, block_dimx, block_dimy, row_ptrs, col_indices, data, diag, *(this->A)); - this->m_is_fine_level_consolidated = false; - return; - } - - if (is_root_partition) - { - this->A->getResources()->expandRootPool(); - } - - this->m_is_fine_level_consolidated = true; - - if (num_rings != 1) - { - FatalError("num_rings > 1 not supported in fine_level consolidation", AMGX_ERR_NOT_IMPLEMENTED); - } - - // Fill with b2l_maps - IVector_h_vector B2L_maps_tmp; - B2L_maps_tmp.resize(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - B2L_maps_tmp[i] = this->cached_B2L_maps[i]; - } - - bool useCudaIpc = this->m_use_cuda_ipc_consolidation; - mat_value_type *data_hd = NULL; - mat_value_type *diag_hd = NULL; - int *col_indices_hd = NULL; - int data_alloc = 0; - int diag_alloc = 0; - int col_alloc = 0; - col_indices_hd = (int *) this->getDevicePointerForData((void *)col_indices, nnz * block_dimx * block_dimy * sizeof(int), &col_alloc); - data_hd = (mat_value_type *) this->getDevicePointerForData((void *)data, nnz * block_dimx * block_dimy * sizeof(mat_value_type), &data_alloc); - - if (diag != NULL) - { - diag_hd = (mat_value_type *) this->getDevicePointerForData((void *)diag, nnz * block_dimx * block_dimy * sizeof(mat_value_type), &diag_alloc); - } - - // Copy the original row_offsets array (this is required when replacing coefficients - this->m_old_row_offsets_CONS.resize(n + 1); - cudaMemcpy(this->m_old_row_offsets_CONS.raw(), row_ptrs, (n + 1)*sizeof(int), cudaMemcpyDefault); - cudaCheckError(); - this->m_old_nnz_CONS = nnz; - // This function: - // Creates fine level consolidated matrices - // Modifies the btl_maps, lth_maps - // Create part_to_cons_part map - IVector_h part_to_cons_part(num_parts); - thrust::lower_bound(cons_part_to_part.begin(), cons_part_to_part.end(), destination_part.begin(), destination_part.end(), part_to_cons_part.begin()); - cudaCheckError(); - IVector_h neigh_to_part; - this->createNeighToDestPartMap(neigh_to_part, this->neighbors, destination_part, num_neighbors); - IVector_h cons_neigh_to_part; - int num_cons_neighbors; - this->createConsolidatedNeighToPartMap(cons_neigh_to_part, neigh_to_part, my_destination_part, destination_part, num_cons_neighbors); - IVector_h neigh_to_cons_neigh; - this->createNeighToConsNeigh( neigh_to_cons_neigh, cons_neigh_to_part, neigh_to_part, my_destination_part, num_neighbors); - // --------------------------------------- - // MERGE B2L MAPS BASED ON DEST PARTITION - // --------------------------------------- - IVector_h_vector dest_B2L_maps; - this->consolidateB2Lmaps(dest_B2L_maps, B2L_maps_tmp, neigh_to_cons_neigh, num_cons_neighbors, num_neighbors); - // ------------------------------------ - // Renumber interior and boundary rows - // ------------------------------------ - int num_interior_rows; - int num_boundary_rows; - IVector_h renumbering; - this->createAggregatesRenumbering(renumbering, dest_B2L_maps, n, num_cons_neighbors, num_interior_rows, num_boundary_rows, num_rings); - // -------------------------------------------------- - // Create list of destination parts to consolidate - // -------------------------------------------------- - // Store whether or not this is a root partition on fine level - IVector_h parts_to_consolidate; - 
parts_to_consolidate.resize(num_parts_to_consolidate); - int count = 0; - - for (int i = 0; i < num_parts; i++) - { - if (destination_part[i] == my_id) - { - parts_to_consolidate[count] = i; - count++; - } - } - - // --------------------------------------------------------------------- - // Each partition computes its offset for its interior and boundary nodes - // --------------------------------------------------------------------- - IVector_h_vector vertex_counts; - int interior_offset, boundary_offset, total_interior_rows_in_merged, total_boundary_rows_in_merged; - int total_rows_in_merged; - this->computeConsolidatedOffsets(my_id, my_destination_part, is_root_partition, num_interior_rows, num_boundary_rows, vertex_counts, parts_to_consolidate, num_parts_to_consolidate, interior_offset, boundary_offset, total_interior_rows_in_merged, total_boundary_rows_in_merged, total_rows_in_merged, comms); - // ----------------------------------- - // Each partition renumber it's rows - // ----------------------------------- - int total_num_halos = 0; - - // Pack new bdy_ids - for (int i = 0; i < num_neighbors; i++) - { - total_num_halos += this->cached_L2H_maps[i].size(); - } - - IVector_h row_ids(n + total_num_halos, -1); - this->m_row_ids_CONS.resize(n + total_num_halos); - - // Renumber the interior and boundary rows - for (int i = 0; i < n; i++) - { - int new_id; - - if (renumbering.size() == 0) - { - new_id = i; - } - else - { - new_id = renumbering[i]; - } - - new_id += ((new_id >= num_interior_rows) ? boundary_offset : interior_offset); - row_ids[i] = new_id; - } - - for (int i = 0; i < num_cons_neighbors; i++) - { - thrust::transform(dest_B2L_maps[i].begin(), - dest_B2L_maps[i].end(), - thrust::constant_iterator(boundary_offset), - dest_B2L_maps[i].begin(), - thrust::plus()); - } - - cudaCheckError(); - // ------------------------------------------------- - // Send dest_B2L_maps to root partitions - // ------------------------------------------------ - IVector_h num_bdy_per_cons_neigh(num_cons_neighbors); - - for (int i = 0; i < num_cons_neighbors; i++) - { - num_bdy_per_cons_neigh[i] = dest_B2L_maps[i].size(); - } - - IVector_h root_cons_neighbors; - int root_num_cons_neighbors = 0; - IVector_h_vector cons_B2L_maps; - this->consolidateB2LmapsOnRoot(root_num_cons_neighbors, cons_B2L_maps, root_cons_neighbors, dest_B2L_maps, cons_neigh_to_part, num_bdy_per_cons_neigh, parts_to_consolidate, num_parts_to_consolidate, my_id, my_destination_part, is_root_partition, num_cons_neighbors, comms); - IVector_h halo_ids_offsets(num_neighbors + 1); - IVector_h halo_ids; - int halo_ids_size = 0; - halo_ids_offsets[0] = 0; - - for (int i = 0; i < num_neighbors; i++) - { - halo_ids_size += this->cached_L2H_maps[i].size(); - halo_ids_offsets[i + 1] = halo_ids_size; - } - - halo_ids.resize(halo_ids_size); - // Do exchange with neighbors - // Pack new bdy_ids - IVector_h_vector bdy_ids; - bdy_ids.resize(num_neighbors); - - for (int i = 0; i < num_neighbors; i++) - { - int size = this->cached_B2L_maps[i].size(); - bdy_ids[i].resize(size); - - // Pack buffer - for (int j = 0; j < size; j++) - { - bdy_ids[i][j] = row_ids[this->cached_B2L_maps[i][j]]; - } - } - - for (int i = 0; i < num_neighbors; i++) - { - comms->send_vector_async(bdy_ids[i], this->neighbors[i], 6666 + this->neighbors[i]); - } - - for (int i = 0; i < num_neighbors; i++) - { - comms->recv_vector(halo_ids, this->neighbors[i], 6666 + my_id, halo_ids_offsets[i], this->cached_L2H_maps[i].size()); - } - - for (int i = 0; i < num_neighbors; i++) - { 
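- // Each bdy_ids[i] still backs an in-flight send_vector_async issued above, so the
- // buffers must stay alive until this wait completes.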
- comms->send_vector_wait_all(bdy_ids[i]); - } - - IVector_h halo_offsets(root_num_cons_neighbors + 1, 0); - int root_num_rows; - this->consolidateAndRenumberHalos(halo_ids, halo_ids_offsets, halo_offsets, this->neighbors, num_neighbors, root_cons_neighbors, root_num_cons_neighbors, destination_part, my_destination_part, is_root_partition, parts_to_consolidate, num_parts_to_consolidate, num_parts, my_id, total_rows_in_merged, root_num_rows, comms); - - if (is_root_partition) - { - this->B2L_maps.resize(cons_B2L_maps.size()); - - for (int i = 0; i < cons_B2L_maps.size(); i++) - { - thrust::sort(cons_B2L_maps[i].begin(), cons_B2L_maps[i].end()); - this->B2L_maps[i].copy(cons_B2L_maps[i]); // H2D copy of B2L maps - } - - cudaCheckError(); - } - - // Now renumber the row_ids based on lth_maps - count = 0; - - for (int i = 0; i < num_neighbors; i++) - { - for (int j = 0; j < this->cached_L2H_maps[i].size(); j++) - { - row_ids[this->cached_L2H_maps[i][j]] = halo_ids[count]; - count++; - } - } - - cudaMemcpy(this->m_row_ids_CONS.raw(), row_ids.raw(), (n + total_num_halos)*sizeof(int), cudaMemcpyDefault); - cudaCheckError(); - int bsize = block_dimx * block_dimy; - - if (is_root_partition) - { - this->A->row_offsets.resize(root_num_rows + 1); - } - - void *root_row_ptr = (void *) this->A->row_offsets.raw(); - - if (useCudaIpc) - { - // ---------------------------------------------------- - // 1. cudaIPC to get pointer to root's row_offset array - // ---------------------------------------------------- - this->ipcExchangePtr(root_row_ptr, is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms); - cudaCheckError(); - // ------------------------------------------------------------------- - // 2. each rank copy it's row length on root partition using row_ids - // ------------------------------------------------------------------- - int cta_size = 128; - int grid_size = min(4096, (n + total_num_halos + cta_size - 1) / cta_size); - zero_copy_row_lengths_ids_offsets <<< grid_size, cta_size>>>(this->m_old_row_offsets_CONS.raw(), ((int *) root_row_ptr) /* IPC */, this->m_row_ids_CONS.raw(), n, total_num_halos, (mat_value_type *) diag); - cudaCheckError(); - // Root partition waits for children to be done writing their result - this->ipcWaitForChildren(is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms); - cudaCheckError(); - } - else // CudaIpcNotAvailable - { - this->checkPinnedBuffer( max( nnz * sizeof(mat_value_type), (n + 1)*max(sizeof(index_type), sizeof(value_type)) ) ); - - if (!is_root_partition) - { - IVector_h data_to_send(3); - data_to_send[0] = n; - data_to_send[1] = nnz; - data_to_send[2] = total_num_halos; - int dummy; - void *row_ptrs_to_send = this->getHostPointerForData((void *)row_ptrs, sizeof(index_type) * (n + 1), &dummy); - comms->send_vector(data_to_send, my_destination_part, 10000 + my_id); - comms->send_raw_data(row_ptrs_to_send, (n + 1)*sizeof(int), my_destination_part, 10001 + my_id); - comms->send_raw_data(&row_ids[0], (n + total_num_halos)*sizeof(int), my_destination_part, 10002 + my_id); - } - else - { - cudaEvent_t event; - cudaEventCreate(&event); - //TODO: Could use streams here - //TODO: Avoid extra device to host copies - std::vector data_recv(num_parts_to_consolidate); - - for (int i = 0; i < num_parts_to_consolidate; i++) - { - data_recv[i].resize(3); - int current_part = parts_to_consolidate[i]; - - if (current_part != my_id) - { - comms->recv_vector(data_recv[i], 
current_part, 10000 + current_part); - } - else - { - data_recv[i][0] = n; - data_recv[i][1] = nnz; - data_recv[i][2] = total_num_halos; - } - } - - this->m_child_n.resize(num_parts_to_consolidate); - this->m_child_nnz.resize(num_parts_to_consolidate); - this->m_child_num_halos.resize(num_parts_to_consolidate); - this->m_child_row_ids.resize(num_parts_to_consolidate); - this->m_child_old_row_offsets.resize(num_parts_to_consolidate); - int max_n = 0; - int max_nnz = 0; - - for (int i = 0; i < num_parts_to_consolidate; i++) - { - int current_part = parts_to_consolidate[i]; - this->m_child_n[i] = data_recv[i][0]; - this->m_child_nnz[i] = data_recv[i][1]; - this->m_child_num_halos[i] = data_recv[i][2]; - - if (this->m_child_n[i] > max_n) { max_n = this->m_child_n[i]; } - - if (this->m_child_nnz[i] > max_nnz) { max_nnz = this->m_child_nnz[i]; } - - this->m_child_row_ids[i].resize(this->m_child_n[i] + this->m_child_num_halos[i]); - this->m_child_old_row_offsets[i].resize(this->m_child_n[i] + 1); - } - - this->m_child_max_n = max_n; - this->m_child_max_nnz = max_nnz; - - for (int i = 0; i < num_parts_to_consolidate; i++) - { - int current_part = parts_to_consolidate[i]; - int cta_size = 128; - int grid_size = min(4096, (this->m_child_n[i] + this->m_child_num_halos[i] + cta_size - 1) / cta_size); - - if (current_part != my_id) - { - comms->recv_vector(this->m_child_old_row_offsets[i], current_part, 10001 + current_part, 0, this->m_child_n[i] + 1); - comms->recv_vector(this->m_child_row_ids[i], current_part, 10002 + current_part, 0, this->m_child_n[i] + this->m_child_num_halos[i]); - zero_copy_row_lengths_ids_offsets <<< grid_size, cta_size>>>(this->m_child_old_row_offsets[i].raw(), this->A->row_offsets.raw(), this->m_child_row_ids[i].raw(), this->m_child_n[i], this->m_child_num_halos[i], (mat_value_type *) diag); - // Wait for kernel to finish before overwriting host buffer - cudaEventRecord(event); - cudaEventSynchronize(event); - } - else - { - zero_copy_row_lengths_ids_offsets <<< grid_size, cta_size>>>(this->m_old_row_offsets_CONS.raw(), this->A->row_offsets.raw(), this->m_row_ids_CONS.raw(), n, total_num_halos, (mat_value_type *) diag); - cudaEventRecord(event); - cudaEventSynchronize(event); - } - } - - cudaCheckError(); - cudaEventDestroy(event); - } // If root partition - - //TODO: is this necessary - comms->barrier(); - } - - //3. 
root does a exclusive_scan - if (is_root_partition) - { - cudaEvent_t event; - cudaEventCreate(&event); - // Populate the halo rows with diagonal, increase the length of the halo rows - thrust::fill(this->A->row_offsets.begin() + halo_offsets[0], this->A->row_offsets.begin() + halo_offsets[root_num_cons_neighbors], 1); - thrust_wrapper::exclusive_scan(this->A->row_offsets.begin(), this->A->row_offsets.end(), this->A->row_offsets.begin()); - cudaEventRecord(event); - cudaEventSynchronize(event); - cudaCheckError(); - this->A->set_initialized(0); - this->A->delProps(DIAG); // We always insert the diagonal - this->A->delProps(COO); // No COO - this->A->setColsReorderedByColor(false); // Cols not reordered by color - int nnz = this->A->row_offsets[root_num_rows]; // This is a device to host copy - this->A->resize(root_num_rows, root_num_rows, nnz, block_dimx, block_dimy); - this->A->set_num_nz(nnz); // num_nz doesn't include halo rows - //this->A->set_initialized(1); - cudaEventDestroy(event); - } - else - { - this->A->set_initialized(0); - this->A->resize( 0, 0, 0, block_dimx, block_dimy ); - this->A->delProps(DIAG); // We always insert the diagonal - this->A->delProps(COO); // No COO - this->A->setColsReorderedByColor(false); // Cols not reordered by color - //this->A->set_initialized(1); - } - - if (useCudaIpc) - { - // ---------------------------------------------- - // 4. Do ipc consolidation of values and columns - // ---------------------------------------------- - // Child partition waits for parent to create row_offsets - this->ipcWaitForRoot(is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms); - void *root_col_ptr = (void *) this->A->col_indices.raw(); - void *root_val_ptr = (void *) this->A->values.raw(); - this->ipcExchangePtr(root_col_ptr, is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms); - this->ipcExchangePtr(root_val_ptr, is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms); - int cta_size2 = 128; - int grid_size2 = min(4096, (n + cta_size2 - 1) / cta_size2); - ipc_consolidation_upload_matrix <<< grid_size2, cta_size2>>>(n, this->m_row_ids_CONS.raw(), this->m_old_row_offsets_CONS.raw(), ( (int *) root_row_ptr ) /*IPC*/, col_indices_hd, ( (int *) root_col_ptr) /*IPC*/, data_hd, ( (mat_value_type *) root_val_ptr ) /*IPC*/, diag_hd, bsize); - cudaCheckError(); - // Root partition waits for children to upload their matrices - this->ipcWaitForChildren(is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms); - cudaCheckError(); - - // Child partitions close their mem handle (they are done upload data) - if (!is_root_partition) - { - cudaIpcCloseMemHandle(root_row_ptr); - cudaIpcCloseMemHandle(root_val_ptr); - cudaIpcCloseMemHandle(root_col_ptr); - } - } - else // If cudaIpcNotAvailable - { - if (!is_root_partition) - { - int dummy; - void *col_indices_to_send = this->getHostPointerForData((void *)col_indices, sizeof(index_type) * nnz, &dummy); - comms->send_raw_data(col_indices_to_send, nnz * sizeof(int), my_destination_part, 10000 + my_id); - void *data_to_send = this->getHostPointerForData((void *)data, sizeof(mat_value_type) * nnz, &dummy); - comms->send_raw_data(data_to_send, nnz * bsize * sizeof(mat_value_type), my_destination_part, 10001 + my_id); - - if (diag != NULL) - { - void *diag_to_send = this->getHostPointerForData((void *)diag, sizeof(mat_value_type) * n, &dummy); - 
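- /* A minimal sketch (hypothetical buffer name and size) of the mapped, pinned staging
- the root branch below uses so kernels can consume received child data without an
- explicit cudaMemcpy:
-
- int *host_buf, *dev_alias;
- cudaHostAlloc((void **)&host_buf, bytes, cudaHostAllocMapped);
- cudaHostGetDevicePointer(&dev_alias, host_buf, 0);
- // fill host_buf (e.g. via comms->recv_raw_data), then launch kernels on dev_alias
- cudaFreeHost(host_buf);
- */
-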
-    if (useCudaIpc)
-    {
-        // ----------------------------------------------
-        // 4. Do ipc consolidation of values and columns
-        // ----------------------------------------------
-        // Child partition waits for parent to create row_offsets
-        this->ipcWaitForRoot(is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms);
-        void *root_col_ptr = (void *) this->A->col_indices.raw();
-        void *root_val_ptr = (void *) this->A->values.raw();
-        this->ipcExchangePtr(root_col_ptr, is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms);
-        this->ipcExchangePtr(root_val_ptr, is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms);
-        int cta_size2 = 128;
-        int grid_size2 = min(4096, (n + cta_size2 - 1) / cta_size2);
-        ipc_consolidation_upload_matrix <<< grid_size2, cta_size2>>>(n, this->m_row_ids_CONS.raw(), this->m_old_row_offsets_CONS.raw(), ( (int *) root_row_ptr ) /*IPC*/, col_indices_hd, ( (int *) root_col_ptr) /*IPC*/, data_hd, ( (mat_value_type *) root_val_ptr ) /*IPC*/, diag_hd, bsize);
-        cudaCheckError();
-        // Root partition waits for children to upload their matrices
-        this->ipcWaitForChildren(is_root_partition, num_parts_to_consolidate, parts_to_consolidate, my_destination_part, my_id, comms);
-        cudaCheckError();
-
-        // Child partitions close their mem handle (they are done upload data)
-        if (!is_root_partition)
-        {
-            cudaIpcCloseMemHandle(root_row_ptr);
-            cudaIpcCloseMemHandle(root_val_ptr);
-            cudaIpcCloseMemHandle(root_col_ptr);
-        }
-    }
-    else // If cudaIpcNotAvailable
-    {
-        if (!is_root_partition)
-        {
-            int dummy;
-            void *col_indices_to_send = this->getHostPointerForData((void *)col_indices, sizeof(index_type) * nnz, &dummy);
-            comms->send_raw_data(col_indices_to_send, nnz * sizeof(int), my_destination_part, 10000 + my_id);
-            void *data_to_send = this->getHostPointerForData((void *)data, sizeof(mat_value_type) * nnz, &dummy);
-            comms->send_raw_data(data_to_send, nnz * bsize * sizeof(mat_value_type), my_destination_part, 10001 + my_id);
-
-            if (diag != NULL)
-            {
-                void *diag_to_send = this->getHostPointerForData((void *)diag, sizeof(mat_value_type) * n, &dummy);
-                comms->send_raw_data(diag_to_send, n * bsize * sizeof(mat_value_type), my_destination_part, 10002 + my_id);
-            }
-        }
-        else
-        {
-            cudaEvent_t event;
-            cudaEventCreate(&event);
-            //TODO: Could use streams here
-            int *child_col_indices;
-            mat_value_type *child_data;
-            mat_value_type *child_diag = NULL;
-            cudaHostAlloc( (void **) &child_col_indices, this->m_child_max_nnz * sizeof(int), cudaHostAllocMapped);
-            cudaHostAlloc( (void **) &child_data, this->m_child_max_nnz * bsize * sizeof(mat_value_type), cudaHostAllocMapped);
-
-            if (diag != NULL)
-            {
-                cudaHostAlloc( (void **) &child_diag, (this->m_child_max_n)*bsize * sizeof(mat_value_type), cudaHostAllocMapped);
-            }
-
-            for (int i = 0; i < num_parts_to_consolidate; i++)
-            {
-                int current_part = parts_to_consolidate[i];
-                int cta_size2 = 128;
-                int grid_size2 = min(4096, (this->m_child_n[i] + cta_size2 - 1) / cta_size2);
-
-                if (current_part != my_id)
-                {
-                    comms->recv_raw_data(child_col_indices, this->m_child_nnz[i]*sizeof(int), current_part, 10000 + current_part);
-                    comms->recv_raw_data(child_data, this->m_child_nnz[i]*bsize * sizeof(mat_value_type), current_part, 10001 + current_part);
-
-                    if (diag != NULL)
-                    {
-                        comms->recv_raw_data(child_diag, this->m_child_n[i]*bsize * sizeof(mat_value_type), current_part, 10002 + current_part);
-                    }
-
-                    int *child_col_indices_hd;
-                    mat_value_type *child_data_hd;
-                    mat_value_type *child_diag_hd = NULL;
-                    cudaHostGetDevicePointer(&child_col_indices_hd, child_col_indices, 0);
-                    cudaHostGetDevicePointer(&child_data_hd, child_data, 0);
-
-                    if (diag != NULL)
-                    {
-                        cudaHostGetDevicePointer(&child_diag_hd, child_diag, 0);
-                    }
-
-                    ipc_consolidation_upload_matrix <<< grid_size2, cta_size2>>>(this->m_child_n[i], this->m_child_row_ids[i].raw(), this->m_child_old_row_offsets[i].raw(), this->A->row_offsets.raw(), child_col_indices_hd, this->A->col_indices.raw(), child_data_hd, this->A->values.raw(), child_diag_hd, bsize);
-                    // Wait for kernel to finish before overwriting host buffer
-                    cudaEventRecord(event);
-                    cudaEventSynchronize(event);
-                }
-                else
-                {
-                    ipc_consolidation_upload_matrix <<< grid_size2, cta_size2>>>(n, this->m_row_ids_CONS.raw(), this->m_old_row_offsets_CONS.raw(), this->A->row_offsets.raw(), col_indices_hd, this->A->col_indices.raw(), data_hd, this->A->values.raw(), diag_hd, bsize);
-                    cudaEventRecord(event);
-                    cudaEventSynchronize(event);
-                }
-            }
-
-            cudaCheckError();
-            cudaEventDestroy(event);
-            cudaFreeHost(child_col_indices);
-            cudaFreeHost(child_data);
-
-            if (diag != NULL)
-            {
-                cudaFreeHost(child_diag);
-            }
-        } // If root partition
-
-        //TODO: is this necessary
-        comms->barrier();
-    }
-
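The per-row scatter that ipc_consolidation_upload_matrix performs is not itself shown in this patch. A hypothetical kernel with the same shape — copy each child row, addressed by its own offsets, into the consolidated CSR at row_map[i]; simplified to scalar values and no separate diagonal — would look roughly like:

// Sketch only: illustrates the row-by-row copy idea, not the AMGX kernel.
__global__ void upload_rows(int n, const int *row_map,
                            const int *child_offsets, const int *child_cols, const double *child_vals,
                            const int *cons_offsets, int *cons_cols, double *cons_vals)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
    {
        int src = child_offsets[i];
        int dst = cons_offsets[row_map[i]];
        int len = child_offsets[i + 1] - src;

        for (int k = 0; k < len; k++)
        {
            cons_cols[dst + k] = child_cols[src + k]; // columns may still need remapping
            cons_vals[dst + k] = child_vals[src + k];
        }
    }
}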
-    // Store the original fine level communicator
-    this->m_is_fine_level_root_partition = is_root_partition;
-    this->m_my_fine_level_destination_part = my_destination_part;
-    // Create a clone of the original communicator
-    this->m_fine_level_comms = comms; //this->_comms is the same pointer that this->m_fine_level_comms right now, so we can overwrite this->_comms, but make sure that we release m_fine_level_cons
-    this->_comms = this->m_fine_level_comms->CloneSubComm(cons_part_to_part, is_root_partition); // this->_comms will be empty comm for non-root partition and new comm for root ranks only if root partition
-    this->m_fine_level_id = my_id;
-
-    if (is_root_partition)
-    {
-        int cta_size = 128;
-        int grid_size3 = min(4096, ( (root_num_rows - halo_offsets[0]) + cta_size - 1) / cta_size);
-
-        if (grid_size3 != 0)
-        {
-            set_halo_cols_values <<< grid_size3, cta_size>>>(this->A->row_offsets.raw(), this->A->col_indices.raw(), this->A->values.raw(), halo_offsets[0], root_num_rows, bsize);
-            cudaCheckError();
-        }
-
-        int my_cons_id = part_to_cons_part[my_id];
-        this->_global_id = my_cons_id;
-        this->_num_interior_nodes = total_interior_rows_in_merged;
-        this->_num_boundary_nodes = total_boundary_rows_in_merged;
-
-        for (int i = 0; i < root_num_cons_neighbors; i++)
-        {
-            root_cons_neighbors[i] = part_to_cons_part[root_cons_neighbors[i]];
-        }
-
-        this->_comms->set_neighbors(root_num_cons_neighbors);
-        this->neighbors = root_cons_neighbors;
-        this->halo_offsets = halo_offsets; // H2D copy of halo offsets
-        this->m_num_fine_level_parts_to_consolidate = num_parts_to_consolidate;
-        this->m_fine_level_parts_to_consolidate = parts_to_consolidate;
-        this->set_num_halo_rings(num_rings);
-        this->set_num_halo_rows(halo_offsets[root_num_cons_neighbors] - halo_offsets[0]);
-        // B2L_maps has already been copied
-        this->B2L_rings.resize(root_num_cons_neighbors);
-
-        for (int i = 0; i < root_num_cons_neighbors; i++)
-        {
-            this->B2L_rings[i].resize(2);
-            this->B2L_rings[i][0] = 0;
-            this->B2L_rings[i][1] = cons_B2L_maps[i].size();
-        }
-
-        this->set_initialized(this->A->row_offsets);
-        this->A->set_initialized(0);
-        this->A->delProps(DIAG);
-        this->A->diag.resize(root_num_rows);
-        this->A->computeDiagonal(); //
-        this->A->setView(OWNED);
-        cudaEventCreate(&(this->comm_event));
-        this->A->set_initialized(1);
-    }
-    else
-    {
-        this->neighbors.resize(0);
-        this->halo_offsets.resize(0);
-    }
-
-    /* free memory (if needed) */
-    if (col_alloc) { cudaFree(col_indices_hd); }
-    if (data_alloc) { cudaFree(data_hd); }
-    if (diag_alloc) { cudaFree(diag_hd); }
-}
-
-template
-void DistributedManager >::replaceMatrixCoefficientsNoCons(int n, int nnz, const mat_value_type *data_pinned, const mat_value_type *diag_pinned)
-{
-    //matrix parameters
-    //int num_nnz = this->A->get_num_nz();
-    int num_rows = this->halo_offsets[0];
-    int total_rows = num_rows + this->num_halo_rows();
-    int block_size = this->A->get_block_size();
-    mat_value_type *data_hd = NULL;
-    mat_value_type *diag_hd = NULL;
-    int data_alloc = 0;
-    int diag_alloc = 0;
-    //cuda parameters
-    int num_blocks = min(4096, (num_rows + 127) / 128);
-
-    /* WARNING: the number of non-zero elements (nnz) in the array data_pinned and A->values (num_nnz) might be different at this point.
-       1. If the matrix has CSR property and therefore diagonal is included in the matrix this values will be the same.
-       2. If the matrix has DIAG property and therefore diagonal is originally stored separately, and later appended to the array
-       of values, and subsequently inserted into the matrix than num_nnz = nnz + n. We have to account for this fact when replacing the
-       coefficients (and use nnz not num_nnz).
-       obs.: see calls to computeDiagonal (matrix.cu), AMGX_matrix_upload and AMGX_replace_coefficients (amgx_c.cu), and
-       uploadMatrix and replaceMatrixCoefficients[No|With]Cons (distributed_manager.cu) for details.
-    */
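A small worked example of the bookkeeping the WARNING above describes (the numbers are hypothetical):

#include <cstdio>

int main()
{
    const int n = 4;    // rows (hypothetical)
    const int nnz = 10; // user-supplied non-zeros (hypothetical)

    // CSR property: diagonal entries are part of the sparsity pattern,
    // so the stored values array has exactly nnz entries.
    int num_nnz_inside = nnz;

    // DIAG property: the diagonal is kept separately and appended to the
    // values array on assembly, so storage grows to nnz + n even though a
    // coefficient replacement must copy only the first nnz user values.
    int num_nnz_appended = nnz + n;

    printf("inside: %d, appended: %d\n", num_nnz_inside, num_nnz_appended);
    return 0;
}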
-
-    /* check early exit */
-    if ((this->neighbors.size() == 0 || this->renumbering.size() == 0) && !this->m_is_fine_level_glued)
-    {
-        return;
-    }
-
-    cudaCheckError();
-    /* allocate if data and diag if they are not pinned */
-    data_hd = (mat_value_type *) this->getDevicePointerForData((void *)data_pinned, nnz * block_size * sizeof(mat_value_type), &data_alloc);
-
-    if (diag_pinned != NULL)
-    {
-        diag_hd = (mat_value_type *) this->getDevicePointerForData((void *)diag_pinned, num_rows * block_size * sizeof(mat_value_type), &diag_alloc);
-    }
-
-    /* replace the values (reordering them if needed) */
-    if (insertDiagonals && diag_pinned != NULL)
-    {
-        replace_values_matrix <32> <<< num_blocks, 512>>>(data_hd, diag_hd, this->old_row_offsets.raw(), this->A->row_offsets.raw(), this->A->values.raw(), this->renumbering.raw(), block_size, num_rows);
-    }
-    else
-    {
-        replace_values_matrix <32> <<< num_blocks, 512>>>(data_hd, this->old_row_offsets.raw(), this->A->row_offsets.raw(), this->A->values.raw(), this->renumbering.raw(), block_size, num_rows);
+    replace_values_matrix <32> <<< num_blocks, 512>>>(data_hd, this->old_row_offsets.raw(), this->A->row_offsets.raw(), this->A->values.raw(), this->renumbering.raw(), block_size, num_rows);
 
     if (diag_pinned != NULL)
     {
@@ -4720,461 +2907,19 @@ void DistributedManager
-void DistributedManager >::replaceMatrixCoefficientsWithCons(int n, int nnz, const mat_value_type *data_pinned, const mat_value_type *diag_pinned)
-{
-    //matrix parameters
-    //int num_nnz = this->A->get_num_nz();
-    /* WARNING: in consolidation, for non-root partitions, halo_offsets
-       might be NULL due to the call halo_offsets.resize(0); at the end
-       of the routine uploadMatrix->consolidateAndUploadAll. We should
-       use the parameter n instead this->halo_offsets[0] for num_rows.
-    */
-    int num_rows = n;
-    int block_size = this->A->get_block_size();
-    mat_value_type *data_hd = NULL;
-    mat_value_type *diag_hd = NULL;
-    int data_alloc = 0;
-    int diag_alloc = 0;
-    data_hd = (mat_value_type *) this->getDevicePointerForData((void *)data_pinned, nnz * block_size * sizeof(mat_value_type), &data_alloc);
-
-    if (diag_pinned != NULL)
-    {
-        diag_hd = (mat_value_type *) this->getDevicePointerForData((void *)diag_pinned, num_rows * block_size * sizeof(mat_value_type), &diag_alloc);
-    }
-
-    bool useCudaIpc = this->m_use_cuda_ipc_consolidation;
-
-    if (useCudaIpc)
-    {
-        // Child partitions wait for root to be done
-        this->ipcWaitForRoot(this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        cudaCheckError();
-        void *root_row_ptr = (void *) this->A->row_offsets.raw();
-        void *root_val_ptr = (void *) this->A->values.raw();
-        this->ipcExchangePtr(root_row_ptr, this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        this->ipcExchangePtr(root_val_ptr, this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        // replace the values, insert the diagonal
-        int ncons = this->m_old_row_offsets_CONS.size() - 1;
-        int cta_size = 128;
-        int grid_size2 = min(4096, (ncons + cta_size - 1) / cta_size);
-        ipc_consolidation_replace_values <<< grid_size2, cta_size>>>(ncons, this->m_row_ids_CONS.raw(), this->m_old_row_offsets_CONS.raw(), ( (int *) root_row_ptr )/*IPC*/, data_hd, ( (mat_value_type *) root_val_ptr )/*IPC*/, diag_hd, this->A->get_block_size() );
-        cudaCheckError();
-        // Root partition wait for child to be done replacing their values
-        this->ipcWaitForChildren(this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        cudaCheckError();
-
-        if (!this->m_is_fine_level_root_partition)
-        {
-            cudaIpcCloseMemHandle(root_row_ptr);
-            cudaIpcCloseMemHandle(root_val_ptr);
-        }
-    }
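The ipcExchangePtr/ipcWaitFor* helpers wrap the CUDA IPC runtime calls whose close path is visible above. A sketch of the underlying handle round trip between two same-node processes (the send_raw/recv_raw transport helpers are stand-ins, not AMGX functions):

#include <cuda_runtime.h>

// Root process: publish a handle to its device allocation.
void root_side(float *d_values /*, some IPC channel */)
{
    cudaIpcMemHandle_t handle;
    cudaIpcGetMemHandle(&handle, d_values);
    // send_raw(&handle, sizeof(handle));   // hand the handle to the child process
    // ... wait until children are done writing ...
}

// Child process: map the root's allocation into its own address space.
void child_side(/* some IPC channel */)
{
    cudaIpcMemHandle_t handle;
    // recv_raw(&handle, sizeof(handle));
    void *root_ptr = NULL;
    cudaIpcOpenMemHandle(&root_ptr, handle, cudaIpcMemLazyEnablePeerAccess);
    // ... write results directly into the root's buffer ...
    cudaIpcCloseMemHandle(root_ptr); // mirrors the close calls in the code above
}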
-    else // cudaIpcNotAvailable
-    {
-        if (this->m_is_fine_level_consolidated) // aggregation
-        {
-            int bsize = this->A->get_block_size();
-            int ncons = this->m_old_row_offsets_CONS.size() - 1;
-
-            if (!this->m_is_fine_level_root_partition)
-            {
-                int dummy;
-                int nnzcons = this->m_old_nnz_CONS;
-                void *data_to_send = this->getHostPointerForData((void *)data_pinned, nnzcons * bsize * sizeof(mat_value_type), &dummy);
-                this->getFineLevelComms()->send_raw_data(data_to_send, nnzcons * bsize * sizeof(mat_value_type), this->m_my_fine_level_destination_part, 10001 + this->fine_level_id());
-
-                if (diag_pinned != NULL)
-                {
-                    void *diag_to_send = this->getHostPointerForData((void *)diag_pinned, ncons * bsize * sizeof(mat_value_type), &dummy);
-                    this->getFineLevelComms()->send_raw_data(diag_to_send, ncons * bsize * sizeof(mat_value_type), this->m_my_fine_level_destination_part, 10002 + this->fine_level_id());
-                }
-            }
-            else
-            {
-                cudaEvent_t event;
-                cudaEventCreate(&event);
-                //TODO: Could use streams here
-                mat_value_type *child_data;
-                mat_value_type *child_diag = NULL;
-                cudaHostAlloc( (void **) &child_data, this->m_child_max_nnz * bsize * sizeof(mat_value_type), cudaHostAllocMapped);
-
-                if (diag_pinned != NULL)
-                {
-                    cudaHostAlloc( (void **) &child_diag, (this->m_child_max_n)*bsize * sizeof(mat_value_type), cudaHostAllocMapped);
-                }
-
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-                    int cta_size2 = 128;
-                    int grid_size2 = min(4096, (this->m_child_n[i] + cta_size2 - 1) / cta_size2);
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        this->getFineLevelComms()->recv_raw_data(child_data, this->m_child_nnz[i]*bsize * sizeof(mat_value_type), current_part, 10001 + current_part);
-
-                        if (diag_pinned != NULL)
-                        {
-                            this->getFineLevelComms()->recv_raw_data(child_diag, this->m_child_n[i]*bsize * sizeof(mat_value_type), current_part, 10002 + current_part);
-                        }
-
-                        mat_value_type *child_data_hd;
-                        mat_value_type *child_diag_hd = NULL;
-                        cudaHostGetDevicePointer(&child_data_hd, child_data, 0);
-
-                        if (diag_pinned != NULL)
-                        {
-                            cudaHostGetDevicePointer(&child_diag_hd, child_diag, 0);
-                        }
-
-                        ipc_consolidation_replace_values <<< grid_size2, cta_size2>>>(this->m_child_n[i], this->m_child_row_ids[i].raw(), this->m_child_old_row_offsets[i].raw(), this->A->row_offsets.raw(), child_data_hd, this->A->values.raw(), child_diag_hd, bsize);
-                        // Wait for kernel to finish before overwriting host buffer
-                        cudaEventRecord(event);
-                        cudaEventSynchronize(event);
-                    }
-                    else
-                    {
-                        ipc_consolidation_replace_values <<< grid_size2, cta_size2>>>(ncons, this->m_row_ids_CONS.raw(), this->m_old_row_offsets_CONS.raw(), this->A->row_offsets.raw(), data_hd, this->A->values.raw(), diag_hd, bsize);
-                        //cudaEventRecord(event);
-                        //cudaEventSynchronize(event);
-                    }
-                }
-
-                cudaCheckError();
-                cudaEventDestroy(event);
-                cudaFreeHost(child_data);
-
-                if (diag_pinned != NULL)
-                {
-                    cudaFreeHost(child_diag);
-                }
-            } // If root partition
-
-            //TODO: is this necessary
-            this->getFineLevelComms()->barrier();
-        } //agg
-        else if (this->m_is_fine_level_glued) // classical
-        {
-            int bsize = this->A->get_block_size();
-            int ncons = this->m_old_row_offsets_CONS.size() - 1;
-            IVector_h nnz_off;
-            nnz_off.resize(this->getConsolidationArrayOffsets().size());
-            IVector_h nnz_array;
-            nnz_array.resize(this->getConsolidationArrayOffsets().size() - 1);
-            this->getFineLevelComms()->all_gather( nnz,
-                    nnz_array,
-                    this->getConsolidationArrayOffsets().size() - 1);
-            nnz_off[0] = 0;
-
-            for (int i = 0; i < nnz_array.size(); i++)
-            {
-                nnz_off[i + 1] = nnz_off[i] + nnz_array[i];
-            }
-
-            if (!this->m_is_fine_level_root_partition)
-            {
-                int dummy;
-                void *data_to_send = this->getHostPointerForData((void *)data_pinned, nnz * bsize * sizeof(mat_value_type), &dummy);
-                this->getFineLevelComms()->send_raw_data( data_to_send,
-                        nnz * bsize * sizeof(mat_value_type),
-                        this->m_my_fine_level_destination_part,
-                        10001 + this->fine_level_id());
-
-                if (diag_pinned != NULL)
-                {
-                    void *diag_to_send = this->getHostPointerForData((void *)diag_pinned, n * sizeof(mat_value_type), &dummy);
-                    this->getFineLevelComms()->send_raw_data( diag_to_send,
-                            n * bsize * sizeof(mat_value_type),
-                            this->m_my_fine_level_destination_part,
-                            10002 + this->fine_level_id());
-                    //diag.resize(0);
-                    cudaCheckError();
-                }
-
-                //values.resize(0);
-                cudaCheckError();
-            }
-            else
-            {
-                //TODO: Could use streams here
-                mat_value_type *child_data;
-                mat_value_type *child_diag = NULL;
-                // Assumes partions have been glued already
-                this->A->getNnzForView(OWNED, &nnz);
-                cudaHostAlloc( (void **) &child_data, nnz * bsize * sizeof(mat_value_type), cudaHostAllocMapped);
-
-                if (diag_pinned != NULL)
-                {
-                    cudaHostAlloc( (void **) &child_diag, this->halo_offsets[this->neighbors.size()]*bsize * sizeof(mat_value_type), cudaHostAllocMapped);
-                }
-
-                // roots copy their data
-                memcpy ( &child_data[0], data_pinned, nnz_array[this->fine_level_id()]*sizeof(value_type));
-
-                if (diag_pinned != NULL)
-                {
-                    memcpy ( &child_diag[0], diag_pinned, n * sizeof(value_type));
-                }
-
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-                    int current_offset = nnz_off[current_part] - nnz_off[this->fine_level_id()] ;
-                    int current_nnz = nnz_array[current_part];
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        this->getFineLevelComms()->recv_raw_data( &child_data[current_offset],
-                                current_nnz * bsize * sizeof(mat_value_type),
-                                current_part,
-                                10001 + current_part);
-
-                        if (diag_pinned != NULL)
-                            this->getFineLevelComms()->recv_raw_data( &child_diag[this->getConsolidationArrayOffsets()[current_part] - this->getConsolidationArrayOffsets()[this->fine_level_id()]],
-                                    (this->getConsolidationArrayOffsets()[current_part + 1] - this->getConsolidationArrayOffsets()[current_part])*bsize * sizeof(mat_value_type),
-                                    current_part,
-                                    10002 + current_part);
-                    }
-                }
-
-                cudaCheckError();
-                // we can follow the usual upload path for raw data now
-                // Assumes partions have been glued already
-                int os;
-                this->A->getOffsetAndSizeForView(OWNED, &os, &n);
-                replaceMatrixCoefficientsNoCons( n, nnz, child_data, child_diag);
-                cudaCheckError();
-                cudaFreeHost(child_data);
-
-                if (diag_pinned != NULL)
-                {
-                    cudaFreeHost(child_diag);
-                }
-            } // If root partition
-
-            //TODO: is this necessary
-            this->getFineLevelComms()->barrier();
-        } // cla
-    } // not ipc
-
-    this->A->setView(OWNED);
-
-    /* free memory (if needed) */
-    if (data_alloc) { cudaFree(data_hd); }
-
-    if (diag_alloc) { cudaFree(diag_hd); }
-}
-
 template
 void DistributedManager >::transformAndUploadVector(VVector_v &v, const void *data, int n, int block_dim)
 {
-    if (this->isFineLevelConsolidated() || (this->isFineLevelGlued() && !this->isGlued()))
-    {
-        transformAndUploadVectorWithCons(v, data, n, block_dim);
-    }
-    else
-    {
-        v.resize(n * block_dim);
-        cudaCheckError();
-        // Upload on host
-        cudaMemcpy(v.raw(), (value_type *)data, n * block_dim * sizeof(value_type), cudaMemcpyDefault);
-        cudaCheckError();
-        // Permute based on renumbering vector
-        transformVector(v);
-        int tag = 0;
-        // Exchange halos
-        this->exchange_halo(v, tag);
-    }
-}
-
-template
-void DistributedManager >::transformAndUploadVectorWithCons(VVector_v &v, const void *data_pinned, int n, int block_dim)
-{
-    if (v.get_block_size() != this->A->get_block_dimx()) { printf("Blocksize mismatch!\n"); }
-
-    bool useCudaIpc = this->m_use_cuda_ipc_consolidation;
-    this->getFineLevelComms()->barrier();
-    void *root_temp_ptr = NULL;
-    VVector_v temp;
-
-    if (this->m_is_fine_level_root_partition && !this->m_is_fine_level_glued )
-    {
-        temp.resize(this->halo_offsets[this->neighbors.size()]*v.get_block_size(), types::util::get_zero());
-        temp.set_block_dimx(v.get_block_dimx());
-        temp.set_block_dimy(v.get_block_dimy());
-        root_temp_ptr = (void *) temp.raw();
-    }
-
+    v.resize(n * block_dim);
     cudaCheckError();
-    int data_alloc = 0;
-    value_type *data_hd = NULL;
-
-    if (!this->m_is_fine_level_glued )
-    {
-        data_hd = (value_type *) this->getDevicePointerForData((void *)data_pinned, n * block_dim * sizeof(value_type), &data_alloc);
-    }
-
-    if (useCudaIpc)
-    {
-        // Do IPC
-        this->ipcExchangePtr(root_temp_ptr, this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        cudaCheckError();
-        int num_blocks = min(4096, (n + 511) / 512);
-        reorder_vector_values <<< num_blocks, 512>>>( (value_type *) root_temp_ptr, data_hd, this->m_row_ids_CONS.raw(), v.get_block_size(), n);
-        // Root partition waits for children to be done
-        this->ipcWaitForChildren(this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        cudaCheckError();
-
-        if (!this->m_is_fine_level_root_partition)
-        {
-            cudaIpcCloseMemHandle(root_temp_ptr);
-        }
-    }
-    else // If cudaIpcNotAvail
-    {
-        if (this->m_is_fine_level_consolidated) // aggregation
-        {
-            // Exchange the vector between root and child
-            if (!this->m_is_fine_level_root_partition)
-            {
-                IVector_h size(1);
-                size[0] = n;
-                this->getFineLevelComms()->send_vector(size, this->m_my_fine_level_destination_part, 20000 + this->fine_level_id());
-                int dummy;
-                void *data_to_send = this->getHostPointerForData((void *)data_pinned, n * v.get_block_size() * sizeof(value_type), &dummy);
-                this->getFineLevelComms()->send_raw_data(data_to_send, n * v.get_block_size()*sizeof(value_type), this->m_my_fine_level_destination_part, 20001 + this->fine_level_id());
-            }
-            else
-            {
-                cudaEvent_t event;
-                cudaEventCreate(&event);
-                IVector_h child_n(this->m_num_fine_level_parts_to_consolidate);
-                int max_n = 0;
-
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        this->getFineLevelComms()->recv_vector(child_n, current_part, 20000 + current_part, i, 1);
-                    }
-                    else
-                    {
-                        child_n[i] = n;
-                    }
-
-                    if (child_n[i] > max_n) { max_n = child_n[i]; }
-                }
-
-                value_type *child_data;
-                cudaHostAlloc( (void **) &child_data, max_n * v.get_block_size()*sizeof(value_type), cudaHostAllocMapped);
-                value_type *child_data_hd;
-                cudaHostGetDevicePointer(&child_data_hd, child_data, 0);
-
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-                    int num_blocks = min(4096, (child_n[i] + 511) / 512);
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        this->getFineLevelComms()->recv_raw_data(&child_data[0], child_n[i]*v.get_block_size()*sizeof(value_type), current_part, 20001 + current_part);
-                        reorder_vector_values <<< num_blocks, 512>>>( (value_type *) root_temp_ptr, child_data_hd, this->m_child_row_ids[i].raw(), v.get_block_size(), child_n[i]);
-                        cudaEventRecord(event);
-                        cudaEventSynchronize(event);
-                        cudaCheckError();
-                    }
-                    else
-                    {
-                        reorder_vector_values <<< num_blocks, 512>>>( (value_type *) root_temp_ptr, data_hd, this->m_row_ids_CONS.raw(), v.get_block_size(), n);
-                    }
-                } // Loop over parts to consolidate
-
-                cudaCheckError();
-                cudaEventDestroy(event);
-                cudaFreeHost(child_data);
-            } // If root partition
-        } //agg
-        else if (this->m_is_fine_level_glued) // cla
-        {
-            value_type *child_data = NULL;
-
-            if (!this->m_is_fine_level_root_partition)
-            {
-                int dummy;
-                void *data_to_send = this->getHostPointerForData((void *)data_pinned, n * v.get_block_size() * sizeof(value_type), &dummy);
-                this->getFineLevelComms()->send_raw_data( data_to_send,
-                        n * v.get_block_size()*sizeof(value_type),
-                        this->m_my_fine_level_destination_part,
-                        20001 + this->fine_level_id());
-                //v.resize(0); // just in case something resized it betwen iterations
-                cudaCheckError();
-            }
-            else
-            {
-                cudaHostAlloc( (void **) &child_data, this->halo_offsets[this->neighbors.size()]*v.get_block_size()*sizeof(value_type), cudaHostAllocMapped);
-                value_type *child_data_hd;
-                cudaHostGetDevicePointer(&child_data_hd, child_data, 0);
-                // roots copy their data
-                int dummy;
-                void *my_data = this->getHostPointerForData((void *)data_pinned, n * v.get_block_size() * sizeof(value_type), &dummy);
-                memcpy ( &child_data[0], data_pinned, n * v.get_block_size()*sizeof(value_type));
-
-                // Loop over parts to consolidate
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        this->getFineLevelComms()->recv_raw_data( &child_data[this->getConsolidationArrayOffsets()[current_part] - this->getConsolidationArrayOffsets()[this->fine_level_id()]],
-                                sizeof(value_type) * (this->getConsolidationArrayOffsets()[current_part + 1] - this->getConsolidationArrayOffsets()[current_part]),
-                                current_part,
-                                20001 + current_part );
-                    }
-                }
-
-                // usual path
-                // Upload on host
-                cudaMemcpy(v.raw(), (value_type *)child_data, v.size()* sizeof(value_type), cudaMemcpyDefault);
-                cudaCheckError();
-            } // If root partition
-
-            // Permute based on renumbering vector
-            transformVector(v);
-            cudaCheckError();
-            // Exchange halos
-            int tag = 0;
-            this->exchange_halo(v, tag);
-            cudaCheckError();
-            v.set_unconsolidated_size(n);
-
-            // free host
-            if (child_data) { cudaFreeHost(child_data); }
-
-            cudaCheckError();
-        } //cla
-    } // If cudaIpcAvailable
-
-    if (!this->m_is_fine_level_glued) // not needed for classcical
-    {
-        if (this->m_is_fine_level_root_partition)
-        {
-            v.swap(temp);
-            int tag = 0;
-            // Root partitions do the exchange
-            this->exchange_halo(v, tag);
-        }
-
-        v.set_unconsolidated_size(n * v.get_block_size());
-        v.set_transformed();
-    }
-
-    /* free memory (if needed) */
-    if (data_alloc) { cudaFree(data_hd); }
-
+    // Upload on host
+    cudaMemcpy(v.raw(), (value_type *)data, n * block_dim * sizeof(value_type), cudaMemcpyDefault);
     cudaCheckError();
+    // Permute based on renumbering vector
+    transformVector(v);
+    int tag = 0;
+    // Exchange halos
+    this->exchange_halo(v, tag);
 }
 
 template
@@ -5237,109 +2982,40 @@ void DistributedManagerhalo_offsets[0];
     int num_blocks = min(4096, (size + 511) / 512);
     inverse_reorder_vector_values <<< num_blocks, 512>>>(temp.raw(), v.raw(), this->renumbering.raw(), v.get_block_size(), size);
-    //reorder_vector_values<<>>(temp.raw(), v.raw(), this->inverse_renumbering.raw(), v.get_block_size(), size);
     cudaCheckError();
     v.resize(this->halo_offsets[0]*this->A->get_block_dimx());
     thrust::copy(temp.begin(), temp.end(), v.begin());
     cudaCheckError();
 }
-
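For reference, reorder_vector_values and inverse_reorder_vector_values apply a forward and an inverse permutation given a renumbering map. A simplified sketch of the pair (block size 1; the names here are illustrative, not the AMGX kernels):

__global__ void scatter_by_map(double *out, const double *in, const int *map, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
    {
        out[map[i]] = in[i]; // forward: out[renumbering[i]] = in[i]
    }
}

__global__ void gather_by_map(double *out, const double *in, const int *map, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
    {
        out[i] = in[map[i]]; // inverse: out[i] = in[renumbering[i]]
    }
}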
-template
-void DistributedManagerBase::computeDestinationPartitions(INDEX_TYPE upper_threshold, float avg_size, const int num_parts, int &new_num_parts, bool &wantNeighbors)
+template
+void DistributedManager >::revertAndDownloadVector(VVector_v &v_in, const void *data, int n, int block_dimy)
 {
-    m_destination_partitions.resize(num_parts);
-    std::vector dp(num_parts);
-
-    if (avg_size < 1.f) { avg_size = 1.f; } // avoid floating point exception
-
-    int wanted_num_fine_parts_to_consolidate = ( upper_threshold + (int) avg_size - 1) / ( (int) avg_size );
-    new_num_parts = (num_parts + wanted_num_fine_parts_to_consolidate - 1) / wanted_num_fine_parts_to_consolidate;
-
-    for (int i = 0; i < num_parts; i++)
+    if ( n == 0 )
     {
-        dp[i] = i % new_num_parts;
+        FatalError("Cannot download if size = 0", AMGX_ERR_NOT_IMPLEMENTED);
     }
 
-    // example wantNeighbors = true -> destination_part = [0 0 0 0 4 4 4 4 8 8 8 8]
-    // example wantNeighbors = false -> destination_part = [0 1 2 3 0 1 2 3 0 1 2 3]
-    if (wantNeighbors)
+    if (data == NULL )
     {
-        std::sort (dp.begin(), dp.end());
-        m_destination_partitions[0] = 0;
-
-        for (int i = 1; i < num_parts; i++)
-        {
-            if (dp[i - 1] < dp[i])
-            {
-                m_destination_partitions[i] = i;
-            }
-            else
-            {
-                m_destination_partitions[i] = m_destination_partitions[i - 1];
-            }
-        }
+        FatalError("Cannot download to a NULL pointer", AMGX_ERR_NOT_IMPLEMENTED);
     }
 
-    m_my_destination_part = m_destination_partitions[global_id()];
-}
-
-template
-void DistributedManagerBase::computeDestinationPartitionsWithCons(int my_id, int num_parts, IVector_h &destination_part, DistributedComms *comms)
-{
-    int device_id = this->A->getResources()->getDevice(0);
-    std::string my_hostname_tmp;
-    comms->get_hostname(my_hostname_tmp);
-    // Append PCI-E ID to string
-    cudaDeviceProp dev_prop;
-    cudaGetDeviceProperties(&dev_prop, device_id);
-    std::stringstream s;
-    s << my_hostname_tmp << "_" << dev_prop.pciBusID << "_" << dev_prop.pciDeviceID;
-    std::string my_hostname(s.str());
-    std::vector hostnames;
-    comms->exchange_hostnames(my_hostname, hostnames, num_parts);
-    std::vector::iterator low = std::find( hostnames.begin(), hostnames.end(), my_hostname );
-    int my_destination_part = low - hostnames.begin();
-    // Do a gather into destination_part
-    comms->all_gather(my_destination_part, destination_part, num_parts);
-}
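The removed computeDestinationPartitionsWithCons groups ranks that share a physical GPU by building a hostname-plus-PCI-E identity string and taking the first rank with a matching string as the destination. A sketch of just the identity-building step (obtaining the hostname is assumed to come from the surrounding comms layer):

#include <cuda_runtime.h>
#include <sstream>
#include <string>

// Returns a per-GPU identity such as "node042_3_0"; ranks with equal
// strings would be consolidated onto the same destination partition.
std::string device_identity(int device_id, const std::string &hostname)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device_id);
    std::stringstream s;
    s << hostname << "_" << prop.pciBusID << "_" << prop.pciDeviceID;
    return s.str();
}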
-
-
-template
-void DistributedManager >::revertAndDownloadVector(VVector_v &v_in, const void *data, int n, int block_dimy)
-{
-    if (this->isFineLevelConsolidated() || this->isFineLevelGlued())
+    if (v_in.size() == 0 )
     {
-        revertAndDownloadVectorWithCons(v_in, data, n, block_dimy);
+        FatalError("Cannot download an empty vector", AMGX_ERR_NOT_IMPLEMENTED);
     }
-    else
-    {
-        if ( n == 0 )
-        {
-            FatalError("Cannot download if size = 0", AMGX_ERR_NOT_IMPLEMENTED);
-        }
-
-        if (data == NULL )
-        {
-            FatalError("Cannot download to a NULL pointer", AMGX_ERR_NOT_IMPLEMENTED);
-        }
-
-        if (v_in.size() == 0 )
-        {
-            FatalError("Cannot download an empty vector", AMGX_ERR_NOT_IMPLEMENTED);
-        }
-
-        VVector_v v_out;
-        revertVector(v_in, v_out);
-        cudaMemcpy((value_type *)data, v_out.raw(), n * block_dimy * sizeof(value_type), cudaMemcpyDefault);
-        cudaCheckError();
-    }
+    VVector_v v_out;
+    revertVector(v_in, v_out);
+    cudaMemcpy((value_type *)data, v_out.raw(), n * block_dimy * sizeof(value_type), cudaMemcpyDefault);
+    cudaCheckError();
 }
 
 template
 void DistributedManager >::revertVector(VVector_v &v_in, VVector_v &v_out)
 {
-    if (!this->isFineLevelGlued() && this->neighbors.size() == 0 || this->renumbering.size() == 0) { return;}
+    if (this->neighbors.size() == 0 || this->renumbering.size() == 0) { return;}
 
     if (v_in.get_block_size() != this->A->get_block_dimx()) { printf("Blocksize mismatch!\n"); }
 
@@ -5361,138 +3037,6 @@ void DistributedManager
-void DistributedManager >::revertAndDownloadVectorWithCons(VVector_v &v_in, const void *data_pinned, int n, int block_dimy)
-{
-    if (v_in.get_block_size() != this->A->get_block_dimx()) { printf("Blocksize mismatch!\n"); }
-
-    void *root_v_ptr = NULL;
-    int size = v_in.get_unconsolidated_size();
-    int num_rows = size / v_in.get_block_size();
-
-    if (this->m_is_fine_level_root_partition)
-    {
-        root_v_ptr = (void *) v_in.raw();
-    }
-
-    VVector_v temp;
-    temp.set_block_dimx(v_in.get_block_dimx());
-    temp.set_block_dimy(v_in.get_block_dimy());
-    temp.resize(size);
-    bool useCudaIpc = this->m_use_cuda_ipc_consolidation;
-
-    if (useCudaIpc)
-    {
-        // Do IPC
-        this->ipcExchangePtr(root_v_ptr, this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        // Children partition waits for parent to be done updating vector
-        this->ipcWaitForRoot(this->m_is_fine_level_root_partition, this->m_num_fine_level_parts_to_consolidate, this->m_fine_level_parts_to_consolidate, this->m_my_fine_level_destination_part, this->fine_level_id(), this->getFineLevelComms());
-        cudaCheckError();
-        //reorder based on row permutation
-        int num_blocks = min(4096, (num_rows + 511) / 512);
-        inverse_reorder_vector_values <<< num_blocks, 512>>>( temp.raw(), (value_type *) root_v_ptr, this->m_row_ids_CONS.raw(), v_in.get_block_size(), num_rows);
-        cudaCheckError();
-
-        if (!this->m_is_fine_level_root_partition)
-        {
-            cudaIpcCloseMemHandle(root_v_ptr);
-        }
-    }
-    else
-    {
-        if (this->m_is_fine_level_consolidated) // aggregation
-        {
-            if (this->m_is_fine_level_root_partition)
-            {
-                IVector_h child_n(this->m_num_fine_level_parts_to_consolidate);
-                int max_n = 0;
-
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        this->getFineLevelComms()->recv_vector(child_n, current_part, 30000 + current_part, i, 1);
-                    }
-                    else
-                    {
-                        child_n[i] = num_rows;
-                    }
-
-                    if (child_n[i] > max_n) { max_n = child_n[i]; }
-                }
-
-                // Resize temp vector
-                VVector_v child_temp;;
-                child_temp.resize(max_n * v_in.get_block_size());
-
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-                    // Pack the vector to be sent
-                    int num_blocks = min(4096, (child_n[i] + 511) / 512);
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        inverse_reorder_vector_values <<< num_blocks, 512>>>( child_temp.raw(), (value_type *) root_v_ptr, this->m_child_row_ids[i].raw(), v_in.get_block_size(), child_n[i]);
-                        this->getFineLevelComms()->send_vector(child_temp, current_part, 30001 + current_part, 0, child_n[i]*v_in.get_block_size());
-                    }
-                    else
-                    {
-                        inverse_reorder_vector_values <<< num_blocks, 512>>>( temp.raw(), (value_type *) root_v_ptr, this->m_row_ids_CONS.raw(), v_in.get_block_size(), child_n[i]);
-                    }
-                }
-
-                cudaCheckError();
-            }
-            else
-            {
-                IVector_h size(1);
-                size[0] = num_rows;
-                this->getFineLevelComms()->send_vector(size, this->m_my_fine_level_destination_part, 30000 + this->fine_level_id());
-                this->getFineLevelComms()->recv_vector(temp, this->m_my_fine_level_destination_part, 30001 + this->fine_level_id());
-            }
-        }
-        else if (this->m_is_fine_level_glued) // classical
-        {
-            if (this->m_is_fine_level_root_partition)
-            {
-                temp.resize(v_in.size());
-                revertVector(v_in, temp);
-                cudaCheckError();
-
-                for (int i = 0; i < this->m_num_fine_level_parts_to_consolidate; i++)
-                {
-                    int current_part = this->m_fine_level_parts_to_consolidate[i];
-
-                    if (current_part != this->fine_level_id())
-                    {
-                        this->getFineLevelComms()->send_vector( temp,
-                                current_part,
-                                current_part + 30001,
-                                this->getConsolidationArrayOffsets()[current_part] - this->getConsolidationArrayOffsets()[this->fine_level_id()],
-                                this->getConsolidationArrayOffsets()[current_part + 1] - this->getConsolidationArrayOffsets()[current_part] );
-                        cudaCheckError();
-                    }
-                }
-            }
-            else
-            {
-                this->getFineLevelComms()->recv_vector(temp, this->m_my_fine_level_destination_part, 30001 + this->fine_level_id());
-                cudaCheckError();
-            }
-
-            temp.resize(this->getConsolidationArrayOffsets()[this->fine_level_id() + 1] - this->getConsolidationArrayOffsets()[this->fine_level_id()]);
-            cudaCheckError();
-        }
-    }
-
-    // Copy on host
-    cudaMemcpy((value_type *)data_pinned, temp.raw(), temp.size() * sizeof(value_type), cudaMemcpyDefault);
-    cudaCheckError();
-}
-
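The glued branch above scatters slices of the reverted vector back to the children using the consolidation offsets: child p owns rows [offsets[p], offsets[p+1]), addressed relative to the root's own base. A minimal host-side sketch of that slice arithmetic (send_slice is a placeholder for comms->send_vector with an offset and a count):

#include <vector>

void scatter_back(const std::vector<double> &temp, const std::vector<int> &offsets,
                  int root, int nparts)
{
    for (int p = 0; p < nparts; ++p)
    {
        if (p == root) { continue; }

        int begin = offsets[p] - offsets[root];   // slice start relative to root's base
        int count = offsets[p + 1] - offsets[p];  // number of entries owned by child p
        // send_slice(p, &temp[begin], count);
        (void)begin; (void)count;
    }
}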
 template
 void DistributedManager >::transformAndUploadVector(VVector_v &v, const void *data, int n, int block_dim)
 {
@@ -5512,16 +3056,6 @@ void DistributedManager
-void DistributedManager >::transformAndUploadVectorWithCons(VVector_v &v, const void *data, int n, int block_dim)
-{
-    if (this->neighbors.size() > 0)
-    {
-        FatalError("Distributed solve only supported on devices", AMGX_ERR_NOT_IMPLEMENTED);
-    }
-}
-
-
 template
 void DistributedManager >::revertAndDownloadVector(VVector_v &v, const void *data, int n, int block_dim)
 {
@@ -5546,15 +3080,6 @@ void DistributedManager
-void DistributedManager >::revertAndDownloadVectorWithCons(VVector_v &v_in, const void *data, int n, int block_dim)
-{
-    if (this->neighbors.size() > 0)
-    {
-        FatalError("Distributed solve only supported on devices", AMGX_ERR_NOT_IMPLEMENTED);
-    }
-}
-
 template
 void DistributedManager >::reorder_matrix()
 {
@@ -5606,15 +3131,6 @@ void DistributedManager
-void DistributedManager >::createOneRingB2Lmaps()
-{
-    if (this->neighbors.size() > 0)
-    {
-        FatalError("Distributed solve only supported on devices", AMGX_ERR_NOT_IMPLEMENTED);
-    }
-}
-
 template
 void DistributedManager >::createOneRingHaloRows()
 {
@@ -5625,18 +3141,11 @@ void DistributedManager
-void DistributedManager >::replaceMatrixCoefficientsNoCons(int n, int nnz, const mat_value_type *data, const mat_value_type *diag)
-{
-    FatalError("Distributed solve only supported on devices", AMGX_ERR_NOT_IMPLEMENTED);
-}
-
-template
-void DistributedManager >::replaceMatrixCoefficientsWithCons(int n, int nnz, const mat_value_type *data, const mat_value_type *diag)
+void DistributedManager >::replaceMatrixCoefficients(int n, int nnz, const mat_value_type *data, const mat_value_type *diag)
 {
     FatalError("Distributed solve only supported on devices", AMGX_ERR_NOT_IMPLEMENTED);
 }
-
 template
 void DistributedManager >::createRenumbering(IVector &renumbering)
 {
@@ -5646,15 +3155,6 @@ void DistributedManager
-void DistributedManagerBase::remove_boundary(IVector_h &flagArray, IVector_h &B2L_map, int size)
-{
-    for (int i = 0; i < size; i++)
-    {
-        flagArray[B2L_map[i]] = 0;
-    }
-}
-
 template < class TConfig >
 void DistributedManagerBase::get_unassigned(IVector_h &flagArray, IVector_h &B2L_map, IVector_h &partition_flags, int size, int fa_size/*, int rank*/)
 {
@@ -5912,8 +3412,7 @@ void DistributedManagerBase::print(char *f, char *s, int trank)
     //miscellaneous info
     int64_t bi = m->base_index(); //inlined function
     int np = m->get_num_partitions(); //inlined function
-    int rp = (int)m->isRootPartition(); //cast from boolean to int
-    fprintf(fid, "gid=%d,bi=%ld,np=%d,rp=%d,ir=%d,in=%d,bn=%d\n", m->global_id(), bi, np, rp, m->index_range(), m->num_interior_nodes(), m->num_boundary_nodes());
+    fprintf(fid, "gid=%d,bi=%ld,np=%d,ir=%d,in=%d,bn=%d\n", m->global_id(), bi, np, m->index_range(), m->num_interior_nodes(), m->num_boundary_nodes());
     cudaDeviceSynchronize();
     cudaGetLastError();
@@ -6027,30 +3526,6 @@ DistributedManager< TemplateConfig >
 {
 }
-
-template
-void DistributedManagerBase::consolidateB2Lmaps(IVector_h_vector &dest_coarse_B2L_maps, IVector_h_vector &coarse_B2L_maps, IVector_h &fine_neigh_to_coarse_neigh, int num_coarse_neighbors, int num_fine_neighbors)
-{
-    consB2Lmaps(dest_coarse_B2L_maps, coarse_B2L_maps, fine_neigh_to_coarse_neigh, num_coarse_neighbors, num_fine_neighbors);
-}
-
-template
-void DistributedManagerBase::consolidateB2Lmaps(IVector_d_vector &dest_coarse_B2L_maps, IVector_d_vector &coarse_B2L_maps, IVector_h &fine_neigh_to_coarse_neigh, int num_coarse_neighbors, int num_fine_neighbors)
-{
-    consB2Lmaps(dest_coarse_B2L_maps, coarse_B2L_maps, fine_neigh_to_coarse_neigh, num_coarse_neighbors, num_fine_neighbors);
-}
-
-template
-void DistributedManagerBase::consolidateB2LmapsOnRoot(int &num_consolidated_neighbors, IVector_h_vector &consolidated_B2L_maps, IVector_h &consolidated_coarse_ids, IVector_h_vector &dest_coarse_B2L_maps, IVector_h &coarse_neigh_to_fine_part, IVector_h &num_bdy_per_coarse_neigh, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int my_id, int my_destination_part, bool is_root_partition, int num_coarse_neighbors, DistributedComms *comms)
-{
-    consB2LmapsOnRoot(num_consolidated_neighbors, consolidated_B2L_maps, consolidated_coarse_ids, dest_coarse_B2L_maps, coarse_neigh_to_fine_part, num_bdy_per_coarse_neigh, fine_parts_to_consolidate, num_fine_parts_to_consolidate, my_id, my_destination_part, is_root_partition, num_coarse_neighbors, comms);
-}
-
-template
-void DistributedManagerBase::consolidateB2LmapsOnRoot(int &num_consolidated_neighbors, IVector_d_vector &consolidated_B2L_maps, IVector_h &consolidated_coarse_ids, IVector_d_vector &dest_coarse_B2L_maps, IVector_h &coarse_neigh_to_fine_part, IVector_h &num_bdy_per_coarse_neigh, IVector_h &fine_parts_to_consolidate, int num_fine_parts_to_consolidate, int my_id, int my_destination_part, bool is_root_partition, int num_coarse_neighbors, DistributedComms *comms)
-{
-    consB2LmapsOnRoot(num_consolidated_neighbors, consolidated_B2L_maps, consolidated_coarse_ids, dest_coarse_B2L_maps, coarse_neigh_to_fine_part, num_bdy_per_coarse_neigh, fine_parts_to_consolidate, num_fine_parts_to_consolidate, my_id, my_destination_part, is_root_partition, num_coarse_neighbors, comms);
-}
-
 /****************************************
  * Explict instantiations
  ***************************************/
diff --git a/src/eigensolvers/eigensolver.cu b/src/eigensolvers/eigensolver.cu
index 3b2a44e4..1eb0c45a 100644
--- a/src/eigensolvers/eigensolver.cu
+++ b/src/eigensolvers/eigensolver.cu
@@ -156,21 +156,6 @@ void EigenSolver::setup(Operator &A)
     m_setup_time *= 1e-3f;
 }
 
-template
-void EigenSolver::exchangeSolveResultsConsolidation(AMGX_STATUS &status)
-{
-    std::vector m_res_history;
-    PODVector_h res(1);
-
-    for (int i = 0; i < m_residuals.size(); i++)
-    {
-        res[0] = m_residuals[i];
-        m_res_history.push_back(res);
-    }
-
-    this->m_A->getManager()->exchangeSolveResultsConsolidation(m_num_iters, m_res_history, status, true /*looks like we always store residual history*/);
-}
-
 template
 AMGX_ERROR EigenSolver::solve_no_throw(VVector &x, AMGX_STATUS &status)
 {
@@ -178,23 +163,12 @@ AMGX_ERROR EigenSolver::solve_no_throw(VVector &x, AMGX_STATUS &status)
 
     try
     {
-        // Check if fine level is consolidated and not a root partition
-        if ( !(this->m_A->getManager() != NULL && this->m_A->getManager()->isFineLevelConsolidated() && !this->m_A->getManager()->isFineLevelRootPartition() ))
+        if (x.tag == -1)
         {
-            // If matrix is consolidated on fine level and not a root partition
-            if (x.tag == -1)
-            {
-                x.tag = 4242 * 100 + 1;
-            }
-
-            status = this->solve(x);
+            x.tag = 4242 * 100 + 1;
         }
 
-        // Exchange residual history, number of iterations, solve status if fine level consoildation was used
-        if (this->m_A->getManager() != NULL && this->m_A->getManager()->isFineLevelConsolidated())
-        {
-            this->exchangeSolveResultsConsolidation(status);
-        }
+        status = this->solve(x);
     }
 
     AMGX_CATCHES(rc)
diff --git a/src/resources.cu b/src/resources.cu
index a9ce93f2..af9f490a 100644
--- a/src/resources.cu
+++ b/src/resources.cu
@@ -82,7 +82,6 @@ Resources::Resources() : m_cfg_self(true), m_root_pool_expanded(false), m_tmng(n
     m_cfg->getParameter("solver", solver_value, "default", solver_scope);
     m_cfg->getParameter("device_mem_pool_size", m_pool_size, "default", solver_scope);
     m_cfg->getParameter("device_mem_pool_size_limit", m_pool_size_limit, "default", solver_scope);
-    m_cfg->getParameter("device_consolidation_pool_size", m_root_pool_size, "default", solver_scope);
     m_cfg->getParameter("device_mem_pool_max_alloc_size", m_max_alloc_size, "default", solver_scope);
     m_cfg->getParameter("device_alloc_scaling_factor", m_scaling_factor, "default", solver_scope);
     m_cfg->getParameter("device_alloc_scaling_threshold", m_scaling_threshold, "default", solver_scope);
@@ -114,7 +113,6 @@ Resources::Resources(AMG_Configuration *cfg, void *comm, int device_num, const i
     m_cfg->getParameter("solver", solver_value, "default", solver_scope);
     m_cfg->getParameter("device_mem_pool_size", m_pool_size, "default", solver_scope);
     m_cfg->getParameter("device_mem_pool_size_limit", m_pool_size_limit, "default", solver_scope);
-    m_cfg->getParameter("device_consolidation_pool_size", m_root_pool_size, "default", solver_scope);
     m_cfg->getParameter("device_mem_pool_max_alloc_size", m_max_alloc_size, "default", solver_scope);
     m_cfg->getParameter("device_alloc_scaling_factor", m_scaling_factor, "default", solver_scope);
     m_cfg->getParameter("device_alloc_scaling_threshold", m_scaling_threshold, "default", solver_scope);
diff --git a/src/solvers/solver.cu b/src/solvers/solver.cu
index 6e62bcce..09102e84 100644
--- a/src/solvers/solver.cu
+++ b/src/solvers/solver.cu
@@ -34,14 +34,9 @@
 #include
 #include
 #include
-#include
 
 #include "amgx_types/util.h"
 
-#ifdef AMGX_USE_VAMPIR_TRACE
-#include
-#endif
-
 namespace amgx
 {
@@ -50,7 +45,7 @@ Solver::Solver(AMG_Config &cfg, const std::string &cfg_scope,
     ThreadManager *tmng) : m_cfg(&cfg), m_cfg_scope(cfg_scope), m_is_solver_setup(false), m_A(NULL),
     m_r(NULL), m_num_iters(0), m_curr_iter(0), m_ref_count(1), tag(0),
-    m_solver_name("SolverNameNotSet"), m_skip_glued_setup(false), m_tmng(tmng)
+    m_solver_name("SolverNameNotSet"), m_tmng(tmng)
 {
     m_norm_factor = types::util::get_one();
     m_verbosity_level = cfg.getParameter("verbosity_level", cfg_scope);
@@ -305,12 +300,6 @@ bool Solver::converged(PODVector_h &nrm) const
     return m_convergence->convergence_update_and_check(nrm, m_nrm_ini);
 }
 
-template
-void Solver::exchangeSolveResultsConsolidation(AMGX_STATUS &status)
-{
-    this->get_A().getManager()->exchangeSolveResultsConsolidation(m_num_iters, m_res_history, status, m_store_res_history == 1);
-}
-
 template
 const typename Solver::PODVector_h &Solver::get_residual( int idx) const
 {
@@ -404,21 +393,6 @@ void Solver::setup( Operator &A, bool reuse_matrix_structure)
         FatalError("Cannot call resetup with a different matrix", AMGX_ERR_UNKNOWN);
 
-#ifdef AMGX_WITH_MPI
-    // skiping setup for glued matrices
-    // this->level was set to -999 in amg_level.cu because it is empty
-    // block jacobi fails to find diagonal if the matrix is empty
-    if (B.manager != NULL)
-    {
-        if (this->m_skip_glued_setup)
-        {
-            this->set_A(A);
-            m_is_solver_setup = true;
-            return;
-        }
-    }
-#endif
-
     // Color the matrix, set the block format, reorder columns if necessary
     if (!B.is_matrix_setup())
     {
@@ -564,22 +538,14 @@ AMGX_ERROR Solver::setup_no_throw(Operator &A,
 
     try
     {
-        if ( (A.getManager() != NULL && A.getManager()->isFineLevelConsolidated() && !A.getManager()->isFineLevelRootPartition() ))
+        // Matrix values have changed, so need to repermute values, color the matrix if necessary
+        if (Matrix *B = dynamic_cast*>(&A))
         {
-            this->set_A(A);
-            // Do nothing else since this partition shouldn't participate
+            B->set_is_matrix_setup(false);
         }
-        else
-        {
-            // Matrix values have changed, so need to repermute values, color the matrix if necessary
-            if (Matrix *B = dynamic_cast*>(&A))
-            {
-                B->set_is_matrix_setup(false);
-            }
-            // Setup the solver
-            this->setup(A, reuse_matrix_structure);
-        }
+        // Setup the solver
+        this->setup(A, reuse_matrix_structure);
     }
 
     AMGX_CATCHES(rc)
@@ -605,42 +571,6 @@ AMGX_STATUS Solver::solve(Vector &b, Vector &x,
         FatalError("Block sizes do not match", AMGX_ERR_BAD_PARAMETERS);
     }
 
-    // --- Gluing path for vectors ---
-#ifdef AMGX_WITH_MPI
-    Matrix *nv_mtx_ptr = dynamic_cast*>(m_A);
-
-    if (nv_mtx_ptr)
-    {
-        if (nv_mtx_ptr->manager != NULL )
-        {
-            if (nv_mtx_ptr->manager->isGlued() && nv_mtx_ptr->manager->getDestinationPartitions().size() != 0 && nv_mtx_ptr->amg_level_index == 0)
-            {
-                MPI_Comm comm, temp_com;
-                comm = nv_mtx_ptr->manager->getComms()->get_mpi_comm();
-                // Compute the temporary splited communicator to glue vectors
-                temp_com = compute_glue_matrices_communicator(*nv_mtx_ptr);
-                int usz = nv_mtx_ptr->manager->halo_offsets_before_glue[0];
-                glue_vector(*nv_mtx_ptr, comm, b, temp_com);
-                b.set_unconsolidated_size(usz);
-                b.getManager()->setIsFineLevelGlued(true);
-                MPI_Barrier(MPI_COMM_WORLD);
-                glue_vector(*nv_mtx_ptr, comm, x, temp_com);
-                x.set_unconsolidated_size(usz);
-                x.getManager()->setIsFineLevelGlued(true);
-                MPI_Barrier(MPI_COMM_WORLD);
-                //Make sure we will not glue the vectors twice on the finest level
-                nv_mtx_ptr->manager->setIsGlued(false);
-            }
-            else
-            {
-                nv_mtx_ptr->manager->setIsGlued(false);
-            }
-        }
-    }
-
-#endif
-    // -- end of gluing path modifications --
-
     if (b.tag == -1 || x.tag == -1)
     {
         b.tag = this->tag * 100 + 0;
@@ -657,10 +587,6 @@ AMGX_STATUS Solver::solve(Vector &b, Vector &x,
         MPI_Barrier(MPI_COMM_WORLD);
     }
 
-#ifdef AMGX_USE_VAMPIR_TRACE
-    int tag = VT_User_marker_def__("Solver_Start", VT_MARKER_TYPE_HINT);
-    VT_User_marker__(tag, "Solver Start");
-#endif
     cudaDeviceSynchronize();
 #endif
 #endif
@@ -977,24 +903,13 @@ AMGX_ERROR Solver::solve_no_throw(VVector &b, VVector &x,
 
     try
     {
-        // Check if fine level is consolidated and not a root partition
-        if ( !(this->get_A().getManager() != NULL && this->get_A().getManager()->isFineLevelConsolidated() && !this->get_A().getManager()->isFineLevelRootPartition() ))
+        if (b.tag == -1 || x.tag == -1)
         {
-            // If matrix is consolidated on fine level and not a root partition
-            if (b.tag == -1 || x.tag == -1)
-            {
-                b.tag = this->tag * 100 + 0;
-                x.tag = this->tag * 100 + 1;
-            }
-
-            status = this->solve(b, x, xIsZero);
+            b.tag = this->tag * 100 + 0;
+            x.tag = this->tag * 100 + 1;
         }
 
-        // Exchange residual history, number of iterations, solve status if fine level consoildation was used
-        if (this->get_A().getManager() != NULL && this->get_A().getManager()->isFineLevelConsolidated())
-        {
-            this->exchangeSolveResultsConsolidation(status);
-        }
+        status = this->solve(b, x, xIsZero);
     }
 
     AMGX_CATCHES(rc)