From ee6772080e310f8425a90a8395732f1c67c4d672 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Fri, 3 Jun 2022 16:13:39 +0200
Subject: [PATCH 01/17] AMGCL smoother and coearsener

read v-cycle hierarchy  matrices
A_ ,R_ (matrix market format ) and vectors A_diag
and use them instead of them (instead of r-b-Gauss-Siedel) as
smoother (A_diag)  and coarsener (R_ and P_=R_^T).
Continue with CG-solver where v-cycle is the preconditioning step.
---
 include/graphblas/algorithms/hpcg/hpcg.hpp    |  28 ++-
 .../graphblas/algorithms/hpcg/hpcg_data.hpp   |   2 +-
 .../algorithms/hpcg/matrix_building_utils.hpp |   2 +
 .../algorithms/hpcg/multigrid_v_cycle.hpp     |  95 ++++++++--
 .../algorithms/hpcg/system_building_utils.hpp | 173 +++++++++++++-----
 tests/smoke/hpcg.cpp                          | 137 +++++++++++---
 6 files changed, 340 insertions(+), 97 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index 6caf22a1c..01c00a50d 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -31,10 +31,14 @@
 #include "hpcg_data.hpp"
 #include "multigrid_v_cycle.hpp"
 
+#include <utils/print_vec_mat.hpp>
 
 namespace grb {
 	namespace algorithms {
 
+
+
+		
 		/**
 		 * @brief High-Performance Conjugate Gradient algorithm implementation running entirely on GraphBLAS.
 		 *
@@ -140,9 +144,9 @@ namespace grb {
 			size_t iter { 0 };
 
 #ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( p, "start p" );
-			DBG_print_norm( Ap, "start Ap" );
-			DBG_print_norm( r, "start r" );
+			print_norm( p, "start p" );
+			print_norm( Ap, "start Ap" );
+			print_norm( r, "start r" );
 #endif
 
 			do {
@@ -150,14 +154,16 @@ namespace grb {
 				DBG_println( "========= iteration " << iter << " =========" );
 #endif
 				if( with_preconditioning ) {
+					//ret = ret ? ret : grb::set( z, r ); // z = r;
 					ret = ret ? ret : internal::multi_grid( data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
+					//ret = ret ? ret : grb::set( x, z );
 					assert( ret == SUCCESS );
 				} else {
 					ret = ret ? ret : grb::set( z, r ); // z = r;
 					assert( ret == SUCCESS );
 				}
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( z, "initial z" );
+				print_norm( z, "initial z" );
 #endif
 
 				ResidualType pAp;
@@ -182,14 +188,14 @@ namespace grb {
 					assert( ret == SUCCESS );
 				}
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( p, "middle p" );
+				print_norm( p, "middle p" );
 #endif
-
 				ret = ret ? ret : grb::set( Ap, 0 );
 				ret = ret ? ret : grb::mxv( Ap, A, p, ring ); // Ap = A * p;
+
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( Ap, "middle Ap" );
+				print_norm( Ap, "middle Ap" );
 #endif
 				pAp = static_cast< ResidualType >( 0.0 );
 				ret = ret ? ret : grb::dot( pAp, Ap, p, ring ); // pAp = p' * Ap
@@ -200,13 +206,12 @@ namespace grb {
 				ret = ret ? ret : grb::eWiseMul( x, alpha, p, ring ); // x += alpha * p;
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( x, "end x" );
+				print_norm( x, "end x" );
 #endif
-
 				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, ring ); // r += - alpha * Ap;
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( r, "end r" );
+				print_norm( r, "end r" );
 #endif
 
 				norm_residual = static_cast< ResidualType >( 0.0 );
@@ -215,6 +220,9 @@ namespace grb {
 
 				norm_residual = std::sqrt( norm_residual );
 
+#ifdef HPCG_PRINT_STEPS				
+				std::cout << " ---> norm_residual=" << norm_residual << "\n";
+#endif
 				++iter;
 			} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
 
diff --git a/include/graphblas/algorithms/hpcg/hpcg_data.hpp b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
index 96b39856d..d2df78e9d 100644
--- a/include/graphblas/algorithms/hpcg/hpcg_data.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
@@ -38,7 +38,7 @@ namespace grb {
 		/**
 		 * @brief basic data container for the HPCG algorithm, storing \b only the
 		 * data in common between the full CG run and the V-cycle multi-grid solver.
-		 * Additional data are stored in inheriting daata structures.
+		 * Additional data are stored in inheriting data structures.
 		 *
 		 * @tparam IOType type of values of the vectors for intermediate results
 		 * @tparam NonzeroType type of the values stored inside the system matrix #A
diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
index 1facabe49..a291ad9e5 100644
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
@@ -69,6 +69,7 @@ namespace grb {
 			}
 			grb::algorithms::matrix_generator_iterator< DIMS, T > begin( sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
 			grb::algorithms::matrix_generator_iterator< DIMS, T > end( sys_sizes, n, halo_size, diag_value, non_diag_value );
+
 			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
 		}
 
@@ -115,6 +116,7 @@ namespace grb {
 
 			grb::algorithms::coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, 0 );
 			grb::algorithms::coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, rows );
+			
 			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
 		}
 
diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
index f40296f91..f4cd55bdc 100644
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
@@ -34,14 +34,23 @@
 #include "hpcg_data.hpp"
 #include "red_black_gauss_seidel.hpp"
 
+#include <utils/print_vec_mat.hpp>
+
+
+
+#define DBG_println( args ) std::cout << args << std::endl;
+
 
 namespace grb {
 	namespace algorithms {
+
+		
 		/**
 		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
 		 */
 		namespace internal {
 
+			
 			/**
 			 * @brief computes the coarser residual vector \p coarsening_data.r by coarsening
 			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
@@ -116,6 +125,18 @@ namespace grb {
 				return ret;
 			}
 
+			template< typename IOType, typename NonzeroType, class Ring >
+			grb::RC spai0_smoother( system_data< IOType, NonzeroType > & data, const Ring & ring ) {
+				RC ret { SUCCESS };
+
+				NonzeroType alpha = 1.;
+				ret = ret ? ret : grb::eWiseMulAdd( data.z, alpha, data.A_diagonal, data.r, ring );
+				assert( ret == SUCCESS );
+					
+				return ret;
+			}
+			
+
 			/**
 			 * @brief Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother, with inputs and outputs stored
 			 * inside \p data.
@@ -131,14 +152,47 @@ namespace grb {
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC run_smoother( system_data< IOType, NonzeroType > & data, const std::size_t smoother_steps, const Ring & ring ) {
+			template< typename IOType, typename NonzeroType, class Ring, class Minus >
+			grb::RC run_spai0_smoother( system_data< IOType, NonzeroType > & data,
+								  const std::size_t smoother_steps, const Ring & ring, const Minus & minus ) {
 				RC ret { SUCCESS };
 
 				for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
-					ret = ret ? ret : red_black_gauss_seidel( data, ring );
+					ret = ret ? ret : grb::set( data.smoother_temp, 0 );
+					ret = ret ? ret : grb::mxv( data.smoother_temp, data.A, data.z, ring );
+					ret = ret ? ret : grb::eWiseApply( data.smoother_temp, data.r, data.smoother_temp,
+													   minus ); 
+
+#ifdef HPCG_PRINT_STEPS
+					print_norm( data.A_diagonal, " data.A_diagonal" );
+					print_norm( data.smoother_temp, " data.smoother_temp" );
+					print_norm( data.z, " data.z" );
+#endif
+
+					ret = ret ? ret :
+						grb::eWiseLambda(
+										 [ &data ]( const size_t i ) {
+#ifdef HPCG_PRINT_STEPS											 
+											 if( i < 100 ){
+												 std::cout << " i= " << i
+														   << " data.z[i]= " << data.z[i]
+														   << " data.A_diagonal[i]= " << data.A_diagonal[i]
+														   << " data.smoother_temp[i]= " << data.smoother_temp[i]
+														   << "\n";
+											 }
+#endif											 
+											 data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
+										 },
+										 data.A_diagonal, data.z, data.smoother_temp );
+					
 					assert( ret == SUCCESS );
 				}
+				
+				
+				// for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
+				// 	ret = ret ? ret : red_black_gauss_seidel( data, ring );
+				// 	assert( ret == SUCCESS );
+				// }
 				return ret;
 			}
 
@@ -184,21 +238,25 @@ namespace grb {
 				const Ring & ring,
 				const Minus & minus ) {
 				RC ret { SUCCESS };
+
 #ifdef HPCG_PRINT_STEPS
 				DBG_println( "mg BEGINNING {" );
 #endif
 
 				// clean destination vector
+				//ret = ret ? ret : grb::set( data.z, data.r );
 				ret = ret ? ret : grb::set( data.z, 0 );
+				
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.r, "initial r" );
+				print_norm( data.z, "first print smoothed z" );
+				print_norm( data.r, "initial r" );
 #endif
 				if( coarsening_data == nullptr ) {
-					// compute one round of Gauss Seidel and return
-					ret = ret ? ret : run_smoother( data, 1, ring );
+					//compute one round of smoother
+					ret = ret ? ret : run_spai0_smoother( data, 1, ring, minus );
 					assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-					DBG_print_norm( data.z, "smoothed z" );
+					print_norm( data.z, "smoothed z" );
 					DBG_println( "} mg END" );
 #endif
 					return ret;
@@ -209,36 +267,45 @@ namespace grb {
 				};
 
 				// pre-smoother
-				ret = ret ? ret : run_smoother( data, presmoother_steps, ring );
+				ret = ret ? ret : run_spai0_smoother( data, presmoother_steps, ring, minus );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "pre-smoothed z" );
+				print_norm( data.z, "pre-smoothed z" );
 #endif
 
 				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
 				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
 				assert( ret == SUCCESS );
 
+#ifdef HPCG_PRINT_STEPS
+				print_norm( cd.r, "before coarse cd.r" );
+#endif				
+
 				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
 				assert( ret == SUCCESS );
+
+				
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( cd.r, "coarse r" );
+				print_norm( cd.z, "after cd.coarse z" );				
+				print_norm( cd.r, "after cd.coarse r" );
 #endif
 
-				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
+				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );			
 				assert( ret == SUCCESS );
 
 				ret = ret ? ret : compute_prolongation( data.z, cd, ring );
 				assert( ret == SUCCESS );
+				
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "prolonged z" );
+				print_norm( data.z, "prolonged z" );
 #endif
 
 				// post-smoother
-				ret = ret ? ret : run_smoother( data, postsmoother_steps, ring );
+				ret = ret ? ret : run_spai0_smoother( data, postsmoother_steps, ring, minus );
 				assert( ret == SUCCESS );
+				
 #ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "post-smoothed z" );
+				print_norm( data.z, "post-smoothed z" );
 				DBG_println( "} mg END" );
 #endif
 
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 11adf82c1..8f9fe13a2 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -1,4 +1,3 @@
-
 /*
  *   Copyright 2021 Huawei Technologies Co., Ltd.
  *
@@ -31,6 +30,7 @@
 #include <memory>
 
 #include <graphblas.hpp>
+#include <graphblas/utils/parser.hpp>
 
 #include "hpcg_data.hpp"
 #include "matrix_building_utils.hpp"
@@ -80,70 +80,149 @@ namespace grb {
 		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors and matrices) succeeded,
 		 * otherwise the first unsuccessful return value
 		 */
-		template< std::size_t DIMS, typename T = double >
-		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, hpcg_system_params< DIMS, T > & params ) {
-			// n is the system matrix size
-			const std::size_t n { std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-
-			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n ) };
-
-			assert( ! holder ); // should be empty
-			holder = std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > >( data );
-
-			// initialize the main (=uncoarsened) system matrix
+		template< std::size_t DIMS, typename T = double, typename SYSINP >
+		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, hpcg_system_params< DIMS, T > & params, SYSINP &in ) {
 			grb::RC rc { grb::SUCCESS };
-			rc = build_ndims_system_matrix< DIMS, T >( data->A, params.physical_sys_sizes, params.halo_size, params.diag_value, params.non_diag_value );
-
-			if( rc != grb::SUCCESS ) {
-				std::cerr << "Failure to generate the initial system (" << toString( rc ) << ") of size " << n << std::endl;
+			std::size_t coarsening_level = 0UL;
+			grb::utils::MatrixFileReader< T, std::conditional<
+				( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+												 grb::config::RowIndexType,
+												 grb::config::ColIndexType >::type
+										  > parser_A( in.matAfiles[coarsening_level].c_str(), true );
+			const size_t n_A = parser_A.n();
+			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n_A ) };
+			rc = buildMatrixUnique( data->A,
+									parser_A.begin( SEQUENTIAL ), parser_A.end( SEQUENTIAL),
+									SEQUENTIAL
+									);
+			/* Once internal issue #342 is resolved this can be re-enabled
+			   const RC rc = buildMatrixUnique( L,
+			   parser.begin( PARALLEL ), parser.end( PARALLEL),
+			   PARALLEL
+			   );*/
+			if( rc != SUCCESS ) {
+				std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+						  << "(" << toString( rc ) << ")." << std::endl;
 				return rc;
 			}
+			
+			assert( ! holder ); // should be empty
+			holder = std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > >( data );
 
-			// set values of diagonal vector
-			set( data->A_diagonal, params.diag_value );
-
-			build_static_color_masks( data->color_masks, n, params.num_colors );
+			{
+				T *buffer;
+				std::ifstream inFile;
+				inFile.open(in.matMfiles[ coarsening_level ]);
+				if (inFile.is_open())  	{
+					size_t n=n_A;
+					buffer = new T [ n ];
+					for (size_t i = 0; i < n; i++) {
+						inFile >> buffer[i];
+					}
+						
+					inFile.close(); // CLose input file
+				
+					RC rc = grb::buildVector( data->A_diagonal, buffer, buffer + n, SEQUENTIAL );
+					if ( rc != SUCCESS ) {
+						std::cerr << " buildVector failed!\n ";
+						return rc;
+					}
+					delete [] buffer;
+				}
+				/* Once internal issue #342 is resolved this can be re-enabled
+				   const RC rc = buildMatrixUnique( L,
+				   parser.begin( PARALLEL ), parser.end( PARALLEL),
+				   PARALLEL
+				   );*/
+			}
 
+			std::size_t coarser_size;
+			std::size_t previous_size=n_A;
+			
 			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
 			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
 			assert( *coarser == nullptr );
-			std::array< std::size_t, DIMS > coarser_sizes;
-			std::array< std::size_t, DIMS > previous_sizes( params.physical_sys_sizes );
-			std::size_t min_physical_coarsened_size { *std::min_element( previous_sizes.cbegin(), previous_sizes.cend() ) / params.coarsening_step };
-			// coarsen system sizes into coarser_sizes
-			divide_array( coarser_sizes, previous_sizes, params.coarsening_step );
-			std::size_t coarsening_level = 0UL;
-
+			
 			// generate linked list of hierarchical coarseners
-			while( min_physical_coarsened_size >= params.min_phys_size && coarsening_level < params.max_levels ) {
+			while( coarsening_level  < params.max_levels ) {
 				assert( *coarser == nullptr );
-				// compute size of finer and coarser matrices
-				std::size_t coarser_size { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-				std::size_t previous_size { std::accumulate( previous_sizes.cbegin(), previous_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
+				
+				grb::utils::MatrixFileReader< T, std::conditional<
+					( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+													 grb::config::RowIndexType,
+													 grb::config::ColIndexType >::type
+											  > parser_A_next( in.matAfiles[ coarsening_level + 1 ].c_str(), true );
+				
+				coarser_size = parser_A_next.n();
+
 				// build data structures for new level
 				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
+				
 				// install coarser level immediately to cleanup in case of build error
 				*coarser = new_coarser;
 				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
-				rc = build_ndims_coarsener_matrix< DIMS >( new_coarser->coarsening_matrix, coarser_sizes, previous_sizes );
-				if( rc != grb::SUCCESS ) {
-					std::cerr << "Failure to generate coarsening matrix (" << toString( rc ) << ")." << std::endl;
-					return rc;
-				}
-				rc = build_ndims_system_matrix< DIMS, T >( new_coarser->A, coarser_sizes, params.halo_size, params.diag_value, params.non_diag_value );
-				if( rc != grb::SUCCESS ) {
-					std::cerr << "Failure to generate system matrix (" << toString( rc ) << ")for size " << coarser_size << std::endl;
-					return rc;
-				}
-				set( new_coarser->A_diagonal, params.diag_value );
-				// build color masks for coarser level (same masks, but with coarser system size)
-				rc = build_static_color_masks( new_coarser->color_masks, coarser_size, params.num_colors );
 
+				
+				{
+					grb::utils::MatrixFileReader< T, std::conditional<
+						( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+														 grb::config::RowIndexType,
+														 grb::config::ColIndexType >::type
+												  > parser_P( in.matRfiles[coarsening_level].c_str(), true );
+					
+					rc = buildMatrixUnique( new_coarser->coarsening_matrix,
+											parser_P.begin( SEQUENTIAL ), parser_P.end( SEQUENTIAL),
+											SEQUENTIAL
+											);
+
+					if( rc != SUCCESS ) {
+						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+								  << "(" << toString( rc ) << ")." << std::endl;
+						return rc;
+					}
+					
+				}
+				{
+					rc = buildMatrixUnique( new_coarser->A,
+											parser_A_next.begin( SEQUENTIAL ), parser_A_next.end( SEQUENTIAL),
+											SEQUENTIAL
+											);
+
+					if( rc != SUCCESS ) {
+						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+								  << "(" << toString( rc ) << ")." << std::endl;
+						return rc;
+					}					
+					
+				}				
+
+				//set( new_coarser->A_diagonal, params.diag_value );
+				if( coarsening_level + 1 < params.max_levels ) {
+					T *buffer;
+					std::ifstream inFile;
+					inFile.open(in.matMfiles[ coarsening_level + 1 ]);
+					if (inFile.is_open())  	{
+						size_t n=grb::nrows(new_coarser->A);
+						buffer = new T [ n ];
+						for (size_t i = 0; i < n; i++) {
+							inFile >> buffer[i];
+						}
+						
+						inFile.close(); // CLose input file
+				
+						RC rc = grb::buildVector( new_coarser->A_diagonal, buffer, buffer + n, SEQUENTIAL );
+						if ( rc != SUCCESS ) {
+							std::cerr << " buildVector failed!\n ";
+							return rc;
+						}
+						delete [] buffer;
+					}
+
+				}				
+				
 				// prepare for new iteration
 				coarser = &new_coarser->coarser_level;
-				min_physical_coarsened_size /= params.coarsening_step;
-				previous_sizes = coarser_sizes;
-				divide_array( coarser_sizes, coarser_sizes, params.coarsening_step );
+				previous_size = coarser_size;
 				coarsening_level++;
 			}
 			return rc;
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index d84c157e0..caf47cb92 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -33,12 +33,17 @@
 #include <iostream>
 #include <memory>
 #include <type_traits>
+#include <string>
 
 #include <graphblas.hpp>
-#include <graphblas/algorithms/hpcg/hpcg.hpp>
-#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
 
-// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
+
+// forward declaration for the tracing facility
+template< typename T,
+	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
+>
+void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
+
 #ifdef HPCG_PRINT_STEPS
 
 // HPCG_PRINT_STEPS requires defining the following symbols
@@ -48,11 +53,6 @@
  */
 #define DBG_println( args ) std::cout << args << std::endl;
 
-// forward declaration for the tracing facility
-template< typename T,
-	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
->
-void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
 
 /**
  * @brief prints \p head and the norm of \p r.
@@ -60,6 +60,13 @@ void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring
 #define DBG_print_norm( vec, head ) print_norm( vec, head )
 #endif
 
+
+#include <graphblas/algorithms/hpcg/hpcg.hpp>
+#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
+
+// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
+
+
 #include <graphblas/utils/Timer.hpp>
 
 #include <utils/argument_parser.hpp>
@@ -104,13 +111,25 @@ struct system_input {
  * @brief Container for the parameters for the HPCG simulation.
  */
 struct simulation_input : public system_input {
-	size_t test_repetitions;
-	size_t max_iterations;
-	size_t smoother_steps;
+	std::size_t test_repetitions;
+	std::size_t max_iterations;
+	std::size_t smoother_steps;
+	const char * matAfile_c_str;
+	std::vector< std::string > matAfiles;
+	std::vector< std::string > matMfiles;
+	std::vector< std::string > matPfiles;
+	std::vector< std::string > matRfiles;
 	bool evaluation_run;
 	bool no_preconditioning;
 };
 
+struct preloaded_matrices {
+	std::vector< std::string > matAfiles;
+	std::vector< std::string > matMfiles;
+	std::vector< std::string > matPfiles;
+	std::vector< std::string > matRfiles;
+};
+
 /**
  * @brief Containers for test outputs.
  */
@@ -150,15 +169,17 @@ T static next_pow_2( T n ) {
  * @brief Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
  * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
  */
-static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in ) {
-	const std::array< size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
+static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in, preloaded_matrices & inMats ) {
+	const std::array< std::size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
 	struct hpcg_system_params< 3, double > params {
 		physical_sys_sizes, HALO_RADIUS, BAND_WIDTH_3D * 2 + 1, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
 	};
 
-	return build_hpcg_system< 3, double >( holder, params );
+	return build_hpcg_system< 3, double >( holder, params, inMats );
 }
 
+
+
 #ifdef HPCG_PRINT_SYSTEM
 static void print_system( const hpcg_data< double, double, double > & data ) {
 	print_matrix( data.A, 70, "A" );
@@ -171,12 +192,12 @@ static void print_system( const hpcg_data< double, double, double > & data ) {
 }
 #endif
 
-#ifdef HPCG_PRINT_STEPS
+//#ifdef HPCG_PRINT_STEPS
 template< typename T,
 		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
 	>
 void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm;
+	T norm=0;
 	RC ret = grb::dot( norm, r, r, ring ); // residual = r' * r;
 	(void)ret;
 	assert( ret == SUCCESS );
@@ -186,13 +207,14 @@ void print_norm( const grb::Vector< T > & r, const char * head, const Ring & rin
 	}
 	std::cout << norm << std::endl;
 }
-#endif
+//#endif
 
 /**
  * @brief Main test, building an HPCG problem and running the simulation closely following the
  * parameters in the reference HPCG test.
  */
 void grbProgram( const simulation_input & in, struct output & out ) {
+	
 	// get user process ID
 	assert( spmd<>::pid() < spmd<>::nprocs() );
 	grb::utils::Timer timer;
@@ -202,9 +224,17 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.error_code = SUCCESS;
 	RC rc { SUCCESS };
 
+	preloaded_matrices inputData;
+	inputData.matAfiles = in.matAfiles;
+	inputData.matMfiles = in.matMfiles;
+	inputData.matPfiles = in.matPfiles;
+	inputData.matRfiles = in.matRfiles;	
+	
+	
+
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
-	rc = build_3d_system( hpcg_state, in );
+	rc = build_3d_system( hpcg_state, in, inputData );
 
 	if( rc != SUCCESS ) {
 		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
@@ -229,8 +259,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {
-		print_vector( x, 50, "X" );
-		print_vector( b, 50, "B" );
+		print_vector( x, 50, " ---> X(1)" );
+		print_vector( b, 50, " ---> B(1)" );
 	}
 #endif
 
@@ -254,6 +284,11 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 			rc = set( x, 0.0 );
 			assert( rc == SUCCESS );
 			rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+			std::cout << " ---> out.residual="<< out.residual << "\n";
+
+			rc = rc ? rc : grb::eWiseLambda( [ &x ]( const size_t i ) { x[ i ] -= 1;}, x );
+			print_norm (x, " ---> norm(x)");
+			
 			out.test_repetitions++;
 			if( rc != SUCCESS ) {
 				break;
@@ -264,6 +299,13 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		// sleep( 1 );
 	}
 
+#ifdef HPCG_PRINT_SYSTEM
+	if( spmd<>::pid() == 0 ) {
+		print_vector( x, 50, " ---> X(check out)" );
+		print_vector( b, 50, " ---> B(check out)" );
+	}
+#endif
+
 	if( spmd<>::pid() == 0 ) {
 		if( rc == SUCCESS ) {
 			if( in.evaluation_run ) {
@@ -368,13 +410,15 @@ int main( int argc, char ** argv ) {
 static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations, double & max_residual_norm, int argc, char ** argv ) {
 
 	argument_parser parser;
-	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
+	parser
+		.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
+								"maximum level for coarsening; 0 means no coarsening; note: actual "
+								"level may be limited by the minimum system dimension" )
+		.add_optional_argument( "--mat_files_pattern", sim_in.matAfile_c_str, "","file pattern for files contining matrices A, M_diag, P, R "
+		 						"i.e. '--mat_a_file_names /path/to/dir/level_  --max_coarse-levels 2' will read /path/to/dir/level_0_A.mtx,  /path/to/dir/level_1_A.mtx, /path/to/dir/level_2_A.mtx ... " )
+		.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
 		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
 		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
-		.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
-			"maximum level for coarsening; 0 means no coarsening; note: actual "
-			"level may be limited"
-			" by the minimum system dimension" )
 		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
 		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
 		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of HPCG iterations" )
@@ -386,9 +430,52 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 			"launch single run directly, without benchmarker (ignore "
 			"repetitions)" )
 		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" );
+	
 
 	parser.parse( argc, argv );
 
+	std::string matAfile=sim_in.matAfile_c_str;
+	if ( !matAfile.empty() ) {
+		std::cout << "Using  <<" << matAfile << ">> pattern to read matrices " << std::endl;
+		for ( size_t i=0 ; i <= sim_in.max_coarsening_levels ; i++ ){
+			std::string fnamebase = matAfile + std::to_string(  i );
+			std::string fnameA = fnamebase + "_A.mtx";
+			std::string fnameM = fnamebase + "_M_diag.mtx";
+			std::string fnameP = fnamebase + "_P.mtx";
+			std::string fnameR = fnamebase + "_R.mtx";
+			sim_in.matAfiles.push_back (fnameA);
+			sim_in.matMfiles.push_back (fnameM);
+			sim_in.matPfiles.push_back (fnameP);
+			sim_in.matRfiles.push_back (fnameR);
+		}
+		{
+			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
+			std::string fnameA = fnamebase + "_A.mtx";
+			sim_in.matAfiles.push_back (fnameA);
+		}		
+
+		std::cout << "files to read matrices: " << std::endl;
+		for (std::string fname: sim_in.matAfiles) {
+			std::cout << fname << " \n";
+		}
+
+		for (std::string fname: sim_in.matMfiles) {
+			std::cout << fname << " \n";
+		}
+
+		for (std::string fname: sim_in.matPfiles) {
+			std::cout << fname << " \n";
+		}
+
+		for (std::string fname: sim_in.matRfiles) {
+			std::cout << fname << " \n";
+		}
+
+	}
+	else {
+		std::cout << "No pattern to read matrices provided" << std::endl;
+	}
+
 	// check for valid values
 	size_t ssize { std::max( next_pow_2( sim_in.nx ), PHYS_SYSTEM_SIZE_MIN ) };
 	if( ssize != sim_in.nx ) {

From 54e85125fcdc9dd382d6bf7a8ed09d3656096866 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Fri, 10 Jun 2022 17:00:46 +0200
Subject: [PATCH 02/17] fix timing

---
 tests/smoke/hpcg.cpp | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index caf47cb92..2ac6e40b0 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -220,6 +220,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	grb::utils::Timer timer;
 	timer.reset();
 
+	out.times.io = timer.time();
+
 	// assume successful run
 	out.error_code = SUCCESS;
 	RC rc { SUCCESS };
@@ -247,6 +249,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
+	out.times.preamble = timer.time();
+
 	Matrix< double > & A { hpcg_state->A };
 	Vector< double > & x { hpcg_state->x };
 	Vector< double > & b { hpcg_state->b };
@@ -264,14 +268,12 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
-	out.times.preamble = timer.time();
-
 	const bool with_preconditioning = ! in.no_preconditioning;
+	out.test_repetitions = 0;
 	if( in.evaluation_run ) {
-		out.test_repetitions = 0;
-		timer.reset();
+		double single_time_start = timer.time();
 		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-		double single_time = timer.time();
+		double single_time = timer.time() - single_time_start;
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
 		}
@@ -279,26 +281,35 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
 	} else {
 		// do benchmark
-		timer.reset();
+		double time_start = timer.time();
 		for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
 			rc = set( x, 0.0 );
 			assert( rc == SUCCESS );
 			rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-			std::cout << " ---> out.residual="<< out.residual << "\n";
-
-			rc = rc ? rc : grb::eWiseLambda( [ &x ]( const size_t i ) { x[ i ] -= 1;}, x );
-			print_norm (x, " ---> norm(x)");
 			
 			out.test_repetitions++;
 			if( rc != SUCCESS ) {
 				break;
 			}
+
+#ifdef HPCG_PRINT_SYSTEM
+			std::cout << " ---> out.residual="<< out.residual << "\n";
+#endif
 		}
-		double time_taken { timer.time() };
+		double time_taken = timer.time() - time_start;
+		rc = rc ? rc : collectives<>::reduce( time_taken, 0, operators::max< double >() );
 		out.times.useful = time_taken / static_cast< double >( out.test_repetitions );
+
+#ifdef HPCG_PRINT_SYSTEM
+		rc = rc ? rc : grb::eWiseLambda( [ &x ]( const size_t i ) { x[ i ] -= 1;}, x );
+		print_norm (x, " ---> norm(x)");
+#endif
 		// sleep( 1 );
 	}
 
+	out.times.io = timer.time() - out.times.io;
+	out.times.preamble = timer.time() - out.times.preamble;
+
 #ifdef HPCG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {
 		print_vector( x, 50, " ---> X(check out)" );
@@ -449,7 +460,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 			sim_in.matRfiles.push_back (fnameR);
 		}
 		{
-			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
+			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels + 1 );
 			std::string fnameA = fnamebase + "_A.mtx";
 			sim_in.matAfiles.push_back (fnameA);
 		}		

From 7cebaf44b347481adafb72e07d66248fa5e14f74 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Mon, 13 Jun 2022 20:55:41 +0200
Subject: [PATCH 03/17] loading data into the memory before tests

hpcg-benchmark leftovers cleanup
---
 .../algorithms/hpcg/multigrid_v_cycle.hpp     |   5 -
 .../algorithms/hpcg/system_building_utils.hpp | 141 +++---------
 tests/smoke/hpcg.cpp                          | 216 ++++++++++++++----
 3 files changed, 203 insertions(+), 159 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
index f4cd55bdc..a0004a9cb 100644
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
@@ -188,11 +188,6 @@ namespace grb {
 					assert( ret == SUCCESS );
 				}
 				
-				
-				// for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
-				// 	ret = ret ? ret : red_black_gauss_seidel( data, ring );
-				// 	assert( ret == SUCCESS );
-				// }
 				return ret;
 			}
 
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 8f9fe13a2..e9f3fdf97 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -1,3 +1,4 @@
+
 /*
  *   Copyright 2021 Huawei Technologies Co., Ltd.
  *
@@ -39,37 +40,6 @@
 namespace grb {
 	namespace algorithms {
 
-		/**
-		 * @brief Divide each value of \p source by \p step and store the result into \p destination.
-		 *
-		 * @tparam DIMS size of passed arrays
-		 */
-		template< std::size_t DIMS >
-		void divide_array( std::array< std::size_t, DIMS > & destination, const std::array< std::size_t, DIMS > & source, std::size_t step ) {
-			for( std::size_t i { 0 }; i < destination.size(); i++ ) {
-				destination[ i ] = source[ i ] / step;
-			}
-		}
-
-		/**
-		 * @brief Container of the parameter for HPCG simulation generation: physical system characteristics and
-		 * coarsening information.
-		 *
-		 * @tparam DIMS dimensions of the physical system
-		 * @tparam T type of matrix values
-		 */
-		template< std::size_t DIMS, typename T >
-		struct hpcg_system_params {
-			const std::array< std::size_t, DIMS > & physical_sys_sizes;
-			const std::size_t halo_size;
-			const std::size_t num_colors;
-			const T diag_value;
-			const T non_diag_value;
-			const std::size_t min_phys_size;
-			const std::size_t max_levels;
-			const std::size_t coarsening_step;
-		};
-
 		/**
 		 * @brief Generates an entire HPCG problem according to the parameters in \p params , storing it in \p holder .
 		 *
@@ -80,21 +50,20 @@ namespace grb {
 		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors and matrices) succeeded,
 		 * otherwise the first unsuccessful return value
 		 */
-		template< std::size_t DIMS, typename T = double, typename SYSINP >
-		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, hpcg_system_params< DIMS, T > & params, SYSINP &in ) {
+		template< typename T = double, typename SYSINP >
+		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, const std::size_t max_levels, SYSINP &in ) {
 			grb::RC rc { grb::SUCCESS };
 			std::size_t coarsening_level = 0UL;
-			grb::utils::MatrixFileReader< T, std::conditional<
-				( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
-												 grb::config::RowIndexType,
-												 grb::config::ColIndexType >::type
-										  > parser_A( in.matAfiles[coarsening_level].c_str(), true );
-			const size_t n_A = parser_A.n();
+			const size_t n_A = in.matAbuffer[ coarsening_level ].size();
 			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n_A ) };
 			rc = buildMatrixUnique( data->A,
-									parser_A.begin( SEQUENTIAL ), parser_A.end( SEQUENTIAL),
+									in.matAbuffer[ coarsening_level ].i_data,
+									in.matAbuffer[ coarsening_level ].j_data,
+									in.matAbuffer[ coarsening_level ].v_data,
+									in.matAbuffer[ coarsening_level ].size(),
 									SEQUENTIAL
 									);
+
 			/* Once internal issue #342 is resolved this can be re-enabled
 			   const RC rc = buildMatrixUnique( L,
 			   parser.begin( PARALLEL ), parser.end( PARALLEL),
@@ -110,30 +79,14 @@ namespace grb {
 			holder = std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > >( data );
 
 			{
-				T *buffer;
-				std::ifstream inFile;
-				inFile.open(in.matMfiles[ coarsening_level ]);
-				if (inFile.is_open())  	{
-					size_t n=n_A;
-					buffer = new T [ n ];
-					for (size_t i = 0; i < n; i++) {
-						inFile >> buffer[i];
-					}
-						
-					inFile.close(); // CLose input file
-				
-					RC rc = grb::buildVector( data->A_diagonal, buffer, buffer + n, SEQUENTIAL );
-					if ( rc != SUCCESS ) {
-						std::cerr << " buildVector failed!\n ";
-						return rc;
-					}
-					delete [] buffer;
+				RC rc = grb::buildVector( data->A_diagonal,
+										  in.matMbuffer[coarsening_level].v_data,
+										  in.matMbuffer[coarsening_level].v_data + in.matMbuffer[coarsening_level].size(),
+										  SEQUENTIAL );
+				if ( rc != SUCCESS ) {
+					std::cerr << " buildVector failed!\n ";
+					return rc;
 				}
-				/* Once internal issue #342 is resolved this can be re-enabled
-				   const RC rc = buildMatrixUnique( L,
-				   parser.begin( PARALLEL ), parser.end( PARALLEL),
-				   PARALLEL
-				   );*/
 			}
 
 			std::size_t coarser_size;
@@ -144,34 +97,24 @@ namespace grb {
 			assert( *coarser == nullptr );
 			
 			// generate linked list of hierarchical coarseners
-			while( coarsening_level  < params.max_levels ) {
+			while( coarsening_level  < max_levels ) {
 				assert( *coarser == nullptr );
 				
-				grb::utils::MatrixFileReader< T, std::conditional<
-					( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
-													 grb::config::RowIndexType,
-													 grb::config::ColIndexType >::type
-											  > parser_A_next( in.matAfiles[ coarsening_level + 1 ].c_str(), true );
-				
-				coarser_size = parser_A_next.n();
+				coarser_size = in.matAbuffer[ coarsening_level + 1 ].size();
 
 				// build data structures for new level
 				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
 				
 				// install coarser level immediately to cleanup in case of build error
 				*coarser = new_coarser;
-				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
 
-				
+				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
 				{
-					grb::utils::MatrixFileReader< T, std::conditional<
-						( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
-														 grb::config::RowIndexType,
-														 grb::config::ColIndexType >::type
-												  > parser_P( in.matRfiles[coarsening_level].c_str(), true );
-					
 					rc = buildMatrixUnique( new_coarser->coarsening_matrix,
-											parser_P.begin( SEQUENTIAL ), parser_P.end( SEQUENTIAL),
+											in.matRbuffer[ coarsening_level ].i_data,
+											in.matRbuffer[ coarsening_level ].j_data,
+											in.matRbuffer[ coarsening_level ].v_data,
+											in.matRbuffer[ coarsening_level ].size(),
 											SEQUENTIAL
 											);
 
@@ -184,7 +127,10 @@ namespace grb {
 				}
 				{
 					rc = buildMatrixUnique( new_coarser->A,
-											parser_A_next.begin( SEQUENTIAL ), parser_A_next.end( SEQUENTIAL),
+											in.matAbuffer[ coarsening_level + 1 ].i_data,
+											in.matAbuffer[ coarsening_level + 1 ].j_data,
+											in.matAbuffer[ coarsening_level + 1 ].v_data,
+											in.matAbuffer[ coarsening_level + 1 ].size(),
 											SEQUENTIAL
 											);
 
@@ -194,31 +140,18 @@ namespace grb {
 						return rc;
 					}					
 					
-				}				
-
-				//set( new_coarser->A_diagonal, params.diag_value );
-				if( coarsening_level + 1 < params.max_levels ) {
-					T *buffer;
-					std::ifstream inFile;
-					inFile.open(in.matMfiles[ coarsening_level + 1 ]);
-					if (inFile.is_open())  	{
-						size_t n=grb::nrows(new_coarser->A);
-						buffer = new T [ n ];
-						for (size_t i = 0; i < n; i++) {
-							inFile >> buffer[i];
-						}
-						
-						inFile.close(); // CLose input file
+				}
 				
-						RC rc = grb::buildVector( new_coarser->A_diagonal, buffer, buffer + n, SEQUENTIAL );
-						if ( rc != SUCCESS ) {
-							std::cerr << " buildVector failed!\n ";
-							return rc;
-						}
-						delete [] buffer;
+				if( coarsening_level + 1 < max_levels ) {
+					RC rc = grb::buildVector( new_coarser->A_diagonal,
+											  in.matMbuffer[ coarsening_level + 1 ].v_data,
+											  in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
+											  SEQUENTIAL );
+					if ( rc != SUCCESS ) {
+						std::cerr << " buildVector failed!\n ";
+						return rc;
 					}
-
-				}				
+				}
 				
 				// prepare for new iteration
 				coarser = &new_coarser->coarser_level;
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 2ac6e40b0..5884b45d5 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -100,17 +100,78 @@ static const char * const TEXT_HIGHLIGHT = "===> ";
 
 
 /**
- * @brief Container for system parameters to create the HPCG problem.
+ * @brief Container to store matrices loaded from a file.
  */
-struct system_input {
-	size_t nx, ny, nz;
-	size_t max_coarsening_levels;
+template< typename T = double >
+struct mat_data {
+	std::size_t *i_data;
+	std::size_t *j_data;
+	T *v_data;
+	std::size_t n;
+
+	void resize( std::size_t in ){
+		n=in;
+		i_data = new std::size_t [ in ];
+		j_data = new std::size_t [ in ];
+		v_data = new T [ in ];
+	};
+
+	mat_data( ){
+		n = 0;
+	};
+	
+	mat_data( std::size_t in ){
+		resize( in );
+	};
+
+	std::size_t size(){
+		return n;
+	}
+
+	~mat_data(){
+		delete [] i_data;
+		delete [] i_data;
+		delete [] v_data;
+	}
 };
 
+/**
+ * @brief Container to store vectors loaded from a file.
+ */
+template< typename T = double >
+struct vec_data {
+	T *v_data;
+	std::size_t n;
+
+	void resize( std::size_t in ){
+		n=in;
+		v_data = new T [ in ];
+	};
+
+	vec_data( ){
+		n = 0;
+	};
+	
+	vec_data( std::size_t in ){
+		resize( in );
+	};
+
+	std::size_t size(){
+		return n;
+	}
+
+	~vec_data(){
+		delete [] v_data;
+	}
+};
+
+
+
 /**
  * @brief Container for the parameters for the HPCG simulation.
  */
-struct simulation_input : public system_input {
+struct simulation_input {
+	std::size_t max_coarsening_levels;
 	std::size_t test_repetitions;
 	std::size_t max_iterations;
 	std::size_t smoother_steps;
@@ -119,17 +180,102 @@ struct simulation_input : public system_input {
 	std::vector< std::string > matMfiles;
 	std::vector< std::string > matPfiles;
 	std::vector< std::string > matRfiles;
+	
 	bool evaluation_run;
 	bool no_preconditioning;
 };
 
-struct preloaded_matrices {
-	std::vector< std::string > matAfiles;
-	std::vector< std::string > matMfiles;
-	std::vector< std::string > matPfiles;
-	std::vector< std::string > matRfiles;
+
+/**
+ * @brief Container to store all data for HPCG hierarchy.
+ */
+class preloaded_matrices : public simulation_input {
+
+public :
+	std::size_t nAmt, nMmt, nPmt, nRmt;
+	mat_data<double> *matAbuffer;
+	vec_data<double> *matMbuffer;
+	mat_data<double> *matPbuffer;
+	mat_data<double> *matRbuffer;
+
+	grb::RC read_matrix( std::vector< std::string > & fname,
+						  mat_data<double> * data ) {
+		grb::RC rc = SUCCESS;
+		for ( size_t i = 0; i < fname.size(); i++ ){
+			grb::utils::MatrixFileReader< double, std::conditional<
+				( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+													  grb::config::RowIndexType,
+													  grb::config::ColIndexType >::type
+										  > parser_mat( fname[i].c_str(), true );
+#ifdef DEBUG			
+			std::cout << " ---> parser_mat.filename()=" << parser_mat.filename() << "\n";
+			std::cout << " ---> parser_mat.nz()=" << parser_mat.nz() << "\n";
+			std::cout << " ---> parser_mat.n()=" << parser_mat.n() << "\n";
+			std::cout << " ---> parser_mat.m()=" << parser_mat.m() << "\n";
+			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
+#endif
+			
+			data[i].resize( parser_mat.nz() );
+			size_t k = 0;
+			for (auto it=parser_mat.begin( SEQUENTIAL );
+				 it != parser_mat.end( SEQUENTIAL);
+				 ++it ){
+				data[i].i_data[k]=it.i();
+				data[i].j_data[k]=it.j();
+				data[i].v_data[k]=it.v();
+				k++;
+			}
+		}
+
+		return ( rc );
+	}
+
+
+	grb::RC read_vector( std::vector< std::string > & fname,
+						 vec_data<double> * data ) {
+		grb::RC rc = SUCCESS;
+		for ( size_t i = 0; i < fname.size(); i++ ){
+			std::ifstream inFile;
+			inFile.open(fname[i].c_str() );
+			std::string headder;
+			size_t n,m;
+			std::getline(inFile, headder); // skip the first line
+			inFile >> n >> m;
+			data[i].resize( n );
+			if (inFile.is_open())  	{
+				for (size_t j = 0; j < n; j++) {
+					inFile >> data[i].v_data[j];
+				}
+				inFile.close();
+			}
+		}
+		return ( rc );
+	}
+	
+	grb::RC read_vec_matrics(){
+		grb::RC rc = SUCCESS;
+
+		nAmt = matAfiles.size() ;
+		nMmt = matMfiles.size() ;
+		nPmt = matPfiles.size() ;
+		nRmt = matRfiles.size() ;
+
+		matAbuffer = new mat_data<double> [ nAmt ];
+		matMbuffer = new vec_data<double> [ nMmt ];
+		matPbuffer = new mat_data<double> [ nPmt ];
+		matRbuffer = new mat_data<double> [ nRmt ];
+		
+		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
+		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
+		rc = rc ? rc : read_matrix( matPfiles, matPbuffer );
+		rc = rc ? rc : read_vector( matMfiles, matMbuffer );
+
+		return rc;
+	}
 };
 
+
+
 /**
  * @brief Containers for test outputs.
  */
@@ -165,20 +311,6 @@ T static next_pow_2( T n ) {
 	return n + 1;
 }
 
-/**
- * @brief Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
- * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
- */
-static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in, preloaded_matrices & inMats ) {
-	const std::array< std::size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
-	struct hpcg_system_params< 3, double > params {
-		physical_sys_sizes, HALO_RADIUS, BAND_WIDTH_3D * 2 + 1, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
-	};
-
-	return build_hpcg_system< 3, double >( holder, params, inMats );
-}
-
-
 
 #ifdef HPCG_PRINT_SYSTEM
 static void print_system( const hpcg_data< double, double, double > & data ) {
@@ -230,13 +362,17 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	inputData.matAfiles = in.matAfiles;
 	inputData.matMfiles = in.matMfiles;
 	inputData.matPfiles = in.matPfiles;
-	inputData.matRfiles = in.matRfiles;	
-	
-	
+	inputData.matRfiles = in.matRfiles;
+
+	rc = inputData.read_vec_matrics();
+	if( rc != SUCCESS ) {
+		std::cerr << "Failure to read data" << std::endl;
+	}
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
-	rc = build_3d_system( hpcg_state, in, inputData );
+	rc = build_hpcg_system< double >( hpcg_state, in.max_coarsening_levels, inputData );
+
 
 	if( rc != SUCCESS ) {
 		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
@@ -359,9 +495,6 @@ int main( int argc, char ** argv ) {
 	double max_residual_norm;
 
 	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
-	thcout << "System size x: " << sim_in.nx << std::endl;
-	thcout << "System size y: " << sim_in.ny << std::endl;
-	thcout << "System size z: " << sim_in.nz << std::endl;
 	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
 	thcout << "Test repetitions: " << sim_in.test_repetitions << std::endl;
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
@@ -427,9 +560,6 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 								"level may be limited by the minimum system dimension" )
 		.add_optional_argument( "--mat_files_pattern", sim_in.matAfile_c_str, "","file pattern for files contining matrices A, M_diag, P, R "
 		 						"i.e. '--mat_a_file_names /path/to/dir/level_  --max_coarse-levels 2' will read /path/to/dir/level_0_A.mtx,  /path/to/dir/level_1_A.mtx, /path/to/dir/level_2_A.mtx ... " )
-		.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
-		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
-		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
 		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
 		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
 		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of HPCG iterations" )
@@ -444,7 +574,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	
 
 	parser.parse( argc, argv );
-
+	
 	std::string matAfile=sim_in.matAfile_c_str;
 	if ( !matAfile.empty() ) {
 		std::cout << "Using  <<" << matAfile << ">> pattern to read matrices " << std::endl;
@@ -458,6 +588,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 			sim_in.matMfiles.push_back (fnameM);
 			sim_in.matPfiles.push_back (fnameP);
 			sim_in.matRfiles.push_back (fnameR);
+			
 		}
 		{
 			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels + 1 );
@@ -488,21 +619,6 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	}
 
 	// check for valid values
-	size_t ssize { std::max( next_pow_2( sim_in.nx ), PHYS_SYSTEM_SIZE_MIN ) };
-	if( ssize != sim_in.nx ) {
-		std::cout << "Setting system size x to " << ssize << " instead of " << sim_in.nx << std::endl;
-		sim_in.nx = ssize;
-	}
-	ssize = std::max( next_pow_2( sim_in.ny ), PHYS_SYSTEM_SIZE_MIN );
-	if( ssize != sim_in.ny ) {
-		std::cout << "Setting system size y to " << ssize << " instead of " << sim_in.ny << std::endl;
-		sim_in.ny = ssize;
-	}
-	ssize = std::max( next_pow_2( sim_in.nz ), PHYS_SYSTEM_SIZE_MIN );
-	if( ssize != sim_in.nz ) {
-		std::cout << "Setting system size z to " << ssize << " instead of " << sim_in.nz << std::endl;
-		sim_in.nz = ssize;
-	}
 	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
 		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
 		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;

From c2e1107e5fb14b7bbbe36da90b800c27abca5254 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Tue, 14 Jun 2022 14:36:34 +0200
Subject: [PATCH 04/17] fix size and missing multigrid level residual and error
 print

---
 .../algorithms/hpcg/multigrid_v_cycle.hpp     | 11 ++-
 .../algorithms/hpcg/system_building_utils.hpp | 28 +++++--
 tests/smoke/hpcg.cpp                          | 80 +++++++++++--------
 3 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
index a0004a9cb..7d1fbd8b4 100644
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
@@ -164,6 +164,7 @@ namespace grb {
 													   minus ); 
 
 #ifdef HPCG_PRINT_STEPS
+					std::cout << " data.A(spai0): " << nrows(data.A) << " x " << ncols(data.A) << " \n";
 					print_norm( data.A_diagonal, " data.A_diagonal" );
 					print_norm( data.smoother_temp, " data.smoother_temp" );
 					print_norm( data.z, " data.z" );
@@ -172,7 +173,7 @@ namespace grb {
 					ret = ret ? ret :
 						grb::eWiseLambda(
 										 [ &data ]( const size_t i ) {
-#ifdef HPCG_PRINT_STEPS											 
+#ifdef HPCG_PRINT_STEPS
 											 if( i < 100 ){
 												 std::cout << " i= " << i
 														   << " data.z[i]= " << data.z[i]
@@ -180,14 +181,14 @@ namespace grb {
 														   << " data.smoother_temp[i]= " << data.smoother_temp[i]
 														   << "\n";
 											 }
-#endif											 
+#endif
 											 data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
 										 },
 										 data.A_diagonal, data.z, data.smoother_temp );
-					
+
 					assert( ret == SUCCESS );
 				}
-				
+
 				return ret;
 			}
 
@@ -272,7 +273,9 @@ namespace grb {
 				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
 				assert( ret == SUCCESS );
 
+				
 #ifdef HPCG_PRINT_STEPS
+				std::cout << "  data.A: " << nrows(data.A) << " x " << ncols(data.A) << " \n";
 				print_norm( cd.r, "before coarse cd.r" );
 #endif				
 
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index e9f3fdf97..a917053b0 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -54,7 +54,7 @@ namespace grb {
 		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, const std::size_t max_levels, SYSINP &in ) {
 			grb::RC rc { grb::SUCCESS };
 			std::size_t coarsening_level = 0UL;
-			const size_t n_A = in.matAbuffer[ coarsening_level ].size();
+			const size_t n_A = in.matAbuffer[ coarsening_level ].get_n();
 			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n_A ) };
 			rc = buildMatrixUnique( data->A,
 									in.matAbuffer[ coarsening_level ].i_data,
@@ -74,7 +74,9 @@ namespace grb {
 						  << "(" << toString( rc ) << ")." << std::endl;
 				return rc;
 			}
-			
+#ifdef HPCG_PRINT_STEPS
+			std::cout << " buildMatrixUnique: constructed data->A " << nrows(data->A) << " x " << ncols(data->A) << " matrix \n";
+#endif
 			assert( ! holder ); // should be empty
 			holder = std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > >( data );
 
@@ -87,6 +89,10 @@ namespace grb {
 					std::cerr << " buildVector failed!\n ";
 					return rc;
 				}
+#ifdef HPCG_PRINT_STEPS
+				std::cout << " buildVector: data->A_diagonal "
+						  << size(data->A_diagonal) << " vector \n";
+#endif
 			}
 
 			std::size_t coarser_size;
@@ -100,7 +106,7 @@ namespace grb {
 			while( coarsening_level  < max_levels ) {
 				assert( *coarser == nullptr );
 				
-				coarser_size = in.matAbuffer[ coarsening_level + 1 ].size();
+				coarser_size = in.matAbuffer[ coarsening_level + 1 ].get_n();
 
 				// build data structures for new level
 				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
@@ -123,7 +129,10 @@ namespace grb {
 								  << "(" << toString( rc ) << ")." << std::endl;
 						return rc;
 					}
-					
+#ifdef HPCG_PRINT_STEPS
+					std::cout << " buildMatrixUnique: constructed new_coarser->coarsening_matrix "
+							  << nrows(new_coarser->coarsening_matrix) << " x " << ncols(new_coarser->coarsening_matrix) << " matrix \n";
+#endif
 				}
 				{
 					rc = buildMatrixUnique( new_coarser->A,
@@ -138,8 +147,11 @@ namespace grb {
 						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
 								  << "(" << toString( rc ) << ")." << std::endl;
 						return rc;
-					}					
-					
+					}
+#ifdef HPCG_PRINT_STEPS
+					std::cout << " buildMatrixUnique: constructed new_coarser->A "
+							  << nrows(new_coarser->A) << " x " << ncols(new_coarser->A) << " matrix \n";
+#endif
 				}
 				
 				if( coarsening_level + 1 < max_levels ) {
@@ -151,6 +163,10 @@ namespace grb {
 						std::cerr << " buildVector failed!\n ";
 						return rc;
 					}
+#ifdef HPCG_PRINT_STEPS
+					std::cout << " buildVector: new_coarser->A_diagonal "
+							  << size(new_coarser->A_diagonal) << " vector \n";
+#endif
 				}
 				
 				// prepare for new iteration
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 5884b45d5..aac11685b 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -107,17 +107,19 @@ struct mat_data {
 	std::size_t *i_data;
 	std::size_t *j_data;
 	T *v_data;
-	std::size_t n;
-
-	void resize( std::size_t in ){
-		n=in;
-		i_data = new std::size_t [ in ];
-		j_data = new std::size_t [ in ];
-		v_data = new T [ in ];
+	std::size_t nz, n, m;
+
+	void resize( std::size_t innz, std::size_t inn, std::size_t inm ){
+		nz=innz;
+		n=inn;
+		m=inm;
+		i_data = new std::size_t [ nz ];
+		j_data = new std::size_t [ nz ];
+		v_data = new T [ nz ];
 	};
 
 	mat_data( ){
-		n = 0;
+		nz = 0;
 	};
 	
 	mat_data( std::size_t in ){
@@ -125,14 +127,22 @@ struct mat_data {
 	};
 
 	std::size_t size(){
+		return nz;
+	};
+
+	std::size_t get_n(){
 		return n;
-	}
+	};
+
+	std::size_t get_m(){
+		return m;
+	};
 
 	~mat_data(){
 		delete [] i_data;
 		delete [] i_data;
 		delete [] v_data;
-	}
+	};
 };
 
 /**
@@ -145,7 +155,7 @@ struct vec_data {
 
 	void resize( std::size_t in ){
 		n=in;
-		v_data = new T [ in ];
+		v_data = new T [ n ];
 	};
 
 	vec_data( ){
@@ -158,11 +168,11 @@ struct vec_data {
 
 	std::size_t size(){
 		return n;
-	}
+	};
 
 	~vec_data(){
 		delete [] v_data;
-	}
+	};
 };
 
 
@@ -192,7 +202,7 @@ struct simulation_input {
 class preloaded_matrices : public simulation_input {
 
 public :
-	std::size_t nAmt, nMmt, nPmt, nRmt;
+	std::size_t nzAmt, nzMmt, nzPmt, nzRmt;
 	mat_data<double> *matAbuffer;
 	vec_data<double> *matMbuffer;
 	mat_data<double> *matPbuffer;
@@ -215,7 +225,7 @@ public :
 			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
 #endif
 			
-			data[i].resize( parser_mat.nz() );
+			data[i].resize( parser_mat.nz(), parser_mat.n(), parser_mat.m() );
 			size_t k = 0;
 			for (auto it=parser_mat.begin( SEQUENTIAL );
 				 it != parser_mat.end( SEQUENTIAL);
@@ -255,15 +265,15 @@ public :
 	grb::RC read_vec_matrics(){
 		grb::RC rc = SUCCESS;
 
-		nAmt = matAfiles.size() ;
-		nMmt = matMfiles.size() ;
-		nPmt = matPfiles.size() ;
-		nRmt = matRfiles.size() ;
+		nzAmt = matAfiles.size() ;
+		nzMmt = matMfiles.size() ;
+		nzPmt = matPfiles.size() ;
+		nzRmt = matRfiles.size() ;
 
-		matAbuffer = new mat_data<double> [ nAmt ];
-		matMbuffer = new vec_data<double> [ nMmt ];
-		matPbuffer = new mat_data<double> [ nPmt ];
-		matRbuffer = new mat_data<double> [ nRmt ];
+		matAbuffer = new mat_data<double> [ nzAmt ];
+		matMbuffer = new vec_data<double> [ nzMmt ];
+		matPbuffer = new mat_data<double> [ nzPmt ];
+		matRbuffer = new mat_data<double> [ nzRmt ];
 		
 		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
 		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
@@ -346,13 +356,10 @@ void print_norm( const grb::Vector< T > & r, const char * head, const Ring & rin
  * parameters in the reference HPCG test.
  */
 void grbProgram( const simulation_input & in, struct output & out ) {
-	
+	grb::utils::Timer timer;
 	// get user process ID
 	assert( spmd<>::pid() < spmd<>::nprocs() );
-	grb::utils::Timer timer;
-	timer.reset();
 
-	out.times.io = timer.time();
 
 	// assume successful run
 	out.error_code = SUCCESS;
@@ -369,11 +376,14 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		std::cerr << "Failure to read data" << std::endl;
 	}
 
+	timer.reset();
+	// start timing after all data are in memory 
+	out.times.io = timer.time();
+
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
 	rc = build_hpcg_system< double >( hpcg_state, in.max_coarsening_levels, inputData );
 
-
 	if( rc != SUCCESS ) {
 		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
 		out.error_code = rc;
@@ -397,6 +407,11 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	rc = grb::mxv( b, A, x, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
 	set( x, 0.0 );
 
+	double norm_b=0;
+	rc = grb::dot( norm_b, b, b, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
+	(void)rc;
+	assert( rc == SUCCESS );
+
 #ifdef HPCG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {
 		print_vector( x, 50, " ---> X(1)" );
@@ -428,9 +443,6 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 				break;
 			}
 
-#ifdef HPCG_PRINT_SYSTEM
-			std::cout << " ---> out.residual="<< out.residual << "\n";
-#endif
 		}
 		double time_taken = timer.time() - time_start;
 		rc = rc ? rc : collectives<>::reduce( time_taken, 0, operators::max< double >() );
@@ -444,6 +456,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 
 	out.times.io = timer.time() - out.times.io;
+	
 	out.times.preamble = timer.time() - out.times.preamble;
 
 #ifdef HPCG_PRINT_SYSTEM
@@ -459,6 +472,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 				std::cout << "Info: cold HPCG completed within " << out.performed_iterations << " iterations. Last computed residual is " << out.residual << ". Time taken was " << out.times.useful
 						  << " ms. Deduced inner repetitions parameter of " << out.test_repetitions << " to take 1 second or more per inner benchmark." << std::endl;
 			} else {
+				std::cout << "Final residual= "<< out.residual << " relative error= " <<  out.residual/sqrt(norm_b) << "\n";
 				std::cout << "Average time taken for each of " << out.test_repetitions << " HPCG calls (hot start): " << out.times.useful << std::endl;
 			}
 		} else {
@@ -578,7 +592,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	std::string matAfile=sim_in.matAfile_c_str;
 	if ( !matAfile.empty() ) {
 		std::cout << "Using  <<" << matAfile << ">> pattern to read matrices " << std::endl;
-		for ( size_t i=0 ; i <= sim_in.max_coarsening_levels ; i++ ){
+		for ( size_t i=0 ; i < sim_in.max_coarsening_levels ; i++ ){
 			std::string fnamebase = matAfile + std::to_string(  i );
 			std::string fnameA = fnamebase + "_A.mtx";
 			std::string fnameM = fnamebase + "_M_diag.mtx";
@@ -591,7 +605,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 			
 		}
 		{
-			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels + 1 );
+			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
 			std::string fnameA = fnamebase + "_A.mtx";
 			sim_in.matAfiles.push_back (fnameA);
 		}		

From 31154a6f6922b8cf7c59f2e5b562c02a0adaf678 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Wed, 15 Jun 2022 11:05:08 +0200
Subject: [PATCH 05/17] timer add IO

---
 tests/smoke/hpcg.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index aac11685b..eb9c22916 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -25,7 +25,6 @@
  *
  * @date 2021-04-30
  */
-
 #include <array>
 #include <cassert>
 #include <cmath>
@@ -357,6 +356,8 @@ void print_norm( const grb::Vector< T > & r, const char * head, const Ring & rin
  */
 void grbProgram( const simulation_input & in, struct output & out ) {
 	grb::utils::Timer timer;
+	timer.reset();
+
 	// get user process ID
 	assert( spmd<>::pid() < spmd<>::nprocs() );
 
@@ -376,9 +377,9 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		std::cerr << "Failure to read data" << std::endl;
 	}
 
-	timer.reset();
-	// start timing after all data are in memory 
 	out.times.io = timer.time();
+	timer.reset();
+
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
@@ -395,8 +396,6 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
-	out.times.preamble = timer.time();
-
 	Matrix< double > & A { hpcg_state->A };
 	Vector< double > & x { hpcg_state->x };
 	Vector< double > & b { hpcg_state->b };
@@ -419,6 +418,9 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
+	out.times.preamble = timer.time();
+	timer.reset();
+
 	const bool with_preconditioning = ! in.no_preconditioning;
 	out.test_repetitions = 0;
 	if( in.evaluation_run ) {
@@ -454,10 +456,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 #endif
 		// sleep( 1 );
 	}
-
-	out.times.io = timer.time() - out.times.io;
-	
-	out.times.preamble = timer.time() - out.times.preamble;
+	timer.reset();
 
 #ifdef HPCG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {

From bc2aa43c68470ad79f27e50d16b5879abbf0eedf Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Wed, 15 Jun 2022 13:54:56 +0200
Subject: [PATCH 06/17] read the last level of  M-diag

---
 .../algorithms/hpcg/multigrid_v_cycle.hpp     |  3 ++-
 .../algorithms/hpcg/system_building_utils.hpp | 24 +++++++++----------
 tests/smoke/hpcg.cpp                          | 24 ++++++++++++-------
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
index 7d1fbd8b4..a006726cd 100644
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
@@ -174,13 +174,14 @@ namespace grb {
 						grb::eWiseLambda(
 										 [ &data ]( const size_t i ) {
 #ifdef HPCG_PRINT_STEPS
-											 if( i < 100 ){
+											 if( i < 10 || i + 10 > size( data.A_diagonal ) ){
 												 std::cout << " i= " << i
 														   << " data.z[i]= " << data.z[i]
 														   << " data.A_diagonal[i]= " << data.A_diagonal[i]
 														   << " data.smoother_temp[i]= " << data.smoother_temp[i]
 														   << "\n";
 											 }
+											 std::cout << "\n";
 #endif
 											 data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
 										 },
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index a917053b0..93e9b4dba 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -154,21 +154,19 @@ namespace grb {
 #endif
 				}
 				
-				if( coarsening_level + 1 < max_levels ) {
-					RC rc = grb::buildVector( new_coarser->A_diagonal,
-											  in.matMbuffer[ coarsening_level + 1 ].v_data,
-											  in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
-											  SEQUENTIAL );
-					if ( rc != SUCCESS ) {
-						std::cerr << " buildVector failed!\n ";
-						return rc;
-					}
+				RC rc = grb::buildVector( new_coarser->A_diagonal,
+										  in.matMbuffer[ coarsening_level + 1 ].v_data,
+										  in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
+										  SEQUENTIAL );
+				if ( rc != SUCCESS ) {
+					std::cerr << " buildVector failed!\n ";
+					return rc;
+				}
 #ifdef HPCG_PRINT_STEPS
-					std::cout << " buildVector: new_coarser->A_diagonal "
-							  << size(new_coarser->A_diagonal) << " vector \n";
+				std::cout << " buildVector: new_coarser->A_diagonal "
+						  << size(new_coarser->A_diagonal) << " vector \n";
 #endif
-				}
-				
+
 				// prepare for new iteration
 				coarser = &new_coarser->coarser_level;
 				previous_size = coarser_size;
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index eb9c22916..f45dacfc3 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -216,14 +216,13 @@ public :
 													  grb::config::RowIndexType,
 													  grb::config::ColIndexType >::type
 										  > parser_mat( fname[i].c_str(), true );
-#ifdef DEBUG			
+#ifdef DEBUG
 			std::cout << " ---> parser_mat.filename()=" << parser_mat.filename() << "\n";
 			std::cout << " ---> parser_mat.nz()=" << parser_mat.nz() << "\n";
 			std::cout << " ---> parser_mat.n()=" << parser_mat.n() << "\n";
 			std::cout << " ---> parser_mat.m()=" << parser_mat.m() << "\n";
 			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
 #endif
-			
 			data[i].resize( parser_mat.nz(), parser_mat.n(), parser_mat.m() );
 			size_t k = 0;
 			for (auto it=parser_mat.begin( SEQUENTIAL );
@@ -244,21 +243,26 @@ public :
 						 vec_data<double> * data ) {
 		grb::RC rc = SUCCESS;
 		for ( size_t i = 0; i < fname.size(); i++ ){
+#ifdef DEBUG
+			std::cout << " Reading " << fname[i].c_str() << ".\n";
+#endif
 			std::ifstream inFile;
 			inFile.open(fname[i].c_str() );
+			if( ! inFile.is_open() ){
+				std::cout << " Cannot open "<< fname[i].c_str() <<  "\n";
+				return( PANIC );
+			}
 			std::string headder;
 			size_t n,m;
 			std::getline(inFile, headder); // skip the first line
 			inFile >> n >> m;
 			data[i].resize( n );
-			if (inFile.is_open())  	{
-				for (size_t j = 0; j < n; j++) {
-					inFile >> data[i].v_data[j];
-				}
-				inFile.close();
+			for (size_t j = 0; j < n; j++) {
+				inFile >> data[i].v_data[j];
 			}
+			inFile.close();
 		}
-		return ( rc );
+		return( rc );
 	}
 	
 	grb::RC read_vec_matrics(){
@@ -605,8 +609,12 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 		}
 		{
 			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
+
 			std::string fnameA = fnamebase + "_A.mtx";
 			sim_in.matAfiles.push_back (fnameA);
+
+			std::string fnameM = fnamebase + "_M_diag.mtx";
+			sim_in.matMfiles.push_back (fnameM);
 		}		
 
 		std::cout << "files to read matrices: " << std::endl;

From c4f349eb09f919ed28d1352cdf82fc5e4721e124 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Wed, 15 Jun 2022 15:49:40 +0200
Subject: [PATCH 07/17] multi-line comments in dense matrix market format

---
 tests/smoke/hpcg.cpp | 56 ++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index f45dacfc3..82a46129e 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -202,20 +202,20 @@ class preloaded_matrices : public simulation_input {
 
 public :
 	std::size_t nzAmt, nzMmt, nzPmt, nzRmt;
-	mat_data<double> *matAbuffer;
-	vec_data<double> *matMbuffer;
-	mat_data<double> *matPbuffer;
-	mat_data<double> *matRbuffer;
+	mat_data< double > * matAbuffer;
+	vec_data< double > * matMbuffer;
+	mat_data< double > * matPbuffer;
+	mat_data< double > * matRbuffer;
 
 	grb::RC read_matrix( std::vector< std::string > & fname,
 						  mat_data<double> * data ) {
 		grb::RC rc = SUCCESS;
-		for ( size_t i = 0; i < fname.size(); i++ ){
+		for ( size_t i = 0; i < fname.size(); i++ ) {
 			grb::utils::MatrixFileReader< double, std::conditional<
 				( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
 													  grb::config::RowIndexType,
 													  grb::config::ColIndexType >::type
-										  > parser_mat( fname[i].c_str(), true );
+										  > parser_mat( fname[ i ].c_str(), true );
 #ifdef DEBUG
 			std::cout << " ---> parser_mat.filename()=" << parser_mat.filename() << "\n";
 			std::cout << " ---> parser_mat.nz()=" << parser_mat.nz() << "\n";
@@ -228,9 +228,9 @@ public :
 			for (auto it=parser_mat.begin( SEQUENTIAL );
 				 it != parser_mat.end( SEQUENTIAL);
 				 ++it ){
-				data[i].i_data[k]=it.i();
-				data[i].j_data[k]=it.j();
-				data[i].v_data[k]=it.v();
+				data[ i ].i_data[ k ]=it.i();
+				data[ i ].j_data[ k ]=it.j();
+				data[ i ].v_data[ k ]=it.v();
 				k++;
 			}
 		}
@@ -240,25 +240,29 @@ public :
 
 
 	grb::RC read_vector( std::vector< std::string > & fname,
-						 vec_data<double> * data ) {
+						 vec_data< double > * data ) {
 		grb::RC rc = SUCCESS;
 		for ( size_t i = 0; i < fname.size(); i++ ){
 #ifdef DEBUG
-			std::cout << " Reading " << fname[i].c_str() << ".\n";
+			std::cout << " Reading " << fname[ i ].c_str() << ".\n";
 #endif
 			std::ifstream inFile;
-			inFile.open(fname[i].c_str() );
-			if( ! inFile.is_open() ){
-				std::cout << " Cannot open "<< fname[i].c_str() <<  "\n";
+			inFile.open( fname[ i ].c_str() );
+			if( ! inFile.is_open() ) {
+				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
 				return( PANIC );
 			}
 			std::string headder;
-			size_t n,m;
-			std::getline(inFile, headder); // skip the first line
-			inFile >> n >> m;
-			data[i].resize( n );
-			for (size_t j = 0; j < n; j++) {
-				inFile >> data[i].v_data[j];
+			size_t n, m;
+			std::getline( inFile, headder ); // skip the first line
+			while( headder.at( 0 ) == '%' ) {
+				std::getline( inFile, headder );
+			}
+			std::stringstream ss( headder );
+			ss >> n >> m;						
+			data[ i ].resize( n );
+			for ( size_t j = 0; j < n; j++ ) {
+				inFile >> data[ i ].v_data[ j ];
 			}
 			inFile.close();
 		}
@@ -273,10 +277,10 @@ public :
 		nzPmt = matPfiles.size() ;
 		nzRmt = matRfiles.size() ;
 
-		matAbuffer = new mat_data<double> [ nzAmt ];
-		matMbuffer = new vec_data<double> [ nzMmt ];
-		matPbuffer = new mat_data<double> [ nzPmt ];
-		matRbuffer = new mat_data<double> [ nzRmt ];
+		matAbuffer = new mat_data< double > [ nzAmt ];
+		matMbuffer = new vec_data< double > [ nzMmt ];
+		matPbuffer = new mat_data< double > [ nzPmt ];
+		matRbuffer = new mat_data< double > [ nzRmt ];
 		
 		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
 		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
@@ -337,7 +341,7 @@ static void print_system( const hpcg_data< double, double, double > & data ) {
 }
 #endif
 
-//#ifdef HPCG_PRINT_STEPS
+#ifdef HPCG_PRINT_STEPS
 template< typename T,
 		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
 	>
@@ -352,7 +356,7 @@ void print_norm( const grb::Vector< T > & r, const char * head, const Ring & rin
 	}
 	std::cout << norm << std::endl;
 }
-//#endif
+#endif
 
 /**
  * @brief Main test, building an HPCG problem and running the simulation closely following the

From 335df2e0479720adcd5286b19227ce236e9d7020 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Thu, 23 Jun 2022 17:46:35 +0200
Subject: [PATCH 08/17] date buffer global

---
 tests/smoke/hpcg.cpp | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 82a46129e..bc3700c01 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -194,7 +194,7 @@ struct simulation_input {
 	bool no_preconditioning;
 };
 
-
+static bool matloaded=false;
 /**
  * @brief Container to store all data for HPCG hierarchy.
  */
@@ -223,7 +223,7 @@ public :
 			std::cout << " ---> parser_mat.m()=" << parser_mat.m() << "\n";
 			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
 #endif
-			data[i].resize( parser_mat.nz(), parser_mat.n(), parser_mat.m() );
+			data[i].resize( parser_mat.entries()*2, parser_mat.n(), parser_mat.m() );
 			size_t k = 0;
 			for (auto it=parser_mat.begin( SEQUENTIAL );
 				 it != parser_mat.end( SEQUENTIAL);
@@ -233,11 +233,11 @@ public :
 				data[ i ].v_data[ k ]=it.v();
 				k++;
 			}
+			data[i].nz=k;
 		}
 
 		return ( rc );
-	}
-
+	}	
 
 	grb::RC read_vector( std::vector< std::string > & fname,
 						 vec_data< double > * data ) {
@@ -284,7 +284,7 @@ public :
 		
 		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
 		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
-		rc = rc ? rc : read_matrix( matPfiles, matPbuffer );
+		// rc = rc ? rc : read_matrix( matPfiles, matPbuffer );
 		rc = rc ? rc : read_vector( matMfiles, matMbuffer );
 
 		return rc;
@@ -293,6 +293,8 @@ public :
 
 
 
+preloaded_matrices inputData;
+
 /**
  * @brief Containers for test outputs.
  */
@@ -374,15 +376,18 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.error_code = SUCCESS;
 	RC rc { SUCCESS };
 
-	preloaded_matrices inputData;
-	inputData.matAfiles = in.matAfiles;
-	inputData.matMfiles = in.matMfiles;
-	inputData.matPfiles = in.matPfiles;
-	inputData.matRfiles = in.matRfiles;
+	if(!matloaded){
+		//preloaded_matrices inputData;
+		inputData.matAfiles = in.matAfiles;
+		inputData.matMfiles = in.matMfiles;
+		inputData.matPfiles = in.matPfiles;
+		inputData.matRfiles = in.matRfiles;
 
-	rc = inputData.read_vec_matrics();
-	if( rc != SUCCESS ) {
-		std::cerr << "Failure to read data" << std::endl;
+		rc = inputData.read_vec_matrics();
+		if( rc != SUCCESS ) {
+			std::cerr << "Failure to read data" << std::endl;
+		}
+		matloaded=true;
 	}
 
 	out.times.io = timer.time();
@@ -419,6 +424,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	(void)rc;
 	assert( rc == SUCCESS );
 
+	
+
 #ifdef HPCG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {
 		print_vector( x, 50, " ---> X(1)" );
@@ -426,6 +433,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
+	std::cout  << " ---> before solver \n";
+
 	out.times.preamble = timer.time();
 	timer.reset();
 

From 2165bcfa5c98f23b8a3a6fe393b4fc74feb67aad Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Fri, 8 Jul 2022 13:49:09 +0200
Subject: [PATCH 09/17] initial version: no checks!

---
 tests/smoke/hpcg.cpp | 88 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 75 insertions(+), 13 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index bc3700c01..d5121d093 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -195,6 +195,22 @@ struct simulation_input {
 };
 
 static bool matloaded=false;
+
+
+template<typename CharT, typename TraitsT = std::char_traits<CharT> >
+class vectorwrapbuf :
+	public std::basic_streambuf<CharT, TraitsT> {
+public:
+    vectorwrapbuf(std::vector<CharT> &vec) {
+		this->setg(vec.data(), vec.data(), vec.data() + vec.size());
+    }
+};
+std::istream& operator>>(std::istream& is, std::string& s){
+	std::getline(is, s);
+	return is;
+}
+
+
 /**
  * @brief Container to store all data for HPCG hierarchy.
  */
@@ -224,13 +240,46 @@ public :
 			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
 #endif
 			data[i].resize( parser_mat.entries()*2, parser_mat.n(), parser_mat.m() );
+			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
+			if( ! inFile.is_open() ) {
+				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			}
+			std::streamsize size = inFile.tellg();
+			inFile.seekg(0, std::ios::beg);
+			std::vector<char> buffer(size);
+			if (inFile.read(buffer.data(), size))	{
+				/* alles gut! */
+			}
+			else {
+				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			} 
+			inFile.close();
+			vectorwrapbuf<char> databuf(buffer);
+			std::istream isdata(&databuf);
+			std::string headder;
+			isdata >> headder;
+			while( headder.at( 0 ) == '%' ) {
+				isdata >> headder;
+			}
+			std::stringstream ss( headder );
+			size_t n, m, nz;
+			ss >> n >> m >> nz ;
 			size_t k = 0;
-			for (auto it=parser_mat.begin( SEQUENTIAL );
-				 it != parser_mat.end( SEQUENTIAL);
-				 ++it ){
-				data[ i ].i_data[ k ]=it.i();
-				data[ i ].j_data[ k ]=it.j();
-				data[ i ].v_data[ k ]=it.v();
+			for ( size_t j = 0; j < nz; j++ ) {
+				// 	data[ i ].i_data[ k ]=it.i();
+				// 	data[ i ].j_data[ k ]=it.j();
+				// 	data[ i ].v_data[ k ]=it.v();
+				size_t itmp,jtmp;
+				double vtmp;
+				isdata >> itmp >> jtmp >> vtmp;
+				data[ i ].i_data[ k ]=itmp-1;
+				data[ i ].j_data[ k ]=jtmp-1;
+				data[ i ].v_data[ k ]=vtmp;				
+				// isdata >> data[ i ].i_data[ k ]
+				// 	   >> data[ i ].j_data[ k ]
+				// 	   >> data[ i ].v_data[ k ] ;
 				k++;
 			}
 			data[i].nz=k;
@@ -246,25 +295,38 @@ public :
 #ifdef DEBUG
 			std::cout << " Reading " << fname[ i ].c_str() << ".\n";
 #endif
-			std::ifstream inFile;
-			inFile.open( fname[ i ].c_str() );
+			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
 			if( ! inFile.is_open() ) {
 				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
 				return( PANIC );
 			}
+			std::streamsize size = inFile.tellg();
+			inFile.seekg(0, std::ios::beg);
+			std::vector<char> buffer(size);
+			if (inFile.read(buffer.data(), size))	{
+				/* alles gut! */
+			}
+			else {
+				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			} 
+			inFile.close();
+			vectorwrapbuf<char> databuf(buffer);
+			std::istream isdata(&databuf);
 			std::string headder;
 			size_t n, m;
-			std::getline( inFile, headder ); // skip the first line
+			std::getline( isdata, headder ); // skip the first line
 			while( headder.at( 0 ) == '%' ) {
-				std::getline( inFile, headder );
+				std::getline( isdata, headder );
 			}
 			std::stringstream ss( headder );
-			ss >> n >> m;						
+			ss >> n >> m;
+			std::cout << " Reading from" << fname[ i ] << " dense matrix with dimensions: "
+					  << " n x m = " << n << " x " << m << "\n";
 			data[ i ].resize( n );
 			for ( size_t j = 0; j < n; j++ ) {
-				inFile >> data[ i ].v_data[ j ];
+				isdata >> data[ i ].v_data[ j ];
 			}
-			inFile.close();
 		}
 		return( rc );
 	}

From a8ea4ff7cdd3e0ec112c5ceb750d33ef9e96ea3e Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Mon, 11 Jul 2022 14:14:45 +0200
Subject: [PATCH 10/17] set() omp fix

---
 include/graphblas/reference/io.hpp |  3 +++
 tests/smoke/hpcg.cpp               | 23 ++++++++---------------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index dbd34c199..01bb250ce 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -334,6 +334,9 @@ namespace grb {
 
 	} // namespace internal
 
+	extern "C" void foo();
+
+
 	/**
 	 * Sets all elements of a vector to the given value.
 	 *
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index d5121d093..5f04a0234 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -256,8 +256,8 @@ public :
 				return( PANIC );
 			} 
 			inFile.close();
-			vectorwrapbuf<char> databuf(buffer);
-			std::istream isdata(&databuf);
+			vectorwrapbuf< char > databuf( buffer );
+			std::istream isdata( &databuf );
 			std::string headder;
 			isdata >> headder;
 			while( headder.at( 0 ) == '%' ) {
@@ -268,23 +268,16 @@ public :
 			ss >> n >> m >> nz ;
 			size_t k = 0;
 			for ( size_t j = 0; j < nz; j++ ) {
-				// 	data[ i ].i_data[ k ]=it.i();
-				// 	data[ i ].j_data[ k ]=it.j();
-				// 	data[ i ].v_data[ k ]=it.v();
-				size_t itmp,jtmp;
+				size_t itmp, jtmp;
 				double vtmp;
 				isdata >> itmp >> jtmp >> vtmp;
-				data[ i ].i_data[ k ]=itmp-1;
-				data[ i ].j_data[ k ]=jtmp-1;
-				data[ i ].v_data[ k ]=vtmp;				
-				// isdata >> data[ i ].i_data[ k ]
-				// 	   >> data[ i ].j_data[ k ]
-				// 	   >> data[ i ].v_data[ k ] ;
-				k++;
+				data[ i ].i_data[ k ] = itmp - 1;
+				data[ i ].j_data[ k ] = jtmp - 1;
+				data[ i ].v_data[ k ] = vtmp;
+				k+=1;
 			}
-			data[i].nz=k;
+			data[ i ].nz = k;
 		}
-
 		return ( rc );
 	}	
 

From b4c7dda3c6abad446c20365c10d06b88a39d06bb Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Mon, 11 Jul 2022 16:21:03 +0200
Subject: [PATCH 11/17] cleanup

---
 tests/smoke/hpcg.cpp | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 5f04a0234..e10a5264c 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -175,6 +175,7 @@ struct vec_data {
 };
 
 
+static bool matloaded = false;
 
 /**
  * @brief Container for the parameters for the HPCG simulation.
@@ -194,15 +195,12 @@ struct simulation_input {
 	bool no_preconditioning;
 };
 
-static bool matloaded=false;
-
-
 template<typename CharT, typename TraitsT = std::char_traits<CharT> >
 class vectorwrapbuf :
 	public std::basic_streambuf<CharT, TraitsT> {
 public:
     vectorwrapbuf(std::vector<CharT> &vec) {
-		this->setg(vec.data(), vec.data(), vec.data() + vec.size());
+		this->setg( vec.data(), vec.data(), vec.data() + vec.size() );
     }
 };
 std::istream& operator>>(std::istream& is, std::string& s){
@@ -248,8 +246,8 @@ public :
 			std::streamsize size = inFile.tellg();
 			inFile.seekg(0, std::ios::beg);
 			std::vector<char> buffer(size);
-			if (inFile.read(buffer.data(), size))	{
-				/* alles gut! */
+			if ( inFile.read(buffer.data(), size) )	{
+				/* all fine! */
 			}
 			else {
 				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
@@ -274,7 +272,7 @@ public :
 				data[ i ].i_data[ k ] = itmp - 1;
 				data[ i ].j_data[ k ] = jtmp - 1;
 				data[ i ].v_data[ k ] = vtmp;
-				k+=1;
+				k += 1;
 			}
 			data[ i ].nz = k;
 		}
@@ -304,8 +302,8 @@ public :
 				return( PANIC );
 			} 
 			inFile.close();
-			vectorwrapbuf<char> databuf(buffer);
-			std::istream isdata(&databuf);
+			vectorwrapbuf< char > databuf( buffer );
+			std::istream isdata( &databuf );
 			std::string headder;
 			size_t n, m;
 			std::getline( isdata, headder ); // skip the first line
@@ -431,7 +429,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.error_code = SUCCESS;
 	RC rc { SUCCESS };
 
-	if(!matloaded){
+	if( ! matloaded ) {
 		//preloaded_matrices inputData;
 		inputData.matAfiles = in.matAfiles;
 		inputData.matMfiles = in.matMfiles;
@@ -442,13 +440,12 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		if( rc != SUCCESS ) {
 			std::cerr << "Failure to read data" << std::endl;
 		}
-		matloaded=true;
+		matloaded = true ;
 	}
 
 	out.times.io = timer.time();
 	timer.reset();
 
-
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
 	rc = build_hpcg_system< double >( hpcg_state, in.max_coarsening_levels, inputData );

From a0fb57e8b4f53e59694ff5e8d735e2cf6c61ee46 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Mon, 11 Jul 2022 17:00:57 +0200
Subject: [PATCH 12/17] rename hpcg to amg restore hpcg from develop

---
 include/graphblas/algorithms/amg/amg.hpp      | 230 ++++++
 include/graphblas/algorithms/amg/amg_data.hpp | 192 +++++
 .../algorithms/amg/multigrid_v_cycle.hpp      | 317 ++++++++
 .../algorithms/amg/system_building_utils.hpp  | 179 +++++
 include/graphblas/algorithms/hpcg/hpcg.hpp    |  28 +-
 .../graphblas/algorithms/hpcg/hpcg_data.hpp   |   2 +-
 .../algorithms/hpcg/matrix_building_utils.hpp |   2 -
 .../algorithms/hpcg/multigrid_v_cycle.hpp     |  94 +--
 .../algorithms/hpcg/system_building_utils.hpp | 174 ++---
 tests/smoke/CMakeLists.txt                    |   4 +
 tests/smoke/amg.cpp                           | 710 ++++++++++++++++++
 tests/smoke/hpcg.cpp                          | 420 ++---------
 12 files changed, 1791 insertions(+), 561 deletions(-)
 create mode 100644 include/graphblas/algorithms/amg/amg.hpp
 create mode 100644 include/graphblas/algorithms/amg/amg_data.hpp
 create mode 100644 include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
 create mode 100644 include/graphblas/algorithms/amg/system_building_utils.hpp
 create mode 100644 tests/smoke/amg.cpp

diff --git a/include/graphblas/algorithms/amg/amg.hpp b/include/graphblas/algorithms/amg/amg.hpp
new file mode 100644
index 000000000..db2ac5fd6
--- /dev/null
+++ b/include/graphblas/algorithms/amg/amg.hpp
@@ -0,0 +1,230 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file amg.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief File with the main routine to run a full AMG simulation, comprising multi-grid runs
+ *        with Red-Black Gauss-Seidel smoothing.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_AMG
+#define _H_GRB_ALGORITHMS_AMG
+
+#include <graphblas.hpp>
+
+#include "amg_data.hpp"
+#include "multigrid_v_cycle.hpp"
+
+#include <utils/print_vec_mat.hpp>
+
+namespace grb {
+	namespace algorithms {
+
+
+
+		
+		/**
+		 * @brief High-Performance Conjugate Gradient algorithm implementation running entirely on GraphBLAS.
+		 *
+		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running the AMG algorithm.
+		 * AMG implementation (as the standard one) couples a standard CG algorithm with a V-cycle
+		 * multi-grid solver to initially refine the tentative solution. This refinement step depends on the
+		 * availability of coarsening information, which should be stored inside \p data; otherwise,
+		 * the refinement is not performed and only the CG algorithm is run. 
+		 *
+		 * This implementation assumes that the vectors and matrices inside \p data are all correctly initialized
+		 * and populated with the proper values; in particular
+		 * - amg_data#x with the initial tentative solution (iterative solutions are also stored here)
+		 * - amg_data#A with the system matrix
+		 * - amg_data#b with the right-hand side vector \f$ b \f$
+		 * - amg_data#A_diagonal with the diagonal values of the matrix
+		 * - amg_data#color_masks with the color masks for this level
+		 * - amg_data#coarser_level with the information for the coarser multi-grid run (if any)
+		 * The other vectors are assumed to be inizialized (via the usual grb::Vector#Vector(size_t) constructor)
+		 * but not necessarily populated with values, as they are internally populated when needed; hence,
+		 * any previous values are overwritten.
+		 *
+		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
+		 * the failure code.
+		 *
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam ResidualType type of the residual norm
+		 * @tparam NonzeroType type of matrix values
+		 * @tparam InputType type of values of the right-hand side vector b
+		 * @tparam Ring the ring of algebraic operators zero-values
+		 * @tparam Minus the minus operator for subtractions
+		 *
+		 * @param[in,out] data \ref amg_data object storing inputs, outputs and temporary vectors used for the computation,
+		 *                     as long as the information for the recursive multi-grid runs
+		 * @param[in] with_preconditioning whether to use pre-conditioning, i.e. to perform multi-grid runs
+		 * @param[in] presmoother_steps number of pre-smoother steps, for multi-grid runs
+		 * @param[in] postsmoother_steps nomber of post-smoother steps, for multi-grid runs
+		 * @param[in] max_iterations maximum number if iterations the simulation may run for; once reached,
+		 *                           the simulation stops even if the residual norm is above \p tolerance
+		 * @param[in] tolerance the tolerance over the residual norm, i.e. the value of the residual norm to stop
+		 *                      the simulation at
+		 * @param[out] iterations numbers of iterations performed
+		 * @param[out] norm_residual norm of the final residual
+		 * @param[in] ring the ring to perform the operations on
+		 * @param[in] minus the \f$ - \f$ operator for vector subtractions
+		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+		 *                          unsuccessful operation otherwise
+		 */
+		template< typename IOType,
+			typename ResidualType,
+			typename NonzeroType,
+			typename InputType,
+			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
+			class Minus = operators::subtract< IOType > >
+		grb::RC amg( amg_data< IOType, NonzeroType, InputType > &data,
+			bool with_preconditioning,
+			const size_t presmoother_steps,
+			const size_t postsmoother_steps,
+			const size_t max_iterations,
+			const ResidualType tolerance,
+			size_t &iterations,
+			ResidualType &norm_residual,
+			const Ring &ring = Ring(),
+			const Minus &minus = Minus()
+		) {
+			ResidualType alpha;
+
+			const grb::Matrix< NonzeroType > &A { data.A };
+			grb::Vector< IOType > &x { data.x };
+			const grb::Vector< InputType > &b { data.b };
+			grb::Vector< IOType > &r { data.r };  // residual vector
+			grb::Vector< IOType > &p { data.p };  // direction vector
+			grb::Vector< IOType > &Ap { data.u }; // temp vector
+			grb::Vector< IOType > &z { data.z };  // pre-conditioned residual vector
+			grb::RC ret { SUCCESS };
+
+			ret = ret ? ret : grb::set( Ap, 0 );
+			ret = ret ? ret : grb::set( r, 0 );
+			ret = ret ? ret : grb::set( p, 0 );
+
+			ret = ret ? ret : grb::set( p, x );
+			ret = ret ? ret : grb::mxv( Ap, A, x, ring ); // Ap = A * x
+			assert( ret == SUCCESS );
+
+			ret = ret ? ret : grb::eWiseApply( r, b, Ap, minus ); // r = b - Ap;
+			assert( ret == SUCCESS );
+
+			norm_residual = ring.template getZero< ResidualType >();
+			ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // norm_residual = r' * r;
+			assert( ret == SUCCESS );
+
+			// compute sqrt to avoid underflow
+			norm_residual = std::sqrt( norm_residual );
+
+			// initial norm of residual
+			const ResidualType norm_residual_initial { norm_residual };
+			ResidualType old_r_dot_z { 0.0 }, r_dot_z { 0.0 }, beta { 0.0 };
+			size_t iter { 0 };
+
+#ifdef AMG_PRINT_STEPS
+			print_norm( p, "start p" );
+			print_norm( Ap, "start Ap" );
+			print_norm( r, "start r" );
+#endif
+
+			do {
+#ifdef AMG_PRINT_STEPS
+				DBG_println( "========= iteration " << iter << " =========" );
+#endif
+				if( with_preconditioning ) {
+					//ret = ret ? ret : grb::set( z, r ); // z = r;
+					ret = ret ? ret : internal::multi_grid( data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
+					//ret = ret ? ret : grb::set( x, z );
+					assert( ret == SUCCESS );
+				} else {
+					ret = ret ? ret : grb::set( z, r ); // z = r;
+					assert( ret == SUCCESS );
+				}
+#ifdef AMG_PRINT_STEPS
+				print_norm( z, "initial z" );
+#endif
+
+				ResidualType pAp;
+
+				if( iter == 0 ) {
+					ret = ret ? ret : grb::set( p, z ); //  p = z;
+					assert( ret == SUCCESS );
+
+					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
+					assert( ret == SUCCESS );
+				} else {
+					old_r_dot_z = r_dot_z;
+
+					r_dot_z = ring.template getZero< ResidualType >();
+					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
+					assert( ret == SUCCESS );
+
+					beta = r_dot_z / old_r_dot_z;
+					ret = ret ? ret : grb::clear( Ap );                         // Ap  = 0;
+					ret = ret ? ret : grb::eWiseMulAdd( Ap, beta, p, z, ring ); // Ap += beta * p + z;
+					std::swap( Ap, p );                                         // p = Ap;
+					assert( ret == SUCCESS );
+				}
+#ifdef AMG_PRINT_STEPS
+				print_norm( p, "middle p" );
+#endif
+				ret = ret ? ret : grb::set( Ap, 0 );
+				ret = ret ? ret : grb::mxv( Ap, A, p, ring ); // Ap = A * p;
+
+				assert( ret == SUCCESS );
+#ifdef AMG_PRINT_STEPS
+				print_norm( Ap, "middle Ap" );
+#endif
+				pAp = static_cast< ResidualType >( 0.0 );
+				ret = ret ? ret : grb::dot( pAp, Ap, p, ring ); // pAp = p' * Ap
+				assert( ret == SUCCESS );
+
+				alpha = r_dot_z / pAp;
+
+				ret = ret ? ret : grb::eWiseMul( x, alpha, p, ring ); // x += alpha * p;
+				assert( ret == SUCCESS );
+#ifdef AMG_PRINT_STEPS
+				print_norm( x, "end x" );
+#endif
+				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, ring ); // r += - alpha * Ap;
+				assert( ret == SUCCESS );
+#ifdef AMG_PRINT_STEPS
+				print_norm( r, "end r" );
+#endif
+
+				norm_residual = static_cast< ResidualType >( 0.0 );
+				ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // residual = r' * r;
+				assert( ret == SUCCESS );
+
+				norm_residual = std::sqrt( norm_residual );
+
+#ifdef AMG_PRINT_STEPS				
+				std::cout << " ---> norm_residual=" << norm_residual << "\n";
+#endif
+				++iter;
+			} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
+
+			iterations = iter;
+			return ret;
+		}
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_AMG
diff --git a/include/graphblas/algorithms/amg/amg_data.hpp b/include/graphblas/algorithms/amg/amg_data.hpp
new file mode 100644
index 000000000..62e2310a9
--- /dev/null
+++ b/include/graphblas/algorithms/amg/amg_data.hpp
@@ -0,0 +1,192 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file amg_data.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Data structures to store AMG input/output data.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_AMG_DATA
+#define _H_GRB_ALGORITHMS_AMG_DATA
+
+#include <vector>
+#include <cstddef>
+
+#include <graphblas.hpp>
+
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * @brief basic data container for the AMG algorithm, storing \b only the
+		 * data in common between the full CG run and the V-cycle multi-grid solver.
+		 * Additional data are stored in inheriting data structures.
+		 *
+		 * @tparam IOType type of values of the vectors for intermediate results
+		 * @tparam NonzeroType type of the values stored inside the system matrix #A
+		 */
+		template< typename IOType, typename NonzeroType >
+		struct system_data {
+
+			const std::size_t system_size; ///< size of the system, i.e. side of the #A
+
+			grb::Matrix< NonzeroType > A;                   ///< system matrix
+			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
+			grb::Vector< IOType > z;                        ///< multi-grid solution
+			grb::Vector< IOType > r;                        ///< residual
+			grb::Vector< IOType > smoother_temp;            ///< for smoother's intermediate results
+			std::vector< grb::Vector< bool > > color_masks; ///< for color masks
+
+			/**
+			 * @brief Constructor building all the stored vectors and matrices.
+			 *
+			 * Stored vectors and matrices are constructed according to \p sys_size but \b not initialized
+			 * to any value internally, as initialization is up to users's code.
+			 *
+			 * @param[in] sys_size the size of the underlying physical system, i.e. the size of vectors and the number
+			 * of rows and columns of the #A matrix.
+			 */
+			system_data( std::size_t sys_size ) :
+				system_size( sys_size ), A( sys_size, sys_size ), A_diagonal( sys_size ), z( sys_size ), r( sys_size ),
+				// temp(sys_size),
+				smoother_temp( sys_size ) {}
+
+			// for safety, disable copy semantics
+			system_data( const system_data & o ) = delete;
+
+			system_data & operator=( const system_data & ) = delete;
+		};
+
+		/**
+		 * @brief Data container for all multi-grid inputs and outputs.
+		 *
+		 * @tparam IOType Type of values of the vectors for intermediate results
+		 * @tparam NonzeroType Type of the values stored inside the system matrix \p A
+		 *                     and the coarsening matrix #Ax_finer
+		 *
+		 * This data structure stores information for a full multi-grid V cycle, i.e.
+		 * - input and output vectors for solution, residual and temporary vectors
+		 * - coarsening information, in particular the #coarsening_matrix that
+		 *   coarsens a larger system of size #finer_size to the current system
+		 *   of size #system_size
+		 * - the next level of coarsening, pointed to by #coarser_level, possibly being \c nullptr
+		 *   if no further coarsening is desired; note that this information is automatically
+		 *   destructed on object destruction (if any)
+		 *
+		 * Vectors stored here refer to the \b coarsened system (with the exception of #Ax_finer),
+		 * thus having size #system_size; this also holds for the system matrix #A,
+		 * while #coarsening_matrix has size #system_size \f$ \times \f$ #finer_size.
+		 * Hence, the typical usage of this data structure is to coarsen \b external vectors, e.g. vectors
+		 * coming from another \code multi_grid_data<IOType, NonZeroType> \endcode object whose #system_size equals
+		 * \code this-> \endcode #fines_size, via \code this-> \endcode #coarsening_matrix and store the coarsened
+		 * vectors internally. Mimicing the recursive behavior of standard multi-grid simulations,
+		 * the information for a further coarsening is stored inside #coarser_level, so that the
+		 * hierarchy of coarsened levels is reflected inside this data structure.
+		 *
+		 * As for \ref system_data, internal vectors and matrices are initialized to the proper size,
+		 * but their values are \b not initialized.
+		 */
+		template< typename IOType, typename NonzeroType >
+		struct multi_grid_data : public system_data< IOType, NonzeroType > {
+
+			const std::size_t finer_size; ///< ssize of the finer system to coarse from;
+			///< typically \c finer_size \code == 8 * \endcode #system_size
+
+			grb::Vector< IOType > Ax_finer; ///< finer vector for intermediate computations, of size #finer_size
+
+			grb::Matrix< NonzeroType > coarsening_matrix; ///< matrix of size #system_size \f$ \times \f$ #finer_size
+			///< to coarsen an input vector of size #finer_size into a vector of size #system_size
+
+			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< pointer to next coarsening level, for recursive
+			                                                               ///< multi-grid V cycle implementations
+
+			/**
+			 * @brief Construct a new \c multi_grid_data_object by initializing internal data structures and setting
+			 *        #coarser_level to \c nullptr.
+			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
+			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
+			 */
+			multi_grid_data( std::size_t coarser_size, std::size_t _finer_size ) :
+				system_data< IOType, NonzeroType >( coarser_size ), finer_size( _finer_size ), Ax_finer( finer_size ), coarsening_matrix( coarser_size, finer_size ) {
+				coarser_level = nullptr;
+			}
+
+			/**
+			 * @brief Destroys the \c multi_grid_data_object object by destroying #coarser_level.
+			 */
+			virtual ~multi_grid_data() {
+				if( coarser_level != nullptr ) {
+					delete coarser_level;
+				}
+			}
+		};
+
+		/**
+		 * @brief Data stucture to store the data for a full AMG run: system vectors and matrix,
+		 * coarsening information and temporary vectors.
+		 *
+		 * This data structures contains all the needed vectors and matrices to solve a linear system
+		 * \f$ A x = b \f$. As for \ref system_data, internal elements are built and their sizes properly initialized
+		 * to #system_size, but internal values are \b not initialized, as they are left to user's logic.
+		 * Similarly, the coarsening information in #coarser_level is to be initialized by users by properly
+		 * building a \code multi_grid_data<IOType, NonzeroType> \endcode object and storing its pointer into
+		 * #coarser_level; on destruction, #coarser_level will also be properly destroyed without
+		 * user's intervention.
+		 *
+		 * @tparam IOType type of values of the vectors for intermediate results
+		 * @tparam NonzeroType type of the values stored inside the system matrix #A
+		 * @tparam InputType type of the values of the right-hand side vector #b
+		 */
+		template< typename IOType, typename NonzeroType, typename InputType >
+		struct amg_data : public system_data< IOType, NonzeroType > {
+
+			grb::Vector< InputType > b; ///< right-side vector of known values
+			grb::Vector< IOType > u;    ///< temporary vectors (typically for CG exploration directions)
+			grb::Vector< IOType > p;    ///< temporary vector (typically for x refinements coming from the multi-grid run)
+			grb::Vector< IOType > x;    // system solution being refined over the iterations: it us up to the user
+			///< to set the initial solution value
+
+			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< information about the coarser system, for
+			                                                               ///< the multi-grid run
+
+			/**
+			 * @brief Construct a new \c amg_data object by building vectors and matrices and by setting
+			 * #coarser_level to \c nullptr (i.e. no coarser level is assumed).
+			 *
+			 * @param[in] sys_size the size of the simulated system, i.e. of all the internal vectors and matrices
+			 */
+			amg_data( std::size_t sys_size ) : system_data< IOType, NonzeroType >( sys_size ), b( sys_size ), u( sys_size ), p( sys_size ), x( sys_size ) {
+				coarser_level = nullptr;
+			}
+
+			/**
+			 * @brief Destroy the \c amg_data object by destroying the #coarser_level informartion, if any.
+			 */
+			virtual ~amg_data() {
+				if( coarser_level != nullptr ) {
+					delete coarser_level;
+				}
+			}
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_AMG_DATA
diff --git a/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
new file mode 100644
index 000000000..5d8a82c59
--- /dev/null
+++ b/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
@@ -0,0 +1,317 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file multigrid_v_cycle.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief This file contains the routines for multi-grid solution refinement, including the main routine
+ *        and those for coarsening and refinement of the tentative solution.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+#define _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+
+#include <cassert>
+#include <vector>
+
+#include <graphblas.hpp>
+
+#include "amg_data.hpp"
+
+#include <utils/print_vec_mat.hpp>
+
+
+
+#define DBG_println( args ) std::cout << args << std::endl;
+
+
+namespace grb {
+	namespace algorithms {
+
+		
+		/**
+		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
+		 */
+		namespace internal {
+
+			
+			/**
+			 * @brief computes the coarser residual vector \p coarsening_data.r by coarsening
+			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
+			 *
+			 * The coarsening information are stored inside \p coarsening_data.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 * @tparam Minus the minus operator for subtractions
+			 *
+			 * @param[in] r_fine fine residual vector
+			 * @param[in,out] coarsening_data \ref multi_grid_data data structure storing the information for coarsening
+			 * @param[in] ring the ring to perform the operations on
+			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 *                          unsuccessful operation otherwise
+			 */
+			template< typename IOType,
+				typename NonzeroType,
+				class Ring,
+				class Minus >
+			grb::RC compute_coarsening( const grb::Vector< IOType > & r_fine, // fine residual
+				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
+				const Ring & ring,
+				const Minus & minus ) {
+				RC ret { SUCCESS };
+				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine, coarsening_data.Ax_finer,
+									  minus ); // Ax_finer = r_fine - Ax_finer
+				assert( ret == SUCCESS );
+
+				// actual coarsening, from  ncols(*coarsening_data->A) == *coarsening_data->system_size * 8
+				// to *coarsening_data->system_size
+				ret = ret ? ret : grb::set( coarsening_data.r, 0 );
+				ret = ret ? ret : grb::mxv( coarsening_data.r, coarsening_data.coarsening_matrix, coarsening_data.Ax_finer,
+									  ring ); // r = coarsening_matrix * Ax_finer
+				return ret;
+			}
+
+			/**
+			 * @brief computes the prolongation of the coarser solution \p coarsening_data.z and stores it into
+			 * \p x_fine.
+			 *
+			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 *
+			 * @param[out] x_fine the solution vector to store the prolonged solution into
+			 * @param[in,out] coarsening_data information for coarsening
+			 * @param[in] ring the ring to perform the operations on
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 * unsuccessful operation otherwise
+			 */
+			template< typename IOType,
+				typename NonzeroType,
+				class Ring >
+			grb::RC compute_prolongation( grb::Vector< IOType > & x_fine, // fine residual
+				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
+				const Ring & ring ) {
+				RC ret { SUCCESS };
+				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
+				// to nrows(x_fine)
+				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
+
+				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix >( coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, coarsening_data.z, ring );
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
+				assert( ret == SUCCESS );
+				return ret;
+			}
+
+			template< typename IOType, typename NonzeroType, class Ring >
+			grb::RC spai0_smoother( system_data< IOType, NonzeroType > & data, const Ring & ring ) {
+				RC ret { SUCCESS };
+
+				NonzeroType alpha = 1.;
+				ret = ret ? ret : grb::eWiseMulAdd( data.z, alpha, data.A_diagonal, data.r, ring );
+				assert( ret == SUCCESS );
+					
+				return ret;
+			}
+			
+
+			/**
+			 * @brief Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother, with inputs and outputs stored
+			 * inside \p data.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 *
+			 * @param[in,out] data \ref system_data data structure with relevant inpus and outputs: system matrix, initial solution,
+			 *                     residual, system matrix colors, temporary vectors
+			 * @param[in] smoother_steps how many smoothing steps to run
+			 * @param[in] ring the ring to perform the operations on
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 *                          unsuccessful operation otherwise
+			 */
+			template< typename IOType, typename NonzeroType, class Ring, class Minus >
+			grb::RC run_spai0_smoother( system_data< IOType, NonzeroType > & data,
+								  const std::size_t smoother_steps, const Ring & ring, const Minus & minus ) {
+				RC ret { SUCCESS };
+
+				for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
+					ret = ret ? ret : grb::set( data.smoother_temp, 0 );
+					ret = ret ? ret : grb::mxv( data.smoother_temp, data.A, data.z, ring );
+					ret = ret ? ret : grb::eWiseApply( data.smoother_temp, data.r, data.smoother_temp,
+													   minus ); 
+
+#ifdef HPCG_PRINT_STEPS
+					std::cout << " data.A(spai0): " << nrows(data.A) << " x " << ncols(data.A) << " \n";
+					print_norm( data.A_diagonal, " data.A_diagonal" );
+					print_norm( data.smoother_temp, " data.smoother_temp" );
+					print_norm( data.z, " data.z" );
+#endif
+
+					ret = ret ? ret :
+						grb::eWiseLambda(
+										 [ &data ]( const size_t i ) {
+#ifdef HPCG_PRINT_STEPS
+											 if( i < 10 || i + 10 > size( data.A_diagonal ) ){
+												 std::cout << " i= " << i
+														   << " data.z[i]= " << data.z[i]
+														   << " data.A_diagonal[i]= " << data.A_diagonal[i]
+														   << " data.smoother_temp[i]= " << data.smoother_temp[i]
+														   << "\n";
+											 }
+											 std::cout << "\n";
+#endif
+											 data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
+										 },
+										 data.A_diagonal, data.z, data.smoother_temp );
+
+					assert( ret == SUCCESS );
+				}
+
+				return ret;
+			}
+
+			/**
+			 * @brief Multi-grid V cycle implementation to refine a given solution.
+			 *
+			 * A full multi-grid run goes through the following steps:
+			 * -# if \p presmoother_steps \f$ > 0 \f$, \p presmoother_steps of the Red-Black Gauss-Seidel smoother are run
+			 *    to improve on the initial solution stored into \p data.z
+			 * -# the coarsening of \f$ r - A*z \f$ is computed to find the coarser residual vector
+			 * -# a multi-grid run is recursively performed on the coarser system
+			 * -# the tentative solution from the coarser multi-grid run is prolonged and added to the current tentative solution
+			 *    into \p data.z
+			 * -# this solution is further smoothed for \p postsmoother_steps steps
+			 *
+			 * If coarsening information is not available, the multi-grid run consists in a single smmothing run.
+			 *
+			 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
+			 * the failure code.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 * @tparam Minus the minus operator for subtractions
+			 *
+			 * @param[in,out] data \ref multi_grid_data object storing the relevant data for the multi-grid run of the current
+			 *                     clevel
+			 * @param[in,out] coarsening_data pointer to information for the coarsening/refinement operations and for the
+			 *                recursive multi-grid run on the coarsened system; if \c nullptr, no coarsening/refinement occurs
+			 *                and only smoothing occurs on the current solution
+			 * @param[in] presmoother_steps number of pre-smoother steps
+			 * @param[in] postsmoother_steps number of post-smoother steps
+			 * @param[in] ring the ring to perform the operations on
+			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 *                          unsuccessful operation otherwise
+			 */
+			template< typename IOType, typename NonzeroType, class Ring, class Minus >
+			grb::RC multi_grid( system_data< IOType, NonzeroType > & data,
+				struct multi_grid_data< IOType, NonzeroType > * const coarsening_data,
+				const size_t presmoother_steps,
+				const size_t postsmoother_steps,
+				const Ring & ring,
+				const Minus & minus ) {
+				RC ret { SUCCESS };
+
+#ifdef HPCG_PRINT_STEPS
+				DBG_println( "mg BEGINNING {" );
+#endif
+
+				// clean destination vector
+				//ret = ret ? ret : grb::set( data.z, data.r );
+				ret = ret ? ret : grb::set( data.z, 0 );
+				
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "first print smoothed z" );
+				print_norm( data.r, "initial r" );
+#endif
+				if( coarsening_data == nullptr ) {
+					//compute one round of smoother
+					ret = ret ? ret : run_spai0_smoother( data, 1, ring, minus );
+					assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+					print_norm( data.z, "smoothed z" );
+					DBG_println( "} mg END" );
+#endif
+					return ret;
+				}
+
+				struct multi_grid_data< IOType, NonzeroType > & cd {
+					*coarsening_data
+				};
+
+				// pre-smoother
+				ret = ret ? ret : run_spai0_smoother( data, presmoother_steps, ring, minus );
+				assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "pre-smoothed z" );
+#endif
+
+				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
+				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
+				assert( ret == SUCCESS );
+
+				
+#ifdef HPCG_PRINT_STEPS
+				std::cout << "  data.A: " << nrows(data.A) << " x " << ncols(data.A) << " \n";
+				print_norm( cd.r, "before coarse cd.r" );
+#endif				
+
+				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
+				assert( ret == SUCCESS );
+
+				
+#ifdef HPCG_PRINT_STEPS
+				print_norm( cd.z, "after cd.coarse z" );				
+				print_norm( cd.r, "after cd.coarse r" );
+#endif
+
+				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );			
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : compute_prolongation( data.z, cd, ring );
+				assert( ret == SUCCESS );
+				
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "prolonged z" );
+#endif
+
+				// post-smoother
+				ret = ret ? ret : run_spai0_smoother( data, postsmoother_steps, ring, minus );
+				assert( ret == SUCCESS );
+				
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "post-smoothed z" );
+				DBG_println( "} mg END" );
+#endif
+
+				return ret;
+			}
+
+		} // namespace internal
+	}     // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
diff --git a/include/graphblas/algorithms/amg/system_building_utils.hpp b/include/graphblas/algorithms/amg/system_building_utils.hpp
new file mode 100644
index 000000000..30a98e78b
--- /dev/null
+++ b/include/graphblas/algorithms/amg/system_building_utils.hpp
@@ -0,0 +1,179 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file amg_system_building_utils.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Utilities to build an antire system for AMG simulations in an arbitrary number of dimensions.
+ * @date 2021-04-30
+ */
+
+#ifndef _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+
+#include <graphblas.hpp>
+#include <graphblas/utils/parser.hpp>
+
+#include "amg_data.hpp"
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * @brief Generates an entire AMG problem according to the parameters in \p params , storing it in \p holder .
+		 *
+		 * @tparam DIMS dimensions of the system
+		 * @tparam T type of matrix values
+		 * @param holder std::unique_ptr to store the AMG problem into
+		 * @param params parameters container to build the AMG problem
+		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors and matrices) succeeded,
+		 * otherwise the first unsuccessful return value
+		 */
+		template< typename T = double, typename SYSINP >
+		grb::RC build_amg_system( std::unique_ptr< grb::algorithms::amg_data< T, T, T > > & holder, const std::size_t max_levels, SYSINP &in ) {
+			grb::RC rc { grb::SUCCESS };
+			std::size_t coarsening_level = 0UL;
+			const size_t n_A = in.matAbuffer[ coarsening_level ].get_n();
+			grb::algorithms::amg_data< T, T, T > * data { new grb::algorithms::amg_data< T, T, T >( n_A ) };
+			rc = buildMatrixUnique( data->A,
+									in.matAbuffer[ coarsening_level ].i_data,
+									in.matAbuffer[ coarsening_level ].j_data,
+									in.matAbuffer[ coarsening_level ].v_data,
+									in.matAbuffer[ coarsening_level ].size(),
+									SEQUENTIAL
+									);
+
+			/* Once internal issue #342 is resolved this can be re-enabled
+			   const RC rc = buildMatrixUnique( L,
+			   parser.begin( PARALLEL ), parser.end( PARALLEL),
+			   PARALLEL
+			   );*/
+			if( rc != SUCCESS ) {
+				std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+						  << "(" << toString( rc ) << ")." << std::endl;
+				return rc;
+			}
+#ifdef AMG_PRINT_STEPS
+			std::cout << " buildMatrixUnique: constructed data->A " << nrows(data->A) << " x " << ncols(data->A) << " matrix \n";
+#endif
+			assert( ! holder ); // should be empty
+			holder = std::unique_ptr< grb::algorithms::amg_data< T, T, T > >( data );
+
+			{
+				RC rc = grb::buildVector( data->A_diagonal,
+										  in.matMbuffer[coarsening_level].v_data,
+										  in.matMbuffer[coarsening_level].v_data + in.matMbuffer[coarsening_level].size(),
+										  SEQUENTIAL );
+				if ( rc != SUCCESS ) {
+					std::cerr << " buildVector failed!\n ";
+					return rc;
+				}
+#ifdef AMG_PRINT_STEPS
+				std::cout << " buildVector: data->A_diagonal "
+						  << size(data->A_diagonal) << " vector \n";
+#endif
+			}
+
+			std::size_t coarser_size;
+			std::size_t previous_size=n_A;
+			
+			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
+			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
+			assert( *coarser == nullptr );
+			
+			// generate linked list of hierarchical coarseners
+			while( coarsening_level  < max_levels ) {
+				assert( *coarser == nullptr );
+				
+				coarser_size = in.matAbuffer[ coarsening_level + 1 ].get_n();
+
+				// build data structures for new level
+				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
+				
+				// install coarser level immediately to cleanup in case of build error
+				*coarser = new_coarser;
+
+				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
+				{
+					rc = buildMatrixUnique( new_coarser->coarsening_matrix,
+											in.matRbuffer[ coarsening_level ].i_data,
+											in.matRbuffer[ coarsening_level ].j_data,
+											in.matRbuffer[ coarsening_level ].v_data,
+											in.matRbuffer[ coarsening_level ].size(),
+											SEQUENTIAL
+											);
+
+					if( rc != SUCCESS ) {
+						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+								  << "(" << toString( rc ) << ")." << std::endl;
+						return rc;
+					}
+#ifdef AMG_PRINT_STEPS
+					std::cout << " buildMatrixUnique: constructed new_coarser->coarsening_matrix "
+							  << nrows(new_coarser->coarsening_matrix) << " x " << ncols(new_coarser->coarsening_matrix) << " matrix \n";
+#endif
+				}
+				{
+					rc = buildMatrixUnique( new_coarser->A,
+											in.matAbuffer[ coarsening_level + 1 ].i_data,
+											in.matAbuffer[ coarsening_level + 1 ].j_data,
+											in.matAbuffer[ coarsening_level + 1 ].v_data,
+											in.matAbuffer[ coarsening_level + 1 ].size(),
+											SEQUENTIAL
+											);
+
+					if( rc != SUCCESS ) {
+						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+								  << "(" << toString( rc ) << ")." << std::endl;
+						return rc;
+					}
+#ifdef AMG_PRINT_STEPS
+					std::cout << " buildMatrixUnique: constructed new_coarser->A "
+							  << nrows(new_coarser->A) << " x " << ncols(new_coarser->A) << " matrix \n";
+#endif
+				}
+				
+				RC rc = grb::buildVector( new_coarser->A_diagonal,
+										  in.matMbuffer[ coarsening_level + 1 ].v_data,
+										  in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
+										  SEQUENTIAL );
+				if ( rc != SUCCESS ) {
+					std::cerr << " buildVector failed!\n ";
+					return rc;
+				}
+#ifdef AMG_PRINT_STEPS
+				std::cout << " buildVector: new_coarser->A_diagonal "
+						  << size(new_coarser->A_diagonal) << " vector \n";
+#endif
+
+				// prepare for new iteration
+				coarser = &new_coarser->coarser_level;
+				previous_size = coarser_size;
+				coarsening_level++;
+			}
+			return rc;
+		}
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
index 01c00a50d..6caf22a1c 100644
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg.hpp
@@ -31,14 +31,10 @@
 #include "hpcg_data.hpp"
 #include "multigrid_v_cycle.hpp"
 
-#include <utils/print_vec_mat.hpp>
 
 namespace grb {
 	namespace algorithms {
 
-
-
-		
 		/**
 		 * @brief High-Performance Conjugate Gradient algorithm implementation running entirely on GraphBLAS.
 		 *
@@ -144,9 +140,9 @@ namespace grb {
 			size_t iter { 0 };
 
 #ifdef HPCG_PRINT_STEPS
-			print_norm( p, "start p" );
-			print_norm( Ap, "start Ap" );
-			print_norm( r, "start r" );
+			DBG_print_norm( p, "start p" );
+			DBG_print_norm( Ap, "start Ap" );
+			DBG_print_norm( r, "start r" );
 #endif
 
 			do {
@@ -154,16 +150,14 @@ namespace grb {
 				DBG_println( "========= iteration " << iter << " =========" );
 #endif
 				if( with_preconditioning ) {
-					//ret = ret ? ret : grb::set( z, r ); // z = r;
 					ret = ret ? ret : internal::multi_grid( data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
-					//ret = ret ? ret : grb::set( x, z );
 					assert( ret == SUCCESS );
 				} else {
 					ret = ret ? ret : grb::set( z, r ); // z = r;
 					assert( ret == SUCCESS );
 				}
 #ifdef HPCG_PRINT_STEPS
-				print_norm( z, "initial z" );
+				DBG_print_norm( z, "initial z" );
 #endif
 
 				ResidualType pAp;
@@ -188,14 +182,14 @@ namespace grb {
 					assert( ret == SUCCESS );
 				}
 #ifdef HPCG_PRINT_STEPS
-				print_norm( p, "middle p" );
+				DBG_print_norm( p, "middle p" );
 #endif
+
 				ret = ret ? ret : grb::set( Ap, 0 );
 				ret = ret ? ret : grb::mxv( Ap, A, p, ring ); // Ap = A * p;
-
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				print_norm( Ap, "middle Ap" );
+				DBG_print_norm( Ap, "middle Ap" );
 #endif
 				pAp = static_cast< ResidualType >( 0.0 );
 				ret = ret ? ret : grb::dot( pAp, Ap, p, ring ); // pAp = p' * Ap
@@ -206,12 +200,13 @@ namespace grb {
 				ret = ret ? ret : grb::eWiseMul( x, alpha, p, ring ); // x += alpha * p;
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				print_norm( x, "end x" );
+				DBG_print_norm( x, "end x" );
 #endif
+
 				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, ring ); // r += - alpha * Ap;
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				print_norm( r, "end r" );
+				DBG_print_norm( r, "end r" );
 #endif
 
 				norm_residual = static_cast< ResidualType >( 0.0 );
@@ -220,9 +215,6 @@ namespace grb {
 
 				norm_residual = std::sqrt( norm_residual );
 
-#ifdef HPCG_PRINT_STEPS				
-				std::cout << " ---> norm_residual=" << norm_residual << "\n";
-#endif
 				++iter;
 			} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
 
diff --git a/include/graphblas/algorithms/hpcg/hpcg_data.hpp b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
index d2df78e9d..96b39856d 100644
--- a/include/graphblas/algorithms/hpcg/hpcg_data.hpp
+++ b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
@@ -38,7 +38,7 @@ namespace grb {
 		/**
 		 * @brief basic data container for the HPCG algorithm, storing \b only the
 		 * data in common between the full CG run and the V-cycle multi-grid solver.
-		 * Additional data are stored in inheriting data structures.
+		 * Additional data are stored in inheriting daata structures.
 		 *
 		 * @tparam IOType type of values of the vectors for intermediate results
 		 * @tparam NonzeroType type of the values stored inside the system matrix #A
diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
index a291ad9e5..1facabe49 100644
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
@@ -69,7 +69,6 @@ namespace grb {
 			}
 			grb::algorithms::matrix_generator_iterator< DIMS, T > begin( sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
 			grb::algorithms::matrix_generator_iterator< DIMS, T > end( sys_sizes, n, halo_size, diag_value, non_diag_value );
-
 			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
 		}
 
@@ -116,7 +115,6 @@ namespace grb {
 
 			grb::algorithms::coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, 0 );
 			grb::algorithms::coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, rows );
-			
 			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
 		}
 
diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
index a006726cd..f40296f91 100644
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
@@ -34,23 +34,14 @@
 #include "hpcg_data.hpp"
 #include "red_black_gauss_seidel.hpp"
 
-#include <utils/print_vec_mat.hpp>
-
-
-
-#define DBG_println( args ) std::cout << args << std::endl;
-
 
 namespace grb {
 	namespace algorithms {
-
-		
 		/**
 		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
 		 */
 		namespace internal {
 
-			
 			/**
 			 * @brief computes the coarser residual vector \p coarsening_data.r by coarsening
 			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
@@ -125,18 +116,6 @@ namespace grb {
 				return ret;
 			}
 
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC spai0_smoother( system_data< IOType, NonzeroType > & data, const Ring & ring ) {
-				RC ret { SUCCESS };
-
-				NonzeroType alpha = 1.;
-				ret = ret ? ret : grb::eWiseMulAdd( data.z, alpha, data.A_diagonal, data.r, ring );
-				assert( ret == SUCCESS );
-					
-				return ret;
-			}
-			
-
 			/**
 			 * @brief Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother, with inputs and outputs stored
 			 * inside \p data.
@@ -152,44 +131,14 @@ namespace grb {
 			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
 			 *                          unsuccessful operation otherwise
 			 */
-			template< typename IOType, typename NonzeroType, class Ring, class Minus >
-			grb::RC run_spai0_smoother( system_data< IOType, NonzeroType > & data,
-								  const std::size_t smoother_steps, const Ring & ring, const Minus & minus ) {
+			template< typename IOType, typename NonzeroType, class Ring >
+			grb::RC run_smoother( system_data< IOType, NonzeroType > & data, const std::size_t smoother_steps, const Ring & ring ) {
 				RC ret { SUCCESS };
 
 				for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
-					ret = ret ? ret : grb::set( data.smoother_temp, 0 );
-					ret = ret ? ret : grb::mxv( data.smoother_temp, data.A, data.z, ring );
-					ret = ret ? ret : grb::eWiseApply( data.smoother_temp, data.r, data.smoother_temp,
-													   minus ); 
-
-#ifdef HPCG_PRINT_STEPS
-					std::cout << " data.A(spai0): " << nrows(data.A) << " x " << ncols(data.A) << " \n";
-					print_norm( data.A_diagonal, " data.A_diagonal" );
-					print_norm( data.smoother_temp, " data.smoother_temp" );
-					print_norm( data.z, " data.z" );
-#endif
-
-					ret = ret ? ret :
-						grb::eWiseLambda(
-										 [ &data ]( const size_t i ) {
-#ifdef HPCG_PRINT_STEPS
-											 if( i < 10 || i + 10 > size( data.A_diagonal ) ){
-												 std::cout << " i= " << i
-														   << " data.z[i]= " << data.z[i]
-														   << " data.A_diagonal[i]= " << data.A_diagonal[i]
-														   << " data.smoother_temp[i]= " << data.smoother_temp[i]
-														   << "\n";
-											 }
-											 std::cout << "\n";
-#endif
-											 data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
-										 },
-										 data.A_diagonal, data.z, data.smoother_temp );
-
+					ret = ret ? ret : red_black_gauss_seidel( data, ring );
 					assert( ret == SUCCESS );
 				}
-
 				return ret;
 			}
 
@@ -235,25 +184,21 @@ namespace grb {
 				const Ring & ring,
 				const Minus & minus ) {
 				RC ret { SUCCESS };
-
 #ifdef HPCG_PRINT_STEPS
 				DBG_println( "mg BEGINNING {" );
 #endif
 
 				// clean destination vector
-				//ret = ret ? ret : grb::set( data.z, data.r );
 				ret = ret ? ret : grb::set( data.z, 0 );
-				
 #ifdef HPCG_PRINT_STEPS
-				print_norm( data.z, "first print smoothed z" );
-				print_norm( data.r, "initial r" );
+				DBG_print_norm( data.r, "initial r" );
 #endif
 				if( coarsening_data == nullptr ) {
-					//compute one round of smoother
-					ret = ret ? ret : run_spai0_smoother( data, 1, ring, minus );
+					// compute one round of Gauss Seidel and return
+					ret = ret ? ret : run_smoother( data, 1, ring );
 					assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-					print_norm( data.z, "smoothed z" );
+					DBG_print_norm( data.z, "smoothed z" );
 					DBG_println( "} mg END" );
 #endif
 					return ret;
@@ -264,47 +209,36 @@ namespace grb {
 				};
 
 				// pre-smoother
-				ret = ret ? ret : run_spai0_smoother( data, presmoother_steps, ring, minus );
+				ret = ret ? ret : run_smoother( data, presmoother_steps, ring );
 				assert( ret == SUCCESS );
 #ifdef HPCG_PRINT_STEPS
-				print_norm( data.z, "pre-smoothed z" );
+				DBG_print_norm( data.z, "pre-smoothed z" );
 #endif
 
 				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
 				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
 				assert( ret == SUCCESS );
 
-				
-#ifdef HPCG_PRINT_STEPS
-				std::cout << "  data.A: " << nrows(data.A) << " x " << ncols(data.A) << " \n";
-				print_norm( cd.r, "before coarse cd.r" );
-#endif				
-
 				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
 				assert( ret == SUCCESS );
-
-				
 #ifdef HPCG_PRINT_STEPS
-				print_norm( cd.z, "after cd.coarse z" );				
-				print_norm( cd.r, "after cd.coarse r" );
+				DBG_print_norm( cd.r, "coarse r" );
 #endif
 
-				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );			
+				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
 				assert( ret == SUCCESS );
 
 				ret = ret ? ret : compute_prolongation( data.z, cd, ring );
 				assert( ret == SUCCESS );
-				
 #ifdef HPCG_PRINT_STEPS
-				print_norm( data.z, "prolonged z" );
+				DBG_print_norm( data.z, "prolonged z" );
 #endif
 
 				// post-smoother
-				ret = ret ? ret : run_spai0_smoother( data, postsmoother_steps, ring, minus );
+				ret = ret ? ret : run_smoother( data, postsmoother_steps, ring );
 				assert( ret == SUCCESS );
-				
 #ifdef HPCG_PRINT_STEPS
-				print_norm( data.z, "post-smoothed z" );
+				DBG_print_norm( data.z, "post-smoothed z" );
 				DBG_println( "} mg END" );
 #endif
 
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 93e9b4dba..11adf82c1 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -31,7 +31,6 @@
 #include <memory>
 
 #include <graphblas.hpp>
-#include <graphblas/utils/parser.hpp>
 
 #include "hpcg_data.hpp"
 #include "matrix_building_utils.hpp"
@@ -40,6 +39,37 @@
 namespace grb {
 	namespace algorithms {
 
+		/**
+		 * @brief Divide each value of \p source by \p step and store the result into \p destination.
+		 *
+		 * @tparam DIMS size of passed arrays
+		 */
+		template< std::size_t DIMS >
+		void divide_array( std::array< std::size_t, DIMS > & destination, const std::array< std::size_t, DIMS > & source, std::size_t step ) {
+			for( std::size_t i { 0 }; i < destination.size(); i++ ) {
+				destination[ i ] = source[ i ] / step;
+			}
+		}
+
+		/**
+		 * @brief Container of the parameter for HPCG simulation generation: physical system characteristics and
+		 * coarsening information.
+		 *
+		 * @tparam DIMS dimensions of the physical system
+		 * @tparam T type of matrix values
+		 */
+		template< std::size_t DIMS, typename T >
+		struct hpcg_system_params {
+			const std::array< std::size_t, DIMS > & physical_sys_sizes;
+			const std::size_t halo_size;
+			const std::size_t num_colors;
+			const T diag_value;
+			const T non_diag_value;
+			const std::size_t min_phys_size;
+			const std::size_t max_levels;
+			const std::size_t coarsening_step;
+		};
+
 		/**
 		 * @brief Generates an entire HPCG problem according to the parameters in \p params , storing it in \p holder .
 		 *
@@ -50,126 +80,70 @@ namespace grb {
 		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors and matrices) succeeded,
 		 * otherwise the first unsuccessful return value
 		 */
-		template< typename T = double, typename SYSINP >
-		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, const std::size_t max_levels, SYSINP &in ) {
-			grb::RC rc { grb::SUCCESS };
-			std::size_t coarsening_level = 0UL;
-			const size_t n_A = in.matAbuffer[ coarsening_level ].get_n();
-			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n_A ) };
-			rc = buildMatrixUnique( data->A,
-									in.matAbuffer[ coarsening_level ].i_data,
-									in.matAbuffer[ coarsening_level ].j_data,
-									in.matAbuffer[ coarsening_level ].v_data,
-									in.matAbuffer[ coarsening_level ].size(),
-									SEQUENTIAL
-									);
-
-			/* Once internal issue #342 is resolved this can be re-enabled
-			   const RC rc = buildMatrixUnique( L,
-			   parser.begin( PARALLEL ), parser.end( PARALLEL),
-			   PARALLEL
-			   );*/
-			if( rc != SUCCESS ) {
-				std::cerr << "Failure: call to buildMatrixUnique did not succeed "
-						  << "(" << toString( rc ) << ")." << std::endl;
-				return rc;
-			}
-#ifdef HPCG_PRINT_STEPS
-			std::cout << " buildMatrixUnique: constructed data->A " << nrows(data->A) << " x " << ncols(data->A) << " matrix \n";
-#endif
+		template< std::size_t DIMS, typename T = double >
+		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, hpcg_system_params< DIMS, T > & params ) {
+			// n is the system matrix size
+			const std::size_t n { std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
+
+			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n ) };
+
 			assert( ! holder ); // should be empty
 			holder = std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > >( data );
 
-			{
-				RC rc = grb::buildVector( data->A_diagonal,
-										  in.matMbuffer[coarsening_level].v_data,
-										  in.matMbuffer[coarsening_level].v_data + in.matMbuffer[coarsening_level].size(),
-										  SEQUENTIAL );
-				if ( rc != SUCCESS ) {
-					std::cerr << " buildVector failed!\n ";
-					return rc;
-				}
-#ifdef HPCG_PRINT_STEPS
-				std::cout << " buildVector: data->A_diagonal "
-						  << size(data->A_diagonal) << " vector \n";
-#endif
+			// initialize the main (=uncoarsened) system matrix
+			grb::RC rc { grb::SUCCESS };
+			rc = build_ndims_system_matrix< DIMS, T >( data->A, params.physical_sys_sizes, params.halo_size, params.diag_value, params.non_diag_value );
+
+			if( rc != grb::SUCCESS ) {
+				std::cerr << "Failure to generate the initial system (" << toString( rc ) << ") of size " << n << std::endl;
+				return rc;
 			}
 
-			std::size_t coarser_size;
-			std::size_t previous_size=n_A;
-			
+			// set values of diagonal vector
+			set( data->A_diagonal, params.diag_value );
+
+			build_static_color_masks( data->color_masks, n, params.num_colors );
+
 			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
 			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
 			assert( *coarser == nullptr );
-			
+			std::array< std::size_t, DIMS > coarser_sizes;
+			std::array< std::size_t, DIMS > previous_sizes( params.physical_sys_sizes );
+			std::size_t min_physical_coarsened_size { *std::min_element( previous_sizes.cbegin(), previous_sizes.cend() ) / params.coarsening_step };
+			// coarsen system sizes into coarser_sizes
+			divide_array( coarser_sizes, previous_sizes, params.coarsening_step );
+			std::size_t coarsening_level = 0UL;
+
 			// generate linked list of hierarchical coarseners
-			while( coarsening_level  < max_levels ) {
+			while( min_physical_coarsened_size >= params.min_phys_size && coarsening_level < params.max_levels ) {
 				assert( *coarser == nullptr );
-				
-				coarser_size = in.matAbuffer[ coarsening_level + 1 ].get_n();
-
+				// compute size of finer and coarser matrices
+				std::size_t coarser_size { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
+				std::size_t previous_size { std::accumulate( previous_sizes.cbegin(), previous_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
 				// build data structures for new level
 				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
-				
 				// install coarser level immediately to cleanup in case of build error
 				*coarser = new_coarser;
-
 				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
-				{
-					rc = buildMatrixUnique( new_coarser->coarsening_matrix,
-											in.matRbuffer[ coarsening_level ].i_data,
-											in.matRbuffer[ coarsening_level ].j_data,
-											in.matRbuffer[ coarsening_level ].v_data,
-											in.matRbuffer[ coarsening_level ].size(),
-											SEQUENTIAL
-											);
-
-					if( rc != SUCCESS ) {
-						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
-								  << "(" << toString( rc ) << ")." << std::endl;
-						return rc;
-					}
-#ifdef HPCG_PRINT_STEPS
-					std::cout << " buildMatrixUnique: constructed new_coarser->coarsening_matrix "
-							  << nrows(new_coarser->coarsening_matrix) << " x " << ncols(new_coarser->coarsening_matrix) << " matrix \n";
-#endif
-				}
-				{
-					rc = buildMatrixUnique( new_coarser->A,
-											in.matAbuffer[ coarsening_level + 1 ].i_data,
-											in.matAbuffer[ coarsening_level + 1 ].j_data,
-											in.matAbuffer[ coarsening_level + 1 ].v_data,
-											in.matAbuffer[ coarsening_level + 1 ].size(),
-											SEQUENTIAL
-											);
-
-					if( rc != SUCCESS ) {
-						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
-								  << "(" << toString( rc ) << ")." << std::endl;
-						return rc;
-					}
-#ifdef HPCG_PRINT_STEPS
-					std::cout << " buildMatrixUnique: constructed new_coarser->A "
-							  << nrows(new_coarser->A) << " x " << ncols(new_coarser->A) << " matrix \n";
-#endif
+				rc = build_ndims_coarsener_matrix< DIMS >( new_coarser->coarsening_matrix, coarser_sizes, previous_sizes );
+				if( rc != grb::SUCCESS ) {
+					std::cerr << "Failure to generate coarsening matrix (" << toString( rc ) << ")." << std::endl;
+					return rc;
 				}
-				
-				RC rc = grb::buildVector( new_coarser->A_diagonal,
-										  in.matMbuffer[ coarsening_level + 1 ].v_data,
-										  in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
-										  SEQUENTIAL );
-				if ( rc != SUCCESS ) {
-					std::cerr << " buildVector failed!\n ";
+				rc = build_ndims_system_matrix< DIMS, T >( new_coarser->A, coarser_sizes, params.halo_size, params.diag_value, params.non_diag_value );
+				if( rc != grb::SUCCESS ) {
+					std::cerr << "Failure to generate system matrix (" << toString( rc ) << ")for size " << coarser_size << std::endl;
 					return rc;
 				}
-#ifdef HPCG_PRINT_STEPS
-				std::cout << " buildVector: new_coarser->A_diagonal "
-						  << size(new_coarser->A_diagonal) << " vector \n";
-#endif
+				set( new_coarser->A_diagonal, params.diag_value );
+				// build color masks for coarser level (same masks, but with coarser system size)
+				rc = build_static_color_masks( new_coarser->color_masks, coarser_size, params.num_colors );
 
 				// prepare for new iteration
 				coarser = &new_coarser->coarser_level;
-				previous_size = coarser_size;
+				min_physical_coarsened_size /= params.coarsening_step;
+				previous_sizes = coarser_sizes;
+				divide_array( coarser_sizes, coarser_sizes, params.coarsening_step );
 				coarsening_level++;
 			}
 			return rc;
diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index fac6ceee8..21f97586a 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -98,7 +98,11 @@ add_grb_executables( knn knn.cpp ../unit/parser.cpp
 )
 
 add_grb_executables( hpcg hpcg.cpp
+	BACKENDS reference reference_omp bsp1d hybrid
+	ADDITIONAL_LINK_LIBRARIES test_utils
+)
 
+add_grb_executables( amg amg.cpp
 	BACKENDS reference reference_omp bsp1d hybrid
 	ADDITIONAL_LINK_LIBRARIES test_utils
 )
diff --git a/tests/smoke/amg.cpp b/tests/smoke/amg.cpp
new file mode 100644
index 000000000..4d42b3185
--- /dev/null
+++ b/tests/smoke/amg.cpp
@@ -0,0 +1,710 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file amg.cpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @brief Test for AMG solver, levels provided by AMGCL.
+ *
+ *
+ * @date 2021-04-30
+ */
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <type_traits>
+#include <string>
+
+#include <graphblas.hpp>
+
+
+// forward declaration for the tracing facility
+template< typename T,
+	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
+>
+void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
+
+#ifdef AMG_PRINT_STEPS
+
+// AMG_PRINT_STEPS requires defining the following symbols
+
+/**
+ * @brief simply prints \p args on a dedicated line.
+ */
+#define DBG_println( args ) std::cout << args << std::endl;
+
+
+/**
+ * @brief prints \p head and the norm of \p r.
+ */
+#define DBG_print_norm( vec, head ) print_norm( vec, head )
+#endif
+
+
+#include <graphblas/algorithms/amg/amg.hpp>
+#include <graphblas/algorithms/amg/system_building_utils.hpp>
+
+// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
+
+
+#include <graphblas/utils/Timer.hpp>
+
+#include <utils/argument_parser.hpp>
+#include <utils/assertions.hpp>
+#include <utils/print_vec_mat.hpp>
+
+//========== MAIN PROBLEM PARAMETERS =========
+// values modifiable via cmd line args: default set as in reference AMG
+constexpr std::size_t DEF_COARSENING_LEVELS{ 1U };
+constexpr std::size_t MAX_COARSENING_LEVELS{ 4U };
+constexpr std::size_t MAX_ITERATIONS_DEF{ 56UL };
+constexpr std::size_t SMOOTHER_STEPS_DEF{ 1 };
+//============================================
+
+constexpr double MAX_NORM { 4.0e-14 };
+
+using namespace grb;
+using namespace algorithms;
+
+static const char * const TEXT_HIGHLIGHT = "===> ";
+#define thcout ( std::cout << TEXT_HIGHLIGHT )
+#define thcerr ( std::cerr << TEXT_HIGHLIGHT )
+
+
+/**
+ * @brief Container to store matrices loaded from a file.
+ */
+template< typename T = double >
+struct mat_data {
+	std::size_t *i_data;
+	std::size_t *j_data;
+	T *v_data;
+	std::size_t nz, n, m;
+
+	void resize( std::size_t innz, std::size_t inn, std::size_t inm ){
+		nz=innz;
+		n=inn;
+		m=inm;
+		i_data = new std::size_t [ nz ];
+		j_data = new std::size_t [ nz ];
+		v_data = new T [ nz ];
+	};
+
+	mat_data( ){
+		nz = 0;
+	};
+	
+	mat_data( std::size_t in ){
+		resize( in );
+	};
+
+	std::size_t size(){
+		return nz;
+	};
+
+	std::size_t get_n(){
+		return n;
+	};
+
+	std::size_t get_m(){
+		return m;
+	};
+
+	~mat_data(){
+		delete [] i_data;
+		delete [] i_data;
+		delete [] v_data;
+	};
+};
+
+/**
+ * @brief Container to store vectors loaded from a file.
+ */
+template< typename T = double >
+struct vec_data {
+	T *v_data;
+	std::size_t n;
+
+	void resize( std::size_t in ){
+		n=in;
+		v_data = new T [ n ];
+	};
+
+	vec_data( ){
+		n = 0;
+	};
+	
+	vec_data( std::size_t in ){
+		resize( in );
+	};
+
+	std::size_t size(){
+		return n;
+	};
+
+	~vec_data(){
+		delete [] v_data;
+	};
+};
+
+
+static bool matloaded = false;
+
+/**
+ * @brief Container for the parameters for the AMG simulation.
+ */
+struct simulation_input {
+	std::size_t max_coarsening_levels;
+	std::size_t test_repetitions;
+	std::size_t max_iterations;
+	std::size_t smoother_steps;
+	const char * matAfile_c_str;
+	std::vector< std::string > matAfiles;
+	std::vector< std::string > matMfiles;
+	std::vector< std::string > matPfiles;
+	std::vector< std::string > matRfiles;
+	
+	bool evaluation_run;
+	bool no_preconditioning;
+};
+
+template<typename CharT, typename TraitsT = std::char_traits<CharT> >
+class vectorwrapbuf :
+	public std::basic_streambuf<CharT, TraitsT> {
+public:
+    vectorwrapbuf(std::vector<CharT> &vec) {
+		this->setg( vec.data(), vec.data(), vec.data() + vec.size() );
+    }
+};
+std::istream& operator>>(std::istream& is, std::string& s){
+	std::getline(is, s);
+	return is;
+}
+
+
+/**
+ * @brief Container to store all data for AMG hierarchy.
+ */
+class preloaded_matrices : public simulation_input {
+
+public :
+	std::size_t nzAmt, nzMmt, nzPmt, nzRmt;
+	mat_data< double > * matAbuffer;
+	vec_data< double > * matMbuffer;
+	mat_data< double > * matPbuffer;
+	mat_data< double > * matRbuffer;
+
+	grb::RC read_matrix( std::vector< std::string > & fname,
+						  mat_data<double> * data ) {
+		grb::RC rc = SUCCESS;
+		for ( size_t i = 0; i < fname.size(); i++ ) {
+			grb::utils::MatrixFileReader< double, std::conditional<
+				( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+													  grb::config::RowIndexType,
+													  grb::config::ColIndexType >::type
+										  > parser_mat( fname[ i ].c_str(), true );
+#ifdef DEBUG
+			std::cout << " ---> parser_mat.filename()=" << parser_mat.filename() << "\n";
+			std::cout << " ---> parser_mat.nz()=" << parser_mat.nz() << "\n";
+			std::cout << " ---> parser_mat.n()=" << parser_mat.n() << "\n";
+			std::cout << " ---> parser_mat.m()=" << parser_mat.m() << "\n";
+			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
+#endif
+			data[i].resize( parser_mat.entries()*2, parser_mat.n(), parser_mat.m() );
+			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
+			if( ! inFile.is_open() ) {
+				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			}
+			std::streamsize size = inFile.tellg();
+			inFile.seekg(0, std::ios::beg);
+			std::vector<char> buffer(size);
+			if ( inFile.read(buffer.data(), size) )	{
+				/* all fine! */
+			}
+			else {
+				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			} 
+			inFile.close();
+			vectorwrapbuf< char > databuf( buffer );
+			std::istream isdata( &databuf );
+			std::string headder;
+			isdata >> headder;
+			while( headder.at( 0 ) == '%' ) {
+				isdata >> headder;
+			}
+			std::stringstream ss( headder );
+			size_t n, m, nz;
+			ss >> n >> m >> nz ;
+			size_t k = 0;
+			for ( size_t j = 0; j < nz; j++ ) {
+				size_t itmp, jtmp;
+				double vtmp;
+				isdata >> itmp >> jtmp >> vtmp;
+				data[ i ].i_data[ k ] = itmp - 1;
+				data[ i ].j_data[ k ] = jtmp - 1;
+				data[ i ].v_data[ k ] = vtmp;
+				k += 1;
+			}
+			data[ i ].nz = k;
+		}
+		return ( rc );
+	}	
+
+	grb::RC read_vector( std::vector< std::string > & fname,
+						 vec_data< double > * data ) {
+		grb::RC rc = SUCCESS;
+		for ( size_t i = 0; i < fname.size(); i++ ){
+#ifdef DEBUG
+			std::cout << " Reading " << fname[ i ].c_str() << ".\n";
+#endif
+			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
+			if( ! inFile.is_open() ) {
+				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			}
+			std::streamsize size = inFile.tellg();
+			inFile.seekg(0, std::ios::beg);
+			std::vector<char> buffer(size);
+			if (inFile.read(buffer.data(), size))	{
+				/* alles gut! */
+			}
+			else {
+				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			} 
+			inFile.close();
+			vectorwrapbuf< char > databuf( buffer );
+			std::istream isdata( &databuf );
+			std::string headder;
+			size_t n, m;
+			std::getline( isdata, headder ); // skip the first line
+			while( headder.at( 0 ) == '%' ) {
+				std::getline( isdata, headder );
+			}
+			std::stringstream ss( headder );
+			ss >> n >> m;
+			std::cout << " Reading from" << fname[ i ] << " dense matrix with dimensions: "
+					  << " n x m = " << n << " x " << m << "\n";
+			data[ i ].resize( n );
+			for ( size_t j = 0; j < n; j++ ) {
+				isdata >> data[ i ].v_data[ j ];
+			}
+		}
+		return( rc );
+	}
+	
+	grb::RC read_vec_matrics(){
+		grb::RC rc = SUCCESS;
+
+		nzAmt = matAfiles.size() ;
+		nzMmt = matMfiles.size() ;
+		nzPmt = matPfiles.size() ;
+		nzRmt = matRfiles.size() ;
+
+		matAbuffer = new mat_data< double > [ nzAmt ];
+		matMbuffer = new vec_data< double > [ nzMmt ];
+		matPbuffer = new mat_data< double > [ nzPmt ];
+		matRbuffer = new mat_data< double > [ nzRmt ];
+		
+		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
+		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
+		// rc = rc ? rc : read_matrix( matPfiles, matPbuffer );
+		rc = rc ? rc : read_vector( matMfiles, matMbuffer );
+
+		return rc;
+	}
+};
+
+
+
+preloaded_matrices inputData;
+
+/**
+ * @brief Containers for test outputs.
+ */
+struct output {
+	RC error_code;
+	size_t test_repetitions;
+	size_t performed_iterations;
+	double residual;
+	grb::utils::TimerResults times;
+	std::unique_ptr< PinnedVector< double > > pinnedVector;
+	double square_norm_diff;
+
+	output() {
+		error_code = SUCCESS;
+		test_repetitions = 0;
+		performed_iterations = 0;
+		residual = 0.0;
+	}
+};
+
+/**
+ * @brief Returns the closets power of 2 bigger or equal to \p n .
+ */
+template< typename T = std::size_t >
+T static next_pow_2( T n ) {
+	static_assert( std::is_integral< T >::value, "Integral required." );
+	--n;
+	n |= ( n >> 1 );
+	for( unsigned i = 1; i <= sizeof( T ) * 4; i *= 2 ) {
+		const unsigned shift = static_cast< T >( 1U ) << i;
+		n |= ( n >> shift );
+	}
+	return n + 1;
+}
+
+
+#ifdef AMG_PRINT_SYSTEM
+static void print_system( const amg_data< double, double, double > & data ) {
+	print_matrix( data.A, 70, "A" );
+	multi_grid_data< double, double > * coarser = data.coarser_level;
+	while( coarser != nullptr ) {
+		print_matrix( coarser->coarsening_matrix, 50, "COARSENING MATRIX" );
+		print_matrix( coarser->A, 50, "COARSER SYSTEM MATRIX" );
+		coarser = coarser->coarser_level;
+	}
+}
+#endif
+
+#ifdef AMG_PRINT_STEPS
+template< typename T,
+		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
+	>
+void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
+	T norm=0;
+	RC ret = grb::dot( norm, r, r, ring ); // residual = r' * r;
+	(void)ret;
+	assert( ret == SUCCESS );
+	std::cout << ">>> ";
+	if( head != nullptr ) {
+		std::cout << head << ": ";
+	}
+	std::cout << norm << std::endl;
+}
+#endif
+
+/**
+ * @brief Main test, building an AMG problem and running the simulation closely following the
+ * parameters in the reference AMG test.
+ */
+void grbProgram( const simulation_input & in, struct output & out ) {
+	grb::utils::Timer timer;
+	timer.reset();
+
+	// get user process ID
+	assert( spmd<>::pid() < spmd<>::nprocs() );
+
+
+	// assume successful run
+	out.error_code = SUCCESS;
+	RC rc { SUCCESS };
+
+	if( ! matloaded ) {
+		//preloaded_matrices inputData;
+		inputData.matAfiles = in.matAfiles;
+		inputData.matMfiles = in.matMfiles;
+		inputData.matPfiles = in.matPfiles;
+		inputData.matRfiles = in.matRfiles;
+
+		rc = inputData.read_vec_matrics();
+		if( rc != SUCCESS ) {
+			std::cerr << "Failure to read data" << std::endl;
+		}
+		matloaded = true ;
+	}
+
+	out.times.io = timer.time();
+	timer.reset();
+
+	// wrap amg_data inside a unique_ptr to forget about cleaning chores
+	std::unique_ptr< amg_data< double, double, double > > amg_state;
+	rc = build_amg_system< double >( amg_state, in.max_coarsening_levels, inputData );
+
+	if( rc != SUCCESS ) {
+		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
+		out.error_code = rc;
+		return;
+	}
+#ifdef AMG_PRINT_SYSTEM
+	if( spmd<>::pid() == 0 ) {
+		print_system( *amg_state );
+	}
+#endif
+
+	Matrix< double > & A { amg_state->A };
+	Vector< double > & x { amg_state->x };
+	Vector< double > & b { amg_state->b };
+
+	// set vectors as from standard AMG benchmark
+	set( x, 1.0 );
+	set( b, 0.0 );
+	rc = grb::mxv( b, A, x, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
+	set( x, 0.0 );
+
+	double norm_b=0;
+	rc = grb::dot( norm_b, b, b, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
+	(void)rc;
+	assert( rc == SUCCESS );
+
+	
+
+#ifdef AMG_PRINT_SYSTEM
+	if( spmd<>::pid() == 0 ) {
+		print_vector( x, 50, " ---> X(1)" );
+		print_vector( b, 50, " ---> B(1)" );
+	}
+#endif
+
+	std::cout  << " ---> before solver \n";
+
+	out.times.preamble = timer.time();
+	timer.reset();
+
+	const bool with_preconditioning = ! in.no_preconditioning;
+	out.test_repetitions = 0;
+	if( in.evaluation_run ) {
+		double single_time_start = timer.time();
+		rc = amg( *amg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+		double single_time = timer.time() - single_time_start;
+		if( rc == SUCCESS ) {
+			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
+		}
+		out.times.useful = single_time;
+		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
+	} else {
+		// do benchmark
+		double time_start = timer.time();
+		for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
+			rc = set( x, 0.0 );
+			assert( rc == SUCCESS );
+			rc = amg( *amg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+			
+			out.test_repetitions++;
+			if( rc != SUCCESS ) {
+				break;
+			}
+
+		}
+		double time_taken = timer.time() - time_start;
+		rc = rc ? rc : collectives<>::reduce( time_taken, 0, operators::max< double >() );
+		out.times.useful = time_taken / static_cast< double >( out.test_repetitions );
+
+#ifdef AMG_PRINT_SYSTEM
+		rc = rc ? rc : grb::eWiseLambda( [ &x ]( const size_t i ) { x[ i ] -= 1;}, x );
+		print_norm (x, " ---> norm(x)");
+#endif
+		// sleep( 1 );
+	}
+	timer.reset();
+
+#ifdef AMG_PRINT_SYSTEM
+	if( spmd<>::pid() == 0 ) {
+		print_vector( x, 50, " ---> X(check out)" );
+		print_vector( b, 50, " ---> B(check out)" );
+	}
+#endif
+
+	if( spmd<>::pid() == 0 ) {
+		if( rc == SUCCESS ) {
+			if( in.evaluation_run ) {
+				std::cout << "Info: cold AMG completed within " << out.performed_iterations << " iterations. Last computed residual is " << out.residual << ". Time taken was " << out.times.useful
+						  << " ms. Deduced inner repetitions parameter of " << out.test_repetitions << " to take 1 second or more per inner benchmark." << std::endl;
+			} else {
+				std::cout << "Final residual= "<< out.residual << " relative error= " <<  out.residual/sqrt(norm_b) << "\n";
+				std::cout << "Average time taken for each of " << out.test_repetitions << " AMG calls (hot start): " << out.times.useful << std::endl;
+			}
+		} else {
+			std::cerr << "Failure: call to AMG did not succeed (" << toString( rc ) << ")." << std::endl;
+		}
+	}
+
+	// start postamble
+	timer.reset();
+	// set error code
+	out.error_code = rc;
+
+	Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one > ring;
+	grb::set( b, 1.0 );
+	out.square_norm_diff = 0.0;
+	grb::eWiseMul( b, -1.0, x, ring );
+	grb::dot( out.square_norm_diff, b, b, ring );
+
+	// output
+	out.pinnedVector = std::unique_ptr< PinnedVector< double > >( new PinnedVector< double >( x, SEQUENTIAL ) );
+	// finish timing
+	const double time_taken { timer.time() };
+	out.times.postamble = time_taken;
+}
+
+/**
+ * @brief Parser the command-line arguments to extract the simulation information and checks they are valid.
+ */
+static void parse_arguments( simulation_input &, std::size_t &, double &, int, char ** );
+
+int main( int argc, char ** argv ) {
+	simulation_input sim_in;
+	size_t test_outer_iterations;
+	double max_residual_norm;
+
+	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
+	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
+	thcout << "Test repetitions: " << sim_in.test_repetitions << std::endl;
+	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
+	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run << std::noboolalpha << std::endl;
+	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning << std::noboolalpha << std::endl;
+	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
+	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
+	thcout << "Maximum norm for residual: " << max_residual_norm << std::endl;
+
+	// the output struct
+	struct output out;
+
+	// set standard exit code
+	grb::RC rc { SUCCESS };
+
+	// launch estimator (if requested)
+	if( sim_in.evaluation_run ) {
+		grb::Launcher< AUTOMATIC > launcher;
+		rc = launcher.exec( &grbProgram, sim_in, out, true );
+		if( rc == SUCCESS ) {
+			sim_in.test_repetitions = out.test_repetitions;
+		} else {
+			thcout << "launcher.exec returns with non-SUCCESS error code " << grb::toString( rc ) << std::endl;
+			std::exit( -1 );
+		}
+	}
+
+	// launch full benchmark
+	grb::Benchmarker< AUTOMATIC > benchmarker;
+	rc = benchmarker.exec( &grbProgram, sim_in, out, 1, test_outer_iterations, true );
+	ASSERT_RC_SUCCESS( rc );
+	thcout << "Benchmark completed successfully and took " << out.performed_iterations << " iterations to converge with residual " << out.residual << std::endl;
+
+	if( ! out.pinnedVector ) {
+		thcerr << "no output vector to inspect" << std::endl;
+	} else {
+		const PinnedVector< double > &solution { *out.pinnedVector };
+		thcout << "Size of x is " << solution.size() << std::endl;
+		if( solution.size() > 0 ) {
+			print_vector( solution, 30, "SOLUTION" );
+		} else {
+			thcerr << "ERROR: solution contains no values" << std::endl;
+		}
+	}
+
+	ASSERT_RC_SUCCESS( out.error_code );
+
+	double residual_norm { sqrt( out.square_norm_diff ) };
+	thcout << "Residual norm: " << residual_norm << std::endl;
+
+	ASSERT_LT( residual_norm, max_residual_norm );
+
+	thcout << "Test OK" << std::endl;
+	return 0;
+}
+
+static void parse_arguments( simulation_input & sim_in, std::size_t & outer_iterations, double & max_residual_norm, int argc, char ** argv ) {
+
+	argument_parser parser;
+	parser
+		.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
+								"maximum level for coarsening; 0 means no coarsening; note: actual "
+								"level may be limited by the minimum system dimension" )
+		.add_optional_argument( "--mat_files_pattern", sim_in.matAfile_c_str, "","file pattern for files contining matrices A, M_diag, P, R "
+		 						"i.e. '--mat_a_file_names /path/to/dir/level_  --max_coarse-levels 2' will read /path/to/dir/level_0_A.mtx,  /path/to/dir/level_1_A.mtx, /path/to/dir/level_2_A.mtx ... " )
+		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
+		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
+		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of AMG iterations" )
+		.add_optional_argument( "--max-residual-norm", max_residual_norm, MAX_NORM,
+			"maximum norm for the residual to be acceptable (does NOT limit "
+			"the execution of the algorithm)" )
+		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF, "number of pre/post-smoother steps; 0 disables smoothing" )
+		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
+			"launch single run directly, without benchmarker (ignore "
+			"repetitions)" )
+		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" );
+	
+
+	parser.parse( argc, argv );
+	
+	std::string matAfile=sim_in.matAfile_c_str;
+	if ( !matAfile.empty() ) {
+		std::cout << "Using  <<" << matAfile << ">> pattern to read matrices " << std::endl;
+		for ( size_t i=0 ; i < sim_in.max_coarsening_levels ; i++ ){
+			std::string fnamebase = matAfile + std::to_string(  i );
+			std::string fnameA = fnamebase + "_A.mtx";
+			std::string fnameM = fnamebase + "_M_diag.mtx";
+			std::string fnameP = fnamebase + "_P.mtx";
+			std::string fnameR = fnamebase + "_R.mtx";
+			sim_in.matAfiles.push_back (fnameA);
+			sim_in.matMfiles.push_back (fnameM);
+			sim_in.matPfiles.push_back (fnameP);
+			sim_in.matRfiles.push_back (fnameR);
+			
+		}
+		{
+			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
+
+			std::string fnameA = fnamebase + "_A.mtx";
+			sim_in.matAfiles.push_back (fnameA);
+
+			std::string fnameM = fnamebase + "_M_diag.mtx";
+			sim_in.matMfiles.push_back (fnameM);
+		}		
+
+		std::cout << "files to read matrices: " << std::endl;
+		for (std::string fname: sim_in.matAfiles) {
+			std::cout << fname << " \n";
+		}
+
+		for (std::string fname: sim_in.matMfiles) {
+			std::cout << fname << " \n";
+		}
+
+		for (std::string fname: sim_in.matPfiles) {
+			std::cout << fname << " \n";
+		}
+
+		for (std::string fname: sim_in.matRfiles) {
+			std::cout << fname << " \n";
+		}
+
+	}
+	else {
+		std::cout << "No pattern to read matrices provided" << std::endl;
+	}
+
+	// check for valid values
+	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
+		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
+		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;
+	}
+	if( sim_in.test_repetitions == 0 ) {
+		std::cerr << "ERROR no test runs selected: set \"--test-rep >0\"" << std::endl;
+		std::exit( -1 );
+	}
+	if( sim_in.max_iterations == 0 ) {
+		std::cout << "Setting number of iterations to 1" << std::endl;
+		sim_in.max_iterations = 1;
+	}
+}
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index e10a5264c..65f14fdc7 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -25,6 +25,7 @@
  *
  * @date 2021-04-30
  */
+
 #include <array>
 #include <cassert>
 #include <cmath>
@@ -32,17 +33,12 @@
 #include <iostream>
 #include <memory>
 #include <type_traits>
-#include <string>
 
 #include <graphblas.hpp>
+#include <graphblas/algorithms/hpcg/hpcg.hpp>
+#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
 
-
-// forward declaration for the tracing facility
-template< typename T,
-	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
->
-void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
-
+// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
 #ifdef HPCG_PRINT_STEPS
 
 // HPCG_PRINT_STEPS requires defining the following symbols
@@ -52,6 +48,11 @@ void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring
  */
 #define DBG_println( args ) std::cout << args << std::endl;
 
+// forward declaration for the tracing facility
+template< typename T,
+	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
+>
+void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
 
 /**
  * @brief prints \p head and the norm of \p r.
@@ -59,13 +60,6 @@ void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring
 #define DBG_print_norm( vec, head ) print_norm( vec, head )
 #endif
 
-
-#include <graphblas/algorithms/hpcg/hpcg.hpp>
-#include <graphblas/algorithms/hpcg/system_building_utils.hpp>
-
-// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
-
-
 #include <graphblas/utils/Timer.hpp>
 
 #include <utils/argument_parser.hpp>
@@ -99,255 +93,24 @@ static const char * const TEXT_HIGHLIGHT = "===> ";
 
 
 /**
- * @brief Container to store matrices loaded from a file.
+ * @brief Container for system parameters to create the HPCG problem.
  */
-template< typename T = double >
-struct mat_data {
-	std::size_t *i_data;
-	std::size_t *j_data;
-	T *v_data;
-	std::size_t nz, n, m;
-
-	void resize( std::size_t innz, std::size_t inn, std::size_t inm ){
-		nz=innz;
-		n=inn;
-		m=inm;
-		i_data = new std::size_t [ nz ];
-		j_data = new std::size_t [ nz ];
-		v_data = new T [ nz ];
-	};
-
-	mat_data( ){
-		nz = 0;
-	};
-	
-	mat_data( std::size_t in ){
-		resize( in );
-	};
-
-	std::size_t size(){
-		return nz;
-	};
-
-	std::size_t get_n(){
-		return n;
-	};
-
-	std::size_t get_m(){
-		return m;
-	};
-
-	~mat_data(){
-		delete [] i_data;
-		delete [] i_data;
-		delete [] v_data;
-	};
-};
-
-/**
- * @brief Container to store vectors loaded from a file.
- */
-template< typename T = double >
-struct vec_data {
-	T *v_data;
-	std::size_t n;
-
-	void resize( std::size_t in ){
-		n=in;
-		v_data = new T [ n ];
-	};
-
-	vec_data( ){
-		n = 0;
-	};
-	
-	vec_data( std::size_t in ){
-		resize( in );
-	};
-
-	std::size_t size(){
-		return n;
-	};
-
-	~vec_data(){
-		delete [] v_data;
-	};
+struct system_input {
+	std::size_t nx, ny, nz;
+	std::size_t max_coarsening_levels;
 };
 
-
-static bool matloaded = false;
-
 /**
  * @brief Container for the parameters for the HPCG simulation.
  */
-struct simulation_input {
-	std::size_t max_coarsening_levels;
+struct simulation_input : public system_input {
 	std::size_t test_repetitions;
 	std::size_t max_iterations;
 	std::size_t smoother_steps;
-	const char * matAfile_c_str;
-	std::vector< std::string > matAfiles;
-	std::vector< std::string > matMfiles;
-	std::vector< std::string > matPfiles;
-	std::vector< std::string > matRfiles;
-	
 	bool evaluation_run;
 	bool no_preconditioning;
 };
 
-template<typename CharT, typename TraitsT = std::char_traits<CharT> >
-class vectorwrapbuf :
-	public std::basic_streambuf<CharT, TraitsT> {
-public:
-    vectorwrapbuf(std::vector<CharT> &vec) {
-		this->setg( vec.data(), vec.data(), vec.data() + vec.size() );
-    }
-};
-std::istream& operator>>(std::istream& is, std::string& s){
-	std::getline(is, s);
-	return is;
-}
-
-
-/**
- * @brief Container to store all data for HPCG hierarchy.
- */
-class preloaded_matrices : public simulation_input {
-
-public :
-	std::size_t nzAmt, nzMmt, nzPmt, nzRmt;
-	mat_data< double > * matAbuffer;
-	vec_data< double > * matMbuffer;
-	mat_data< double > * matPbuffer;
-	mat_data< double > * matRbuffer;
-
-	grb::RC read_matrix( std::vector< std::string > & fname,
-						  mat_data<double> * data ) {
-		grb::RC rc = SUCCESS;
-		for ( size_t i = 0; i < fname.size(); i++ ) {
-			grb::utils::MatrixFileReader< double, std::conditional<
-				( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
-													  grb::config::RowIndexType,
-													  grb::config::ColIndexType >::type
-										  > parser_mat( fname[ i ].c_str(), true );
-#ifdef DEBUG
-			std::cout << " ---> parser_mat.filename()=" << parser_mat.filename() << "\n";
-			std::cout << " ---> parser_mat.nz()=" << parser_mat.nz() << "\n";
-			std::cout << " ---> parser_mat.n()=" << parser_mat.n() << "\n";
-			std::cout << " ---> parser_mat.m()=" << parser_mat.m() << "\n";
-			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
-#endif
-			data[i].resize( parser_mat.entries()*2, parser_mat.n(), parser_mat.m() );
-			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
-			if( ! inFile.is_open() ) {
-				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
-				return( PANIC );
-			}
-			std::streamsize size = inFile.tellg();
-			inFile.seekg(0, std::ios::beg);
-			std::vector<char> buffer(size);
-			if ( inFile.read(buffer.data(), size) )	{
-				/* all fine! */
-			}
-			else {
-				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
-				return( PANIC );
-			} 
-			inFile.close();
-			vectorwrapbuf< char > databuf( buffer );
-			std::istream isdata( &databuf );
-			std::string headder;
-			isdata >> headder;
-			while( headder.at( 0 ) == '%' ) {
-				isdata >> headder;
-			}
-			std::stringstream ss( headder );
-			size_t n, m, nz;
-			ss >> n >> m >> nz ;
-			size_t k = 0;
-			for ( size_t j = 0; j < nz; j++ ) {
-				size_t itmp, jtmp;
-				double vtmp;
-				isdata >> itmp >> jtmp >> vtmp;
-				data[ i ].i_data[ k ] = itmp - 1;
-				data[ i ].j_data[ k ] = jtmp - 1;
-				data[ i ].v_data[ k ] = vtmp;
-				k += 1;
-			}
-			data[ i ].nz = k;
-		}
-		return ( rc );
-	}	
-
-	grb::RC read_vector( std::vector< std::string > & fname,
-						 vec_data< double > * data ) {
-		grb::RC rc = SUCCESS;
-		for ( size_t i = 0; i < fname.size(); i++ ){
-#ifdef DEBUG
-			std::cout << " Reading " << fname[ i ].c_str() << ".\n";
-#endif
-			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
-			if( ! inFile.is_open() ) {
-				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
-				return( PANIC );
-			}
-			std::streamsize size = inFile.tellg();
-			inFile.seekg(0, std::ios::beg);
-			std::vector<char> buffer(size);
-			if (inFile.read(buffer.data(), size))	{
-				/* alles gut! */
-			}
-			else {
-				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
-				return( PANIC );
-			} 
-			inFile.close();
-			vectorwrapbuf< char > databuf( buffer );
-			std::istream isdata( &databuf );
-			std::string headder;
-			size_t n, m;
-			std::getline( isdata, headder ); // skip the first line
-			while( headder.at( 0 ) == '%' ) {
-				std::getline( isdata, headder );
-			}
-			std::stringstream ss( headder );
-			ss >> n >> m;
-			std::cout << " Reading from" << fname[ i ] << " dense matrix with dimensions: "
-					  << " n x m = " << n << " x " << m << "\n";
-			data[ i ].resize( n );
-			for ( size_t j = 0; j < n; j++ ) {
-				isdata >> data[ i ].v_data[ j ];
-			}
-		}
-		return( rc );
-	}
-	
-	grb::RC read_vec_matrics(){
-		grb::RC rc = SUCCESS;
-
-		nzAmt = matAfiles.size() ;
-		nzMmt = matMfiles.size() ;
-		nzPmt = matPfiles.size() ;
-		nzRmt = matRfiles.size() ;
-
-		matAbuffer = new mat_data< double > [ nzAmt ];
-		matMbuffer = new vec_data< double > [ nzMmt ];
-		matPbuffer = new mat_data< double > [ nzPmt ];
-		matRbuffer = new mat_data< double > [ nzRmt ];
-		
-		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
-		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
-		// rc = rc ? rc : read_matrix( matPfiles, matPbuffer );
-		rc = rc ? rc : read_vector( matMfiles, matMbuffer );
-
-		return rc;
-	}
-};
-
-
-
-preloaded_matrices inputData;
-
 /**
  * @brief Containers for test outputs.
  */
@@ -383,6 +146,18 @@ T static next_pow_2( T n ) {
 	return n + 1;
 }
 
+/**
+ * @brief Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
+ * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
+ */
+static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in ) {
+	const std::array< std::size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
+	struct hpcg_system_params< 3, double > params {
+		physical_sys_sizes, HALO_RADIUS, BAND_WIDTH_3D * 2 + 1, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
+	};
+
+	return build_hpcg_system< 3, double >( holder, params );
+}
 
 #ifdef HPCG_PRINT_SYSTEM
 static void print_system( const hpcg_data< double, double, double > & data ) {
@@ -401,7 +176,7 @@ template< typename T,
 		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
 	>
 void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm=0;
+	T norm;
 	RC ret = grb::dot( norm, r, r, ring ); // residual = r' * r;
 	(void)ret;
 	assert( ret == SUCCESS );
@@ -418,37 +193,18 @@ void print_norm( const grb::Vector< T > & r, const char * head, const Ring & rin
  * parameters in the reference HPCG test.
  */
 void grbProgram( const simulation_input & in, struct output & out ) {
-	grb::utils::Timer timer;
-	timer.reset();
-
 	// get user process ID
 	assert( spmd<>::pid() < spmd<>::nprocs() );
-
+	grb::utils::Timer timer;
+	timer.reset();
 
 	// assume successful run
 	out.error_code = SUCCESS;
 	RC rc { SUCCESS };
 
-	if( ! matloaded ) {
-		//preloaded_matrices inputData;
-		inputData.matAfiles = in.matAfiles;
-		inputData.matMfiles = in.matMfiles;
-		inputData.matPfiles = in.matPfiles;
-		inputData.matRfiles = in.matRfiles;
-
-		rc = inputData.read_vec_matrics();
-		if( rc != SUCCESS ) {
-			std::cerr << "Failure to read data" << std::endl;
-		}
-		matloaded = true ;
-	}
-
-	out.times.io = timer.time();
-	timer.reset();
-
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
 	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
-	rc = build_hpcg_system< double >( hpcg_state, in.max_coarsening_levels, inputData );
+	rc = build_3d_system( hpcg_state, in );
 
 	if( rc != SUCCESS ) {
 		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
@@ -471,31 +227,21 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	rc = grb::mxv( b, A, x, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
 	set( x, 0.0 );
 
-	double norm_b=0;
-	rc = grb::dot( norm_b, b, b, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
-	(void)rc;
-	assert( rc == SUCCESS );
-
-	
-
 #ifdef HPCG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {
-		print_vector( x, 50, " ---> X(1)" );
-		print_vector( b, 50, " ---> B(1)" );
+		print_vector( x, 50, "X" );
+		print_vector( b, 50, "B" );
 	}
 #endif
 
-	std::cout  << " ---> before solver \n";
-
 	out.times.preamble = timer.time();
-	timer.reset();
 
 	const bool with_preconditioning = ! in.no_preconditioning;
-	out.test_repetitions = 0;
 	if( in.evaluation_run ) {
-		double single_time_start = timer.time();
+		out.test_repetitions = 0;
+		timer.reset();
 		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-		double single_time = timer.time() - single_time_start;
+		double single_time = timer.time();
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
 		}
@@ -503,36 +249,20 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
 	} else {
 		// do benchmark
-		double time_start = timer.time();
+		timer.reset();
 		for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
 			rc = set( x, 0.0 );
 			assert( rc == SUCCESS );
 			rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-			
 			out.test_repetitions++;
 			if( rc != SUCCESS ) {
 				break;
 			}
-
 		}
-		double time_taken = timer.time() - time_start;
-		rc = rc ? rc : collectives<>::reduce( time_taken, 0, operators::max< double >() );
+		double time_taken { timer.time() };
 		out.times.useful = time_taken / static_cast< double >( out.test_repetitions );
-
-#ifdef HPCG_PRINT_SYSTEM
-		rc = rc ? rc : grb::eWiseLambda( [ &x ]( const size_t i ) { x[ i ] -= 1;}, x );
-		print_norm (x, " ---> norm(x)");
-#endif
 		// sleep( 1 );
 	}
-	timer.reset();
-
-#ifdef HPCG_PRINT_SYSTEM
-	if( spmd<>::pid() == 0 ) {
-		print_vector( x, 50, " ---> X(check out)" );
-		print_vector( b, 50, " ---> B(check out)" );
-	}
-#endif
 
 	if( spmd<>::pid() == 0 ) {
 		if( rc == SUCCESS ) {
@@ -540,7 +270,6 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 				std::cout << "Info: cold HPCG completed within " << out.performed_iterations << " iterations. Last computed residual is " << out.residual << ". Time taken was " << out.times.useful
 						  << " ms. Deduced inner repetitions parameter of " << out.test_repetitions << " to take 1 second or more per inner benchmark." << std::endl;
 			} else {
-				std::cout << "Final residual= "<< out.residual << " relative error= " <<  out.residual/sqrt(norm_b) << "\n";
 				std::cout << "Average time taken for each of " << out.test_repetitions << " HPCG calls (hot start): " << out.times.useful << std::endl;
 			}
 		} else {
@@ -577,6 +306,9 @@ int main( int argc, char ** argv ) {
 	double max_residual_norm;
 
 	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
+	thcout << "System size x: " << sim_in.nx << std::endl;
+	thcout << "System size y: " << sim_in.ny << std::endl;
+	thcout << "System size z: " << sim_in.nz << std::endl;
 	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
 	thcout << "Test repetitions: " << sim_in.test_repetitions << std::endl;
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
@@ -636,12 +368,13 @@ int main( int argc, char ** argv ) {
 static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations, double & max_residual_norm, int argc, char ** argv ) {
 
 	argument_parser parser;
-	parser
+	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
+		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
+		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
 		.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
-								"maximum level for coarsening; 0 means no coarsening; note: actual "
-								"level may be limited by the minimum system dimension" )
-		.add_optional_argument( "--mat_files_pattern", sim_in.matAfile_c_str, "","file pattern for files contining matrices A, M_diag, P, R "
-		 						"i.e. '--mat_a_file_names /path/to/dir/level_  --max_coarse-levels 2' will read /path/to/dir/level_0_A.mtx,  /path/to/dir/level_1_A.mtx, /path/to/dir/level_2_A.mtx ... " )
+			"maximum level for coarsening; 0 means no coarsening; note: actual "
+			"level may be limited"
+			" by the minimum system dimension" )
 		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
 		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
 		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of HPCG iterations" )
@@ -653,58 +386,25 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 			"launch single run directly, without benchmarker (ignore "
 			"repetitions)" )
 		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" );
-	
 
 	parser.parse( argc, argv );
-	
-	std::string matAfile=sim_in.matAfile_c_str;
-	if ( !matAfile.empty() ) {
-		std::cout << "Using  <<" << matAfile << ">> pattern to read matrices " << std::endl;
-		for ( size_t i=0 ; i < sim_in.max_coarsening_levels ; i++ ){
-			std::string fnamebase = matAfile + std::to_string(  i );
-			std::string fnameA = fnamebase + "_A.mtx";
-			std::string fnameM = fnamebase + "_M_diag.mtx";
-			std::string fnameP = fnamebase + "_P.mtx";
-			std::string fnameR = fnamebase + "_R.mtx";
-			sim_in.matAfiles.push_back (fnameA);
-			sim_in.matMfiles.push_back (fnameM);
-			sim_in.matPfiles.push_back (fnameP);
-			sim_in.matRfiles.push_back (fnameR);
-			
-		}
-		{
-			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
-
-			std::string fnameA = fnamebase + "_A.mtx";
-			sim_in.matAfiles.push_back (fnameA);
-
-			std::string fnameM = fnamebase + "_M_diag.mtx";
-			sim_in.matMfiles.push_back (fnameM);
-		}		
-
-		std::cout << "files to read matrices: " << std::endl;
-		for (std::string fname: sim_in.matAfiles) {
-			std::cout << fname << " \n";
-		}
-
-		for (std::string fname: sim_in.matMfiles) {
-			std::cout << fname << " \n";
-		}
-
-		for (std::string fname: sim_in.matPfiles) {
-			std::cout << fname << " \n";
-		}
-
-		for (std::string fname: sim_in.matRfiles) {
-			std::cout << fname << " \n";
-		}
 
+	// check for valid values
+	std::size_t ssize { std::max( next_pow_2( sim_in.nx ), PHYS_SYSTEM_SIZE_MIN ) };
+	if( ssize != sim_in.nx ) {
+		std::cout << "Setting system size x to " << ssize << " instead of " << sim_in.nx << std::endl;
+		sim_in.nx = ssize;
 	}
-	else {
-		std::cout << "No pattern to read matrices provided" << std::endl;
+	ssize = std::max( next_pow_2( sim_in.ny ), PHYS_SYSTEM_SIZE_MIN );
+	if( ssize != sim_in.ny ) {
+		std::cout << "Setting system size y to " << ssize << " instead of " << sim_in.ny << std::endl;
+		sim_in.ny = ssize;
+	}
+	ssize = std::max( next_pow_2( sim_in.nz ), PHYS_SYSTEM_SIZE_MIN );
+	if( ssize != sim_in.nz ) {
+		std::cout << "Setting system size z to " << ssize << " instead of " << sim_in.nz << std::endl;
+		sim_in.nz = ssize;
 	}
-
-	// check for valid values
 	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
 		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
 		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;

From 4160d6e15a569acabcc7517956f52ec95b9e059c Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Fri, 5 Aug 2022 16:22:32 +0200
Subject: [PATCH 13/17] restore hpcg

---
 tests/smoke/hpcg.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index 65f14fdc7..d84c157e0 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -96,17 +96,17 @@ static const char * const TEXT_HIGHLIGHT = "===> ";
  * @brief Container for system parameters to create the HPCG problem.
  */
 struct system_input {
-	std::size_t nx, ny, nz;
-	std::size_t max_coarsening_levels;
+	size_t nx, ny, nz;
+	size_t max_coarsening_levels;
 };
 
 /**
  * @brief Container for the parameters for the HPCG simulation.
  */
 struct simulation_input : public system_input {
-	std::size_t test_repetitions;
-	std::size_t max_iterations;
-	std::size_t smoother_steps;
+	size_t test_repetitions;
+	size_t max_iterations;
+	size_t smoother_steps;
 	bool evaluation_run;
 	bool no_preconditioning;
 };
@@ -151,7 +151,7 @@ T static next_pow_2( T n ) {
  * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
  */
 static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in ) {
-	const std::array< std::size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
+	const std::array< size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
 	struct hpcg_system_params< 3, double > params {
 		physical_sys_sizes, HALO_RADIUS, BAND_WIDTH_3D * 2 + 1, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
 	};
@@ -390,7 +390,7 @@ static void parse_arguments( simulation_input & sim_in, size_t & outer_iteration
 	parser.parse( argc, argv );
 
 	// check for valid values
-	std::size_t ssize { std::max( next_pow_2( sim_in.nx ), PHYS_SYSTEM_SIZE_MIN ) };
+	size_t ssize { std::max( next_pow_2( sim_in.nx ), PHYS_SYSTEM_SIZE_MIN ) };
 	if( ssize != sim_in.nx ) {
 		std::cout << "Setting system size x to " << ssize << " instead of " << sim_in.nx << std::endl;
 		sim_in.nx = ssize;

From 82fcf8c462703dbaf7c2c7b82d88e5cfb3cd71fd Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Wed, 10 Aug 2022 11:38:10 +0200
Subject: [PATCH 14/17] address review comments

---
 include/graphblas/algorithms/amg/amg.hpp | 61 +++++++++++++-----------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/include/graphblas/algorithms/amg/amg.hpp b/include/graphblas/algorithms/amg/amg.hpp
index db2ac5fd6..221765ba7 100644
--- a/include/graphblas/algorithms/amg/amg.hpp
+++ b/include/graphblas/algorithms/amg/amg.hpp
@@ -16,37 +16,36 @@
  */
 
 /**
+ * Algebraic Multi-grid algorithm relying on level_ matrices from AMGCL.
  * @file amg.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief File with the main routine to run a full AMG simulation, comprising multi-grid runs
- *        with Red-Black Gauss-Seidel smoothing.
- * @date 2021-04-30
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-08-10
  */
 
 #ifndef _H_GRB_ALGORITHMS_AMG
 #define _H_GRB_ALGORITHMS_AMG
+#ifdef _DEBUG
+ #define AMG_PRINT_STEPS
+#endif
 
 #include <graphblas.hpp>
-
 #include "amg_data.hpp"
 #include "multigrid_v_cycle.hpp"
-
 #include <utils/print_vec_mat.hpp>
 
 namespace grb {
-	namespace algorithms {
-
 
+	namespace algorithms {
 
-		
 		/**
-		 * @brief High-Performance Conjugate Gradient algorithm implementation running entirely on GraphBLAS.
+		 * Algebraic Multi-grid algorithm relying on level_ matrices from AMGCL.
 		 *
 		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running the AMG algorithm.
 		 * AMG implementation (as the standard one) couples a standard CG algorithm with a V-cycle
 		 * multi-grid solver to initially refine the tentative solution. This refinement step depends on the
 		 * availability of coarsening information, which should be stored inside \p data; otherwise,
-		 * the refinement is not performed and only the CG algorithm is run. 
+		 * the refinement is not performed and only the CG algorithm is run.
 		 *
 		 * This implementation assumes that the vectors and matrices inside \p data are all correctly initialized
 		 * and populated with the proper values; in particular
@@ -54,7 +53,6 @@ namespace grb {
 		 * - amg_data#A with the system matrix
 		 * - amg_data#b with the right-hand side vector \f$ b \f$
 		 * - amg_data#A_diagonal with the diagonal values of the matrix
-		 * - amg_data#color_masks with the color masks for this level
 		 * - amg_data#coarser_level with the information for the coarser multi-grid run (if any)
 		 * The other vectors are assumed to be inizialized (via the usual grb::Vector#Vector(size_t) constructor)
 		 * but not necessarily populated with values, as they are internally populated when needed; hence,
@@ -92,7 +90,8 @@ namespace grb {
 			typename InputType,
 			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
 			class Minus = operators::subtract< IOType > >
-		grb::RC amg( amg_data< IOType, NonzeroType, InputType > &data,
+		grb::RC amg(
+			amg_data< IOType, NonzeroType, InputType > &data,
 			bool with_preconditioning,
 			const size_t presmoother_steps,
 			const size_t postsmoother_steps,
@@ -105,14 +104,14 @@ namespace grb {
 		) {
 			ResidualType alpha;
 
-			const grb::Matrix< NonzeroType > &A { data.A };
-			grb::Vector< IOType > &x { data.x };
-			const grb::Vector< InputType > &b { data.b };
-			grb::Vector< IOType > &r { data.r };  // residual vector
-			grb::Vector< IOType > &p { data.p };  // direction vector
-			grb::Vector< IOType > &Ap { data.u }; // temp vector
-			grb::Vector< IOType > &z { data.z };  // pre-conditioned residual vector
-			grb::RC ret { SUCCESS };
+			const grb::Matrix< NonzeroType > &A = data.A;
+			grb::Vector< IOType > &x = data.x;
+			const grb::Vector< InputType > &b = data.b;
+			grb::Vector< IOType > &r = data.r;  // residual vector
+			grb::Vector< IOType > &p = data.p;  // direction vector
+			grb::Vector< IOType > &Ap = data.u; // temp vector
+			grb::Vector< IOType > &z = data.z;  // pre-conditioned residual vector
+			grb::RC ret = SUCCESS;
 
 			ret = ret ? ret : grb::set( Ap, 0 );
 			ret = ret ? ret : grb::set( r, 0 );
@@ -133,9 +132,11 @@ namespace grb {
 			norm_residual = std::sqrt( norm_residual );
 
 			// initial norm of residual
-			const ResidualType norm_residual_initial { norm_residual };
-			ResidualType old_r_dot_z { 0.0 }, r_dot_z { 0.0 }, beta { 0.0 };
-			size_t iter { 0 };
+			const ResidualType norm_residual_initial = norm_residual;
+			ResidualType old_r_dot_z = 0.0,
+			ResidualType r_dot_z = 0.0,
+			ResidualType beta = 0.0;
+			size_t iter = 0;
 
 #ifdef AMG_PRINT_STEPS
 			print_norm( p, "start p" );
@@ -149,7 +150,9 @@ namespace grb {
 #endif
 				if( with_preconditioning ) {
 					//ret = ret ? ret : grb::set( z, r ); // z = r;
-					ret = ret ? ret : internal::multi_grid( data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
+					ret = ret ? ret : internal::multi_grid(
+						data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus
+					);
 					//ret = ret ? ret : grb::set( x, z );
 					assert( ret == SUCCESS );
 				} else {
@@ -214,17 +217,21 @@ namespace grb {
 
 				norm_residual = std::sqrt( norm_residual );
 
-#ifdef AMG_PRINT_STEPS				
+#ifdef AMG_PRINT_STEPS
 				std::cout << " ---> norm_residual=" << norm_residual << "\n";
 #endif
 				++iter;
-			} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
-
+			} while(
+				iter < max_iterations &&
+				norm_residual / norm_residual_initial > tolerance &&
+				ret == SUCCESS
+				);
 			iterations = iter;
 			return ret;
 		}
 
 	} // namespace algorithms
+
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_AMG

From e987ce34e02d6ee4cd09c151c399196650592c9a Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Wed, 10 Aug 2022 11:50:05 +0200
Subject: [PATCH 15/17] forgot ;

---
 include/graphblas/algorithms/amg/amg.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/algorithms/amg/amg.hpp b/include/graphblas/algorithms/amg/amg.hpp
index 221765ba7..4468fe9bf 100644
--- a/include/graphblas/algorithms/amg/amg.hpp
+++ b/include/graphblas/algorithms/amg/amg.hpp
@@ -133,8 +133,8 @@ namespace grb {
 
 			// initial norm of residual
 			const ResidualType norm_residual_initial = norm_residual;
-			ResidualType old_r_dot_z = 0.0,
-			ResidualType r_dot_z = 0.0,
+			ResidualType old_r_dot_z = 0.0;
+			ResidualType r_dot_z = 0.0;
 			ResidualType beta = 0.0;
 			size_t iter = 0;
 

From 651c55a0e6388a9890c4354dc6c655ae95209d8d Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Wed, 10 Aug 2022 15:42:22 +0200
Subject: [PATCH 16/17] code style update

---
 include/graphblas/algorithms/amg/amg.hpp      |  66 +++--
 include/graphblas/algorithms/amg/amg_data.hpp | 115 +++++----
 .../algorithms/amg/multigrid_v_cycle.hpp      | 226 +++++++++---------
 .../algorithms/amg/system_building_utils.hpp  | 138 ++++++-----
 4 files changed, 296 insertions(+), 249 deletions(-)

diff --git a/include/graphblas/algorithms/amg/amg.hpp b/include/graphblas/algorithms/amg/amg.hpp
index 4468fe9bf..660b0e63a 100644
--- a/include/graphblas/algorithms/amg/amg.hpp
+++ b/include/graphblas/algorithms/amg/amg.hpp
@@ -41,25 +41,31 @@ namespace grb {
 		/**
 		 * Algebraic Multi-grid algorithm relying on level_ matrices from AMGCL.
 		 *
-		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running the AMG algorithm.
-		 * AMG implementation (as the standard one) couples a standard CG algorithm with a V-cycle
-		 * multi-grid solver to initially refine the tentative solution. This refinement step depends on the
-		 * availability of coarsening information, which should be stored inside \p data; otherwise,
-		 * the refinement is not performed and only the CG algorithm is run.
+		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running
+		 * the AMG algorithm.  AMG implementation (as the standard one) couples a
+		 * standard CG algorithm with a V-cycle  multi-grid solver to initially
+		 * refine the tentative solution. This refinement step depends on the
+		 * availability of coarsening information, which should be stored inside
+		 * \p data; otherwise, the refinement is not performed and only the CG
+		 * algorithm is run.
 		 *
-		 * This implementation assumes that the vectors and matrices inside \p data are all correctly initialized
-		 * and populated with the proper values; in particular
-		 * - amg_data#x with the initial tentative solution (iterative solutions are also stored here)
+		 * This implementation assumes that the vectors and matrices inside \p data
+		 * are all correctly initialized and populated with the proper values;
+		 * in particular
+		 * - amg_data#x with the initial tentative solution
+		 *              (iterative solutions are also stored here)
 		 * - amg_data#A with the system matrix
 		 * - amg_data#b with the right-hand side vector \f$ b \f$
 		 * - amg_data#A_diagonal with the diagonal values of the matrix
-		 * - amg_data#coarser_level with the information for the coarser multi-grid run (if any)
-		 * The other vectors are assumed to be inizialized (via the usual grb::Vector#Vector(size_t) constructor)
-		 * but not necessarily populated with values, as they are internally populated when needed; hence,
-		 * any previous values are overwritten.
+		 * - amg_data#coarser_level with the information for
+		 *                the coarser multi-grid run (if any)
+		 * The other vectors are assumed to be inizialized
+		 * (via the usual grb::Vector#Vector(size_t) constructor)
+		 * but not necessarily populated with values, as they are internally populated
+		 * when needed; hence, any previous values are overwritten.
 		 *
-		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
-		 * the failure code.
+		 * Failuers of GraphBLAS operations are handled by immediately stopping the
+		 * execution and by returning the failure code.
 		 *
 		 * @tparam IOType type of result and intermediate vectors used during computation
 		 * @tparam ResidualType type of the residual norm
@@ -68,27 +74,36 @@ namespace grb {
 		 * @tparam Ring the ring of algebraic operators zero-values
 		 * @tparam Minus the minus operator for subtractions
 		 *
-		 * @param[in,out] data \ref amg_data object storing inputs, outputs and temporary vectors used for the computation,
-		 *                     as long as the information for the recursive multi-grid runs
-		 * @param[in] with_preconditioning whether to use pre-conditioning, i.e. to perform multi-grid runs
+		 * @param[in,out] data \ref amg_data object storing inputs, outputs and temporary
+		 *                   vectors used for the computation, as long as the information
+		 *                   for the recursive multi-grid runs
+		 * @param[in] with_preconditioning whether to use pre-conditioning,
+		 *                     i.e. to perform multi-grid runs
 		 * @param[in] presmoother_steps number of pre-smoother steps, for multi-grid runs
-		 * @param[in] postsmoother_steps nomber of post-smoother steps, for multi-grid runs
-		 * @param[in] max_iterations maximum number if iterations the simulation may run for; once reached,
-		 *                           the simulation stops even if the residual norm is above \p tolerance
-		 * @param[in] tolerance the tolerance over the residual norm, i.e. the value of the residual norm to stop
-		 *                      the simulation at
+		 * @param[in] postsmoother_steps nomber of post-smoother steps,
+		 *                          for multi-grid runs
+		 * @param[in] max_iterations maximum number if iterations the simulation may run
+		 *                       for; once reached, the simulation stops even if the
+		 *                       residual norm is above \p tolerance
+		 * @param[in] tolerance the tolerance over the residual norm, i.e. the value of
+		 *                      the residual norm to stop  the simulation at
 		 * @param[out] iterations numbers of iterations performed
 		 * @param[out] norm_residual norm of the final residual
 		 * @param[in] ring the ring to perform the operations on
 		 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-		 *                          unsuccessful operation otherwise
+		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error
+		 *                          code of the first unsuccessful operation otherwise
 		 */
 		template< typename IOType,
 			typename ResidualType,
 			typename NonzeroType,
 			typename InputType,
-			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
+			class Ring = Semiring<
+				grb::operators::add< IOType >,
+				grb::operators::mul< IOType >,
+				grb::identities::zero,
+				grb::identities::one
+			>,
 			class Minus = operators::subtract< IOType > >
 		grb::RC amg(
 			amg_data< IOType, NonzeroType, InputType > &data,
@@ -103,7 +118,6 @@ namespace grb {
 			const Minus &minus = Minus()
 		) {
 			ResidualType alpha;
-
 			const grb::Matrix< NonzeroType > &A = data.A;
 			grb::Vector< IOType > &x = data.x;
 			const grb::Vector< InputType > &b = data.b;
diff --git a/include/graphblas/algorithms/amg/amg_data.hpp b/include/graphblas/algorithms/amg/amg_data.hpp
index 62e2310a9..2be2524db 100644
--- a/include/graphblas/algorithms/amg/amg_data.hpp
+++ b/include/graphblas/algorithms/amg/amg_data.hpp
@@ -16,10 +16,11 @@
  */
 
 /**
+ * Data structures to store AMG input/output data.
  * @file amg_data.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Data structures to store AMG input/output data.
- * @date 2021-04-30
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-10-08
  */
 
 #ifndef _H_GRB_ALGORITHMS_AMG_DATA
@@ -27,15 +28,14 @@
 
 #include <vector>
 #include <cstddef>
-
 #include <graphblas.hpp>
 
-
 namespace grb {
+
 	namespace algorithms {
 
 		/**
-		 * @brief basic data container for the AMG algorithm, storing \b only the
+		 * basic data container for the AMG algorithm, storing \b only the
 		 * data in common between the full CG run and the V-cycle multi-grid solver.
 		 * Additional data are stored in inheriting data structures.
 		 *
@@ -44,9 +44,7 @@ namespace grb {
 		 */
 		template< typename IOType, typename NonzeroType >
 		struct system_data {
-
 			const std::size_t system_size; ///< size of the system, i.e. side of the #A
-
 			grb::Matrix< NonzeroType > A;                   ///< system matrix
 			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
 			grb::Vector< IOType > z;                        ///< multi-grid solution
@@ -55,27 +53,26 @@ namespace grb {
 			std::vector< grb::Vector< bool > > color_masks; ///< for color masks
 
 			/**
-			 * @brief Constructor building all the stored vectors and matrices.
+			 * Constructor building all the stored vectors and matrices.
 			 *
-			 * Stored vectors and matrices are constructed according to \p sys_size but \b not initialized
+			 * Stored vectors and matrices are constructed according to
+			 * \p sys_size but \b not initialized
 			 * to any value internally, as initialization is up to users's code.
 			 *
-			 * @param[in] sys_size the size of the underlying physical system, i.e. the size of vectors and the number
-			 * of rows and columns of the #A matrix.
+			 * @param[in] sys_size the size of the underlying physical system,
+			 *                      i.e. the size of vectors and the number
+			 *                      of rows and columns of the #A matrix.
 			 */
 			system_data( std::size_t sys_size ) :
 				system_size( sys_size ), A( sys_size, sys_size ), A_diagonal( sys_size ), z( sys_size ), r( sys_size ),
-				// temp(sys_size),
 				smoother_temp( sys_size ) {}
-
 			// for safety, disable copy semantics
 			system_data( const system_data & o ) = delete;
-
 			system_data & operator=( const system_data & ) = delete;
 		};
 
 		/**
-		 * @brief Data container for all multi-grid inputs and outputs.
+		 * Data container for all multi-grid inputs and outputs.
 		 *
 		 * @tparam IOType Type of values of the vectors for intermediate results
 		 * @tparam NonzeroType Type of the values stored inside the system matrix \p A
@@ -86,50 +83,57 @@ namespace grb {
 		 * - coarsening information, in particular the #coarsening_matrix that
 		 *   coarsens a larger system of size #finer_size to the current system
 		 *   of size #system_size
-		 * - the next level of coarsening, pointed to by #coarser_level, possibly being \c nullptr
-		 *   if no further coarsening is desired; note that this information is automatically
-		 *   destructed on object destruction (if any)
+		 * - the next level of coarsening, pointed to by #coarser_level,
+		 *        possibly being \c nullptr if no further coarsening is desired;
+		 *        note that this information is automatically
+		 *        destructed on object destruction (if any)
 		 *
-		 * Vectors stored here refer to the \b coarsened system (with the exception of #Ax_finer),
+		 * Vectors stored here refer to the \b coarsened system
+		 * (with the exception of #Ax_finer),
 		 * thus having size #system_size; this also holds for the system matrix #A,
 		 * while #coarsening_matrix has size #system_size \f$ \times \f$ #finer_size.
-		 * Hence, the typical usage of this data structure is to coarsen \b external vectors, e.g. vectors
-		 * coming from another \code multi_grid_data<IOType, NonZeroType> \endcode object whose #system_size equals
-		 * \code this-> \endcode #fines_size, via \code this-> \endcode #coarsening_matrix and store the coarsened
-		 * vectors internally. Mimicing the recursive behavior of standard multi-grid simulations,
-		 * the information for a further coarsening is stored inside #coarser_level, so that the
-		 * hierarchy of coarsened levels is reflected inside this data structure.
+		 * Hence, the typical usage of this data structure is to coarsen \b external
+		 * vectors, e.g. vectors coming from another
+		 * \code multi_grid_data<IOType, NonZeroType>
+		 * \endcode object whose #system_size equals
+		 * \code this-> \endcode #fines_size, via
+		 * \code this-> \endcode #coarsening_matrix and store the coarsened
+		 * vectors internally. Mimicing the recursive behavior of standard multi-grid
+		 * simulations, the information for a further coarsening is stored inside
+		 * #coarser_level, so that the  hierarchy of coarsened levels is reflected
+		 * inside this data structure.
 		 *
-		 * As for \ref system_data, internal vectors and matrices are initialized to the proper size,
-		 * but their values are \b not initialized.
+		 * As for \ref system_data, internal vectors and matrices are initialized to the
+		 * proper size, but their values are \b not initialized.
 		 */
 		template< typename IOType, typename NonzeroType >
 		struct multi_grid_data : public system_data< IOType, NonzeroType > {
 
 			const std::size_t finer_size; ///< ssize of the finer system to coarse from;
-			///< typically \c finer_size \code == 8 * \endcode #system_size
-
 			grb::Vector< IOType > Ax_finer; ///< finer vector for intermediate computations, of size #finer_size
-
 			grb::Matrix< NonzeroType > coarsening_matrix; ///< matrix of size #system_size \f$ \times \f$ #finer_size
-			///< to coarsen an input vector of size #finer_size into a vector of size #system_size
-
-			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< pointer to next coarsening level, for recursive
+			                                              ///< to coarsen an input vector of size #finer_size
+			                                              ///< into a vector of size #system_size
+			struct multi_grid_data< IOType, NonzeroType > *coarser_level; ///< pointer to next coarsening level, for recursive
 			                                                               ///< multi-grid V cycle implementations
 
 			/**
-			 * @brief Construct a new \c multi_grid_data_object by initializing internal data structures and setting
-			 *        #coarser_level to \c nullptr.
-			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
-			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
+			 * Construct a new \c multi_grid_data_object by initializing internal
+			 *     data structures and setting #coarser_level to \c nullptr.
+			 * @param[in] coarser_size size of the current system,
+			 *            i.e. size \b after coarsening
+			 * @param[in] _finer_size  size of the finer system,
+			 *            i.e. size of external objects \b before coarsening
 			 */
 			multi_grid_data( std::size_t coarser_size, std::size_t _finer_size ) :
-				system_data< IOType, NonzeroType >( coarser_size ), finer_size( _finer_size ), Ax_finer( finer_size ), coarsening_matrix( coarser_size, finer_size ) {
+				system_data< IOType, NonzeroType >( coarser_size ), finer_size( _finer_size ),
+				Ax_finer( finer_size ), coarsening_matrix( coarser_size, finer_size ) {
 				coarser_level = nullptr;
 			}
 
 			/**
-			 * @brief Destroys the \c multi_grid_data_object object by destroying #coarser_level.
+			 * @brief Destroys the \c multi_grid_data_object object
+			 * by destroying #coarser_level.
 			 */
 			virtual ~multi_grid_data() {
 				if( coarser_level != nullptr ) {
@@ -139,16 +143,18 @@ namespace grb {
 		};
 
 		/**
-		 * @brief Data stucture to store the data for a full AMG run: system vectors and matrix,
-		 * coarsening information and temporary vectors.
+		 * @brief Data stucture to store the data for a full AMG run:
+		 * system vectors and matrix, coarsening information and temporary vectors.
 		 *
-		 * This data structures contains all the needed vectors and matrices to solve a linear system
-		 * \f$ A x = b \f$. As for \ref system_data, internal elements are built and their sizes properly initialized
-		 * to #system_size, but internal values are \b not initialized, as they are left to user's logic.
-		 * Similarly, the coarsening information in #coarser_level is to be initialized by users by properly
-		 * building a \code multi_grid_data<IOType, NonzeroType> \endcode object and storing its pointer into
-		 * #coarser_level; on destruction, #coarser_level will also be properly destroyed without
-		 * user's intervention.
+		 * This data structures contains all the needed vectors and matrices to
+		 * solve a linear system \f$ A x = b \f$. As for \ref system_data,
+		 * internal elements are built and their sizes properly initialized
+		 * to #system_size, but internal values are \b not initialized, as
+		 * they are left to user's logic. Similarly, the coarsening information
+		 * in #coarser_level is to be initialized by users by properly
+		 * building a \code multi_grid_data<IOType, NonzeroType> \endcode object
+		 * and storing its pointer into #coarser_level; on destruction, #coarser_level
+		 * will also be properly destroyed without user's intervention.
 		 *
 		 * @tparam IOType type of values of the vectors for intermediate results
 		 * @tparam NonzeroType type of the values stored inside the system matrix #A
@@ -163,21 +169,23 @@ namespace grb {
 			grb::Vector< IOType > x;    // system solution being refined over the iterations: it us up to the user
 			///< to set the initial solution value
 
-			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< information about the coarser system, for
+			struct multi_grid_data< IOType, NonzeroType > *coarser_level; ///< information about the coarser system, for
 			                                                               ///< the multi-grid run
 
 			/**
-			 * @brief Construct a new \c amg_data object by building vectors and matrices and by setting
-			 * #coarser_level to \c nullptr (i.e. no coarser level is assumed).
+			 * Construct a new \c amg_data object by building vectors and matrices
+			 * and by setting #coarser_level to \c nullptr (i.e. no coarser level is assumed).
 			 *
-			 * @param[in] sys_size the size of the simulated system, i.e. of all the internal vectors and matrices
+			 * @param[in] sys_size the size of the simulated system,
+			 *            i.e. of all the internal vectors and matrices
 			 */
 			amg_data( std::size_t sys_size ) : system_data< IOType, NonzeroType >( sys_size ), b( sys_size ), u( sys_size ), p( sys_size ), x( sys_size ) {
 				coarser_level = nullptr;
 			}
 
 			/**
-			 * @brief Destroy the \c amg_data object by destroying the #coarser_level informartion, if any.
+			 * Destroy the \c amg_data object by destroying the #coarser_level informartion,
+			 *  if any.
 			 */
 			virtual ~amg_data() {
 				if( coarser_level != nullptr ) {
@@ -187,6 +195,7 @@ namespace grb {
 		};
 
 	} // namespace algorithms
+
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_AMG_DATA
diff --git a/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
index 5d8a82c59..e19da1026 100644
--- a/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
+++ b/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,14 @@
  */
 
 /**
+ *
+ * This file contains the routines for multi-grid solution refinement,
+ *        including the main routine and those for coarsening and refinement
+ *        of the tentative solution.
  * @file multigrid_v_cycle.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief This file contains the routines for multi-grid solution refinement, including the main routine
- *        and those for coarsening and refinement of the tentative solution.
- * @date 2021-04-30
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-08-10
  */
 
 #ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
@@ -28,139 +31,148 @@
 
 #include <cassert>
 #include <vector>
-
 #include <graphblas.hpp>
-
 #include "amg_data.hpp"
-
 #include <utils/print_vec_mat.hpp>
 
-
-
 #define DBG_println( args ) std::cout << args << std::endl;
 
-
 namespace grb {
+
 	namespace algorithms {
 
-		
 		/**
-		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
+		 * Namespace for interfaces that should not be used outside of the algorithm
+		 * namespace.
 		 */
 		namespace internal {
 
-			
 			/**
-			 * @brief computes the coarser residual vector \p coarsening_data.r by coarsening
-			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
+			 * computes the coarser residual vector \p coarsening_data.r by coarsening
+			 *        \p coarsening_data.Ax_finer - \p r_fine via
+			 * \p coarsening_data.coarsening_matrix.
 			 *
 			 * The coarsening information are stored inside \p coarsening_data.
 			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam IOType type of result and intermediate vectors used during
+			 *         computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
 			 * @tparam Minus the minus operator for subtractions
 			 *
 			 * @param[in] r_fine fine residual vector
-			 * @param[in,out] coarsening_data \ref multi_grid_data data structure storing the information for coarsening
+			 * @param[in,out] coarsening_data \ref multi_grid_data data structure storing
+			 *                the information for coarsening
 			 * @param[in] ring the ring to perform the operations on
 			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the
+			 *                 error code of the first unsuccessful operation otherwise
 			 */
 			template< typename IOType,
 				typename NonzeroType,
 				class Ring,
 				class Minus >
-			grb::RC compute_coarsening( const grb::Vector< IOType > & r_fine, // fine residual
-				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring,
-				const Minus & minus ) {
-				RC ret { SUCCESS };
-				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine, coarsening_data.Ax_finer,
-									  minus ); // Ax_finer = r_fine - Ax_finer
+			grb::RC compute_coarsening(
+				const grb::Vector< IOType > &r_fine, // fine residual
+				struct multi_grid_data< IOType, NonzeroType > &coarsening_data,
+				const Ring &ring,
+				const Minus &minus
+				) {
+				RC ret = SUCCESS;
+				ret = ret ? ret : grb::eWiseApply(
+					coarsening_data.Ax_finer,
+					r_fine, coarsening_data.Ax_finer,
+					minus
+				); // Ax_finer = r_fine - Ax_finer
 				assert( ret == SUCCESS );
 
-				// actual coarsening, from  ncols(*coarsening_data->A) == *coarsening_data->system_size * 8
-				// to *coarsening_data->system_size
 				ret = ret ? ret : grb::set( coarsening_data.r, 0 );
-				ret = ret ? ret : grb::mxv( coarsening_data.r, coarsening_data.coarsening_matrix, coarsening_data.Ax_finer,
-									  ring ); // r = coarsening_matrix * Ax_finer
+				ret = ret ? ret : grb::mxv(
+					coarsening_data.r, coarsening_data.coarsening_matrix,
+					coarsening_data.Ax_finer, ring
+				); // r = coarsening_matrix * Ax_finer
 				return ret;
 			}
 
 			/**
-			 * @brief computes the prolongation of the coarser solution \p coarsening_data.z and stores it into
-			 * \p x_fine.
+			 * computes the prolongation of the coarser solution \p coarsening_data.z and
+			 *  stores it into \p x_fine.
 			 *
-			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
+			 * For prolongation, this function uses the matrix
+			 *   \p coarsening_data.coarsening_matrix by transposing it.
 			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam IOType type of result and intermediate vectors used
+			 * during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
 			 *
-			 * @param[out] x_fine the solution vector to store the prolonged solution into
+			 * @param[out] x_fine the solution vector to store the
+			 *             prolonged solution into
 			 * @param[in,out] coarsening_data information for coarsening
 			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 * unsuccessful operation otherwise
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate,
+			 *              the error code of the first unsuccessful operation otherwise
 			 */
 			template< typename IOType,
 				typename NonzeroType,
 				class Ring >
-			grb::RC compute_prolongation( grb::Vector< IOType > & x_fine, // fine residual
-				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring ) {
-				RC ret { SUCCESS };
-				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
-				// to nrows(x_fine)
+			grb::RC compute_prolongation(
+				grb::Vector< IOType > &x_fine, // fine residual
+				struct multi_grid_data< IOType, NonzeroType > &coarsening_data,
+				const Ring &ring
+				) {
+				RC ret = SUCCESS;
 				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
 
-				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix >( coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, coarsening_data.z, ring );
+				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix >(
+					coarsening_data.Ax_finer,
+					coarsening_data.coarsening_matrix,
+					coarsening_data.z,
+					ring
+				);
 				assert( ret == SUCCESS );
-
-				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
+				// x_fine += Ax_finer;
+				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() );
 				assert( ret == SUCCESS );
 				return ret;
 			}
 
 			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC spai0_smoother( system_data< IOType, NonzeroType > & data, const Ring & ring ) {
-				RC ret { SUCCESS };
+			grb::RC spai0_smoother( system_data< IOType, NonzeroType > &data, const Ring &ring ) {
+				RC ret = SUCCESS;
 
 				NonzeroType alpha = 1.;
 				ret = ret ? ret : grb::eWiseMulAdd( data.z, alpha, data.A_diagonal, data.r, ring );
 				assert( ret == SUCCESS );
-					
 				return ret;
 			}
-			
 
 			/**
-			 * @brief Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother, with inputs and outputs stored
-			 * inside \p data.
+			 * Runs \p smoother_steps iteration of the SPAI0 smoother,
+			 *        with inputs and outputs stored inside \p data.
 			 *
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
 			 *
-			 * @param[in,out] data \ref system_data data structure with relevant inpus and outputs: system matrix, initial solution,
-			 *                     residual, system matrix colors, temporary vectors
+			 * @param[in,out] data \ref system_data data structure with relevant inpus
+			 *                 and outputs: system matrix, initial solution, residual,
+			 *                 system matrix colors, temporary vectors
 			 * @param[in] smoother_steps how many smoothing steps to run
 			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate,
+			 *         the error code of the first unsuccessful operation otherwise
 			 */
 			template< typename IOType, typename NonzeroType, class Ring, class Minus >
-			grb::RC run_spai0_smoother( system_data< IOType, NonzeroType > & data,
-								  const std::size_t smoother_steps, const Ring & ring, const Minus & minus ) {
-				RC ret { SUCCESS };
+			grb::RC run_spai0_smoother( system_data< IOType, NonzeroType > &data,
+								  const size_t smoother_steps, const Ring &ring, const Minus &minus ) {
+				RC ret = SUCCESS;
 
-				for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
+				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
 					ret = ret ? ret : grb::set( data.smoother_temp, 0 );
 					ret = ret ? ret : grb::mxv( data.smoother_temp, data.A, data.z, ring );
 					ret = ret ? ret : grb::eWiseApply( data.smoother_temp, data.r, data.smoother_temp,
-													   minus ); 
+													   minus );
 
 #ifdef HPCG_PRINT_STEPS
 					std::cout << " data.A(spai0): " << nrows(data.A) << " x " << ncols(data.A) << " \n";
@@ -171,20 +183,20 @@ namespace grb {
 
 					ret = ret ? ret :
 						grb::eWiseLambda(
-										 [ &data ]( const size_t i ) {
+							[ &data ]( const size_t i ) {
 #ifdef HPCG_PRINT_STEPS
-											 if( i < 10 || i + 10 > size( data.A_diagonal ) ){
-												 std::cout << " i= " << i
-														   << " data.z[i]= " << data.z[i]
-														   << " data.A_diagonal[i]= " << data.A_diagonal[i]
-														   << " data.smoother_temp[i]= " << data.smoother_temp[i]
-														   << "\n";
-											 }
-											 std::cout << "\n";
+								if( i < 10 || i + 10 > size( data.A_diagonal ) ) {
+									std::cout << " i= " << i
+									          << " data.z[i]= " << data.z[ i ]
+									          << " data.A_diagonal[i]= " << data.A_diagonal[ i ]
+									          << " data.smoother_temp[i]= " << data.smoother_temp[ i ]
+									          << "\n";
+								}
+								std::cout << "\n";
 #endif
-											 data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
-										 },
-										 data.A_diagonal, data.z, data.smoother_temp );
+								data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
+							},
+							data.A_diagonal, data.z, data.smoother_temp );
 
 					assert( ret == SUCCESS );
 				}
@@ -196,44 +208,48 @@ namespace grb {
 			 * @brief Multi-grid V cycle implementation to refine a given solution.
 			 *
 			 * A full multi-grid run goes through the following steps:
-			 * -# if \p presmoother_steps \f$ > 0 \f$, \p presmoother_steps of the Red-Black Gauss-Seidel smoother are run
-			 *    to improve on the initial solution stored into \p data.z
-			 * -# the coarsening of \f$ r - A*z \f$ is computed to find the coarser residual vector
+			 * -# if \p presmoother_steps \f$ > 0 \f$, \p presmoother_steps of the SPAI0
+			 *    smoother, run to improve on the initial solution stored into \p data.z
+			 * -# the coarsening of \f$ r - A*z \f$ is computed to find the
+			 *      coarser residual vector
 			 * -# a multi-grid run is recursively performed on the coarser system
-			 * -# the tentative solution from the coarser multi-grid run is prolonged and added to the current tentative solution
-			 *    into \p data.z
+			 * -# the tentative solution from the coarser multi-grid run is prolonged
+			 *     and added to the current tentative solution into \p data.z
 			 * -# this solution is further smoothed for \p postsmoother_steps steps
 			 *
-			 * If coarsening information is not available, the multi-grid run consists in a single smmothing run.
+			 * If coarsening information is not available, the multi-grid run
+			 * consists in a single smmothing run.
 			 *
-			 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
-			 * the failure code.
+			 * Failuers of GraphBLAS operations are handled by immediately
+			 * stopping the execution and by returning the failure code.
 			 *
 			 * @tparam IOType type of result and intermediate vectors used during computation
 			 * @tparam NonzeroType type of matrix values
 			 * @tparam Ring the ring of algebraic operators zero-values
 			 * @tparam Minus the minus operator for subtractions
 			 *
-			 * @param[in,out] data \ref multi_grid_data object storing the relevant data for the multi-grid run of the current
-			 *                     clevel
-			 * @param[in,out] coarsening_data pointer to information for the coarsening/refinement operations and for the
-			 *                recursive multi-grid run on the coarsened system; if \c nullptr, no coarsening/refinement occurs
+			 * @param[in,out] data \ref multi_grid_data object storing the relevant data
+			 *            for the multi-grid run of the current clevel
+			 * @param[in,out] coarsening_data pointer to information for the
+			 *                coarsening/refinement operations and
+			 *                for the recursive multi-grid run on the coarsened system;
+			 *                if \c nullptr, no coarsening/refinement occurs
 			 *                and only smoothing occurs on the current solution
 			 * @param[in] presmoother_steps number of pre-smoother steps
 			 * @param[in] postsmoother_steps number of post-smoother steps
 			 * @param[in] ring the ring to perform the operations on
 			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate,
+			 *         the error code of the first unsuccessful operation otherwise
 			 */
 			template< typename IOType, typename NonzeroType, class Ring, class Minus >
-			grb::RC multi_grid( system_data< IOType, NonzeroType > & data,
-				struct multi_grid_data< IOType, NonzeroType > * const coarsening_data,
+			grb::RC multi_grid( system_data< IOType, NonzeroType > &data,
+				struct multi_grid_data< IOType, NonzeroType > *const coarsening_data,
 				const size_t presmoother_steps,
 				const size_t postsmoother_steps,
-				const Ring & ring,
-				const Minus & minus ) {
-				RC ret { SUCCESS };
+				const Ring &ring,
+				const Minus &minus ) {
+				RC ret = SUCCESS;
 
 #ifdef HPCG_PRINT_STEPS
 				DBG_println( "mg BEGINNING {" );
@@ -242,7 +258,7 @@ namespace grb {
 				// clean destination vector
 				//ret = ret ? ret : grb::set( data.z, data.r );
 				ret = ret ? ret : grb::set( data.z, 0 );
-				
+
 #ifdef HPCG_PRINT_STEPS
 				print_norm( data.z, "first print smoothed z" );
 				print_norm( data.r, "initial r" );
@@ -258,9 +274,7 @@ namespace grb {
 					return ret;
 				}
 
-				struct multi_grid_data< IOType, NonzeroType > & cd {
-					*coarsening_data
-				};
+				struct multi_grid_data< IOType, NonzeroType > &cd = *coarsening_data;
 
 				// pre-smoother
 				ret = ret ? ret : run_spai0_smoother( data, presmoother_steps, ring, minus );
@@ -268,40 +282,34 @@ namespace grb {
 #ifdef HPCG_PRINT_STEPS
 				print_norm( data.z, "pre-smoothed z" );
 #endif
-
 				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
 				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
 				assert( ret == SUCCESS );
 
-				
 #ifdef HPCG_PRINT_STEPS
 				std::cout << "  data.A: " << nrows(data.A) << " x " << ncols(data.A) << " \n";
 				print_norm( cd.r, "before coarse cd.r" );
-#endif				
-
+#endif
 				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
 				assert( ret == SUCCESS );
 
-				
 #ifdef HPCG_PRINT_STEPS
-				print_norm( cd.z, "after cd.coarse z" );				
+				print_norm( cd.z, "after cd.coarse z" );
 				print_norm( cd.r, "after cd.coarse r" );
 #endif
-
-				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );			
+				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
 				assert( ret == SUCCESS );
 
 				ret = ret ? ret : compute_prolongation( data.z, cd, ring );
 				assert( ret == SUCCESS );
-				
+
 #ifdef HPCG_PRINT_STEPS
 				print_norm( data.z, "prolonged z" );
 #endif
-
 				// post-smoother
 				ret = ret ? ret : run_spai0_smoother( data, postsmoother_steps, ring, minus );
 				assert( ret == SUCCESS );
-				
+
 #ifdef HPCG_PRINT_STEPS
 				print_norm( data.z, "post-smoothed z" );
 				DBG_println( "} mg END" );
@@ -311,7 +319,9 @@ namespace grb {
 			}
 
 		} // namespace internal
-	}     // namespace algorithms
+
+	} // namespace algorithms
+
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
diff --git a/include/graphblas/algorithms/amg/system_building_utils.hpp b/include/graphblas/algorithms/amg/system_building_utils.hpp
index 30a98e78b..f2a431ec9 100644
--- a/include/graphblas/algorithms/amg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/amg/system_building_utils.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,11 @@
  */
 
 /**
+ * Utilities to build an antire system for AMG simulations.
  * @file amg_system_building_utils.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build an antire system for AMG simulations in an arbitrary number of dimensions.
- * @date 2021-04-30
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-08-10
  */
 
 #ifndef _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
@@ -29,98 +30,104 @@
 #include <cassert>
 #include <cstddef>
 #include <memory>
-
 #include <graphblas.hpp>
 #include <graphblas/utils/parser.hpp>
-
 #include "amg_data.hpp"
 
 namespace grb {
+
 	namespace algorithms {
 
 		/**
-		 * @brief Generates an entire AMG problem according to the parameters in \p params , storing it in \p holder .
+		 * Generates an entire AMG problem, storing it in \p holder.
 		 *
 		 * @tparam DIMS dimensions of the system
 		 * @tparam T type of matrix values
 		 * @param holder std::unique_ptr to store the AMG problem into
 		 * @param params parameters container to build the AMG problem
-		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors and matrices) succeeded,
-		 * otherwise the first unsuccessful return value
+		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors
+		 *          and matrices) succeeded, otherwise the first unsuccessful return value
 		 */
 		template< typename T = double, typename SYSINP >
-		grb::RC build_amg_system( std::unique_ptr< grb::algorithms::amg_data< T, T, T > > & holder, const std::size_t max_levels, SYSINP &in ) {
-			grb::RC rc { grb::SUCCESS };
+		grb::RC build_amg_system(
+				std::unique_ptr< grb::algorithms::amg_data< T, T, T > > &holder,
+				const std::size_t max_levels,
+				SYSINP &in
+			) {
+			grb::RC rc = grb::SUCCESS;
 			std::size_t coarsening_level = 0UL;
 			const size_t n_A = in.matAbuffer[ coarsening_level ].get_n();
-			grb::algorithms::amg_data< T, T, T > * data { new grb::algorithms::amg_data< T, T, T >( n_A ) };
-			rc = buildMatrixUnique( data->A,
-									in.matAbuffer[ coarsening_level ].i_data,
-									in.matAbuffer[ coarsening_level ].j_data,
-									in.matAbuffer[ coarsening_level ].v_data,
-									in.matAbuffer[ coarsening_level ].size(),
-									SEQUENTIAL
-									);
-
-			/* Once internal issue #342 is resolved this can be re-enabled
-			   const RC rc = buildMatrixUnique( L,
-			   parser.begin( PARALLEL ), parser.end( PARALLEL),
-			   PARALLEL
-			   );*/
+			grb::algorithms::amg_data< T, T, T > *data = new grb::algorithms::amg_data< T, T, T >( n_A );
+			rc = buildMatrixUnique(
+				data->A,
+				in.matAbuffer[ coarsening_level ].i_data,
+				in.matAbuffer[ coarsening_level ].j_data,
+				in.matAbuffer[ coarsening_level ].v_data,
+				in.matAbuffer[ coarsening_level ].size(),
+				SEQUENTIAL
+			);
+
 			if( rc != SUCCESS ) {
 				std::cerr << "Failure: call to buildMatrixUnique did not succeed "
 						  << "(" << toString( rc ) << ")." << std::endl;
 				return rc;
 			}
 #ifdef AMG_PRINT_STEPS
-			std::cout << " buildMatrixUnique: constructed data->A " << nrows(data->A) << " x " << ncols(data->A) << " matrix \n";
+			std::cout << " buildMatrixUnique: constructed data->A " << nrows( data->A )
+			          << " x " << ncols( data->A ) << " matrix \n";
 #endif
 			assert( ! holder ); // should be empty
 			holder = std::unique_ptr< grb::algorithms::amg_data< T, T, T > >( data );
 
 			{
-				RC rc = grb::buildVector( data->A_diagonal,
-										  in.matMbuffer[coarsening_level].v_data,
-										  in.matMbuffer[coarsening_level].v_data + in.matMbuffer[coarsening_level].size(),
-										  SEQUENTIAL );
+				RC rc = grb::buildVector(
+					data->A_diagonal,
+					in.matMbuffer[ coarsening_level ].v_data,
+					in.matMbuffer[ coarsening_level ].v_data + in.matMbuffer[ coarsening_level ].size(),
+					SEQUENTIAL
+				);
 				if ( rc != SUCCESS ) {
 					std::cerr << " buildVector failed!\n ";
 					return rc;
 				}
 #ifdef AMG_PRINT_STEPS
 				std::cout << " buildVector: data->A_diagonal "
-						  << size(data->A_diagonal) << " vector \n";
+				          << size( data->A_diagonal ) << " vector \n";
 #endif
 			}
 
 			std::size_t coarser_size;
-			std::size_t previous_size=n_A;
-			
-			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
-			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
+			std::size_t previous_size = n_A;
+
+			// initialize coarsening with additional pointers and
+			// dimensions copies to iterate and divide
+			grb::algorithms::multi_grid_data< T, T > **coarser = &data->coarser_level;
 			assert( *coarser == nullptr );
-			
+
 			// generate linked list of hierarchical coarseners
 			while( coarsening_level  < max_levels ) {
 				assert( *coarser == nullptr );
-				
+
 				coarser_size = in.matAbuffer[ coarsening_level + 1 ].get_n();
 
 				// build data structures for new level
-				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
-				
+				grb::algorithms::multi_grid_data< double, double > *new_coarser =
+					new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size );
+
 				// install coarser level immediately to cleanup in case of build error
 				*coarser = new_coarser;
 
-				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
+				// initialize coarsener matrix, system matrix and
+				// diagonal vector for the coarser level
 				{
-					rc = buildMatrixUnique( new_coarser->coarsening_matrix,
-											in.matRbuffer[ coarsening_level ].i_data,
-											in.matRbuffer[ coarsening_level ].j_data,
-											in.matRbuffer[ coarsening_level ].v_data,
-											in.matRbuffer[ coarsening_level ].size(),
-											SEQUENTIAL
-											);
+					rc = buildMatrixUnique(
+						new_coarser->coarsening_matrix,
+						in.matRbuffer[ coarsening_level ].i_data,
+						in.matRbuffer[ coarsening_level ].j_data,
+						in.matRbuffer[ coarsening_level ].v_data,
+						in.matRbuffer[ coarsening_level ].size(),
+						SEQUENTIAL
+					);
 
 					if( rc != SUCCESS ) {
 						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
@@ -129,40 +136,45 @@ namespace grb {
 					}
 #ifdef AMG_PRINT_STEPS
 					std::cout << " buildMatrixUnique: constructed new_coarser->coarsening_matrix "
-							  << nrows(new_coarser->coarsening_matrix) << " x " << ncols(new_coarser->coarsening_matrix) << " matrix \n";
+					          << nrows(new_coarser->coarsening_matrix) << " x "
+					          << ncols(new_coarser->coarsening_matrix) << " matrix \n";
 #endif
 				}
 				{
-					rc = buildMatrixUnique( new_coarser->A,
-											in.matAbuffer[ coarsening_level + 1 ].i_data,
-											in.matAbuffer[ coarsening_level + 1 ].j_data,
-											in.matAbuffer[ coarsening_level + 1 ].v_data,
-											in.matAbuffer[ coarsening_level + 1 ].size(),
-											SEQUENTIAL
-											);
+					rc = buildMatrixUnique(
+						new_coarser->A,
+						in.matAbuffer[ coarsening_level + 1 ].i_data,
+						in.matAbuffer[ coarsening_level + 1 ].j_data,
+						in.matAbuffer[ coarsening_level + 1 ].v_data,
+						in.matAbuffer[ coarsening_level + 1 ].size(),
+						SEQUENTIAL
+					);
 
 					if( rc != SUCCESS ) {
 						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
-								  << "(" << toString( rc ) << ")." << std::endl;
+						          << "(" << toString( rc ) << ")." << std::endl;
 						return rc;
 					}
 #ifdef AMG_PRINT_STEPS
 					std::cout << " buildMatrixUnique: constructed new_coarser->A "
-							  << nrows(new_coarser->A) << " x " << ncols(new_coarser->A) << " matrix \n";
+					          << nrows(new_coarser->A) << " x " << ncols(new_coarser->A) << " matrix \n";
 #endif
 				}
-				
-				RC rc = grb::buildVector( new_coarser->A_diagonal,
-										  in.matMbuffer[ coarsening_level + 1 ].v_data,
-										  in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
-										  SEQUENTIAL );
+
+				RC rc = grb::buildVector(
+					new_coarser->A_diagonal,
+					in.matMbuffer[ coarsening_level + 1 ].v_data,
+					in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
+					SEQUENTIAL
+				);
+
 				if ( rc != SUCCESS ) {
 					std::cerr << " buildVector failed!\n ";
 					return rc;
 				}
 #ifdef AMG_PRINT_STEPS
 				std::cout << " buildVector: new_coarser->A_diagonal "
-						  << size(new_coarser->A_diagonal) << " vector \n";
+				          << size(new_coarser->A_diagonal) << " vector \n";
 #endif
 
 				// prepare for new iteration
@@ -170,10 +182,12 @@ namespace grb {
 				previous_size = coarser_size;
 				coarsening_level++;
 			}
+
 			return rc;
 		}
 
 	} // namespace algorithms
+
 } // namespace grb
 
 #endif // _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS

From d1f2fe3f8f68dd3092b3417038c0b499002a1a7d Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Wed, 10 Aug 2022 16:18:29 +0200
Subject: [PATCH 17/17] update code style

---
 tests/smoke/amg.cpp | 345 ++++++++++++++++++++++----------------------
 1 file changed, 174 insertions(+), 171 deletions(-)

diff --git a/tests/smoke/amg.cpp b/tests/smoke/amg.cpp
index 4d42b3185..380e614f8 100644
--- a/tests/smoke/amg.cpp
+++ b/tests/smoke/amg.cpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,13 @@
  */
 
 /**
+ * Test for AMG solver, levels provided by AMGCL.
  * @file amg.cpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Test for AMG solver, levels provided by AMGCL.
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
  *
  *
- * @date 2021-04-30
+ * @date 2022-10-08
  */
 #include <array>
 #include <cassert>
@@ -31,13 +32,22 @@
 #include <memory>
 #include <type_traits>
 #include <string>
-
 #include <graphblas.hpp>
-
+#include <graphblas/algorithms/amg/amg.hpp>
+#include <graphblas/algorithms/amg/system_building_utils.hpp>
+#include <graphblas/utils/Timer.hpp>
+#include <utils/argument_parser.hpp>
+#include <utils/assertions.hpp>
+#include <utils/print_vec_mat.hpp>
 
 // forward declaration for the tracing facility
 template< typename T,
-	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
+	class Ring = grb::Semiring<
+		grb::operators::add< T >,
+		grb::operators::mul< T >,
+		grb::identities::zero,
+		grb::identities::one
+	>
 >
 void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
 
@@ -46,36 +56,22 @@ void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring
 // AMG_PRINT_STEPS requires defining the following symbols
 
 /**
- * @brief simply prints \p args on a dedicated line.
+ * simply prints \p args on a dedicated line.
  */
 #define DBG_println( args ) std::cout << args << std::endl;
 
-
 /**
- * @brief prints \p head and the norm of \p r.
+ * prints \p head and the norm of \p r.
  */
 #define DBG_print_norm( vec, head ) print_norm( vec, head )
 #endif
 
-
-#include <graphblas/algorithms/amg/amg.hpp>
-#include <graphblas/algorithms/amg/system_building_utils.hpp>
-
-// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
-
-
-#include <graphblas/utils/Timer.hpp>
-
-#include <utils/argument_parser.hpp>
-#include <utils/assertions.hpp>
-#include <utils/print_vec_mat.hpp>
-
 //========== MAIN PROBLEM PARAMETERS =========
 // values modifiable via cmd line args: default set as in reference AMG
-constexpr std::size_t DEF_COARSENING_LEVELS{ 1U };
-constexpr std::size_t MAX_COARSENING_LEVELS{ 4U };
-constexpr std::size_t MAX_ITERATIONS_DEF{ 56UL };
-constexpr std::size_t SMOOTHER_STEPS_DEF{ 1 };
+constexpr size_t DEF_COARSENING_LEVELS{ 1U };
+constexpr size_t MAX_COARSENING_LEVELS{ 4U };
+constexpr size_t MAX_ITERATIONS_DEF{ 56UL };
+constexpr size_t SMOOTHER_STEPS_DEF{ 1 };
 //============================================
 
 constexpr double MAX_NORM { 4.0e-14 };
@@ -87,43 +83,42 @@ static const char * const TEXT_HIGHLIGHT = "===> ";
 #define thcout ( std::cout << TEXT_HIGHLIGHT )
 #define thcerr ( std::cerr << TEXT_HIGHLIGHT )
 
-
 /**
- * @brief Container to store matrices loaded from a file.
+ * Container to store matrices loaded from a file.
  */
 template< typename T = double >
 struct mat_data {
-	std::size_t *i_data;
-	std::size_t *j_data;
+	size_t *i_data;
+	size_t *j_data;
 	T *v_data;
-	std::size_t nz, n, m;
+	size_t nz, n, m;
 
-	void resize( std::size_t innz, std::size_t inn, std::size_t inm ){
+	void resize( size_t innz, size_t inn, size_t inm ){
 		nz=innz;
 		n=inn;
 		m=inm;
-		i_data = new std::size_t [ nz ];
-		j_data = new std::size_t [ nz ];
+		i_data = new size_t [ nz ];
+		j_data = new size_t [ nz ];
 		v_data = new T [ nz ];
 	};
 
 	mat_data( ){
 		nz = 0;
 	};
-	
-	mat_data( std::size_t in ){
+
+	mat_data( size_t in ){
 		resize( in );
 	};
 
-	std::size_t size(){
+	size_t size(){
 		return nz;
 	};
 
-	std::size_t get_n(){
+	size_t get_n(){
 		return n;
 	};
 
-	std::size_t get_m(){
+	size_t get_m(){
 		return m;
 	};
 
@@ -135,14 +130,14 @@ struct mat_data {
 };
 
 /**
- * @brief Container to store vectors loaded from a file.
+ * Container to store vectors loaded from a file.
  */
 template< typename T = double >
 struct vec_data {
 	T *v_data;
-	std::size_t n;
+	size_t n;
 
-	void resize( std::size_t in ){
+	void resize( size_t in ){
 		n=in;
 		v_data = new T [ n ];
 	};
@@ -150,12 +145,12 @@ struct vec_data {
 	vec_data( ){
 		n = 0;
 	};
-	
-	vec_data( std::size_t in ){
+
+	vec_data( size_t in ){
 		resize( in );
 	};
 
-	std::size_t size(){
+	size_t size(){
 		return n;
 	};
 
@@ -164,32 +159,31 @@ struct vec_data {
 	};
 };
 
-
 static bool matloaded = false;
 
 /**
- * @brief Container for the parameters for the AMG simulation.
+ * Container for the parameters for the AMG simulation.
  */
 struct simulation_input {
-	std::size_t max_coarsening_levels;
-	std::size_t test_repetitions;
-	std::size_t max_iterations;
-	std::size_t smoother_steps;
+	size_t max_coarsening_levels;
+	size_t test_repetitions;
+	size_t max_iterations;
+	size_t smoother_steps;
 	const char * matAfile_c_str;
+	// vectors of input matrix filenames
 	std::vector< std::string > matAfiles;
 	std::vector< std::string > matMfiles;
 	std::vector< std::string > matPfiles;
 	std::vector< std::string > matRfiles;
-	
 	bool evaluation_run;
 	bool no_preconditioning;
 };
 
-template<typename CharT, typename TraitsT = std::char_traits<CharT> >
+template<typename CharT, typename TraitsT = std::char_traits< CharT > >
 class vectorwrapbuf :
 	public std::basic_streambuf<CharT, TraitsT> {
 public:
-    vectorwrapbuf(std::vector<CharT> &vec) {
+	vectorwrapbuf(  std::vector<CharT> &vec) {
 		this->setg( vec.data(), vec.data(), vec.data() + vec.size() );
     }
 };
@@ -198,28 +192,30 @@ std::istream& operator>>(std::istream& is, std::string& s){
 	return is;
 }
 
-
 /**
  * @brief Container to store all data for AMG hierarchy.
  */
 class preloaded_matrices : public simulation_input {
 
 public :
-	std::size_t nzAmt, nzMmt, nzPmt, nzRmt;
-	mat_data< double > * matAbuffer;
-	vec_data< double > * matMbuffer;
-	mat_data< double > * matPbuffer;
-	mat_data< double > * matRbuffer;
-
-	grb::RC read_matrix( std::vector< std::string > & fname,
-						  mat_data<double> * data ) {
+	size_t nzAmt, nzMmt, nzPmt, nzRmt;
+	mat_data< double > *matAbuffer;
+	vec_data< double > *matMbuffer;
+	mat_data< double > *matPbuffer;
+	mat_data< double > *matRbuffer;
+
+	grb::RC read_matrix(
+		std::vector< std::string > &fname,
+		mat_data<double> *data
+	) {
 		grb::RC rc = SUCCESS;
 		for ( size_t i = 0; i < fname.size(); i++ ) {
-			grb::utils::MatrixFileReader< double, std::conditional<
-				( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
-													  grb::config::RowIndexType,
-													  grb::config::ColIndexType >::type
-										  > parser_mat( fname[ i ].c_str(), true );
+			grb::utils::MatrixFileReader<
+				double, std::conditional<
+					( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+							grb::config::RowIndexType,
+							grb::config::ColIndexType >::type
+				> parser_mat( fname[ i ].c_str(), true );
 #ifdef DEBUG
 			std::cout << " ---> parser_mat.filename()=" << parser_mat.filename() << "\n";
 			std::cout << " ---> parser_mat.nz()=" << parser_mat.nz() << "\n";
@@ -227,22 +223,23 @@ public :
 			std::cout << " ---> parser_mat.m()=" << parser_mat.m() << "\n";
 			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
 #endif
+			// very poor choice and temp solution size = 2 x nz
 			data[i].resize( parser_mat.entries()*2, parser_mat.n(), parser_mat.m() );
-			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
+			std::ifstream inFile( fname[ i ], std::ios::binary | std::ios::ate );
 			if( ! inFile.is_open() ) {
 				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
 				return( PANIC );
 			}
 			std::streamsize size = inFile.tellg();
-			inFile.seekg(0, std::ios::beg);
-			std::vector<char> buffer(size);
-			if ( inFile.read(buffer.data(), size) )	{
-				/* all fine! */
+			inFile.seekg( 0, std::ios::beg );
+			std::vector< char > buffer( size );
+			if ( inFile.read( buffer.data(), size ) ) {
+				// all fine
 			}
 			else {
 				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
 				return( PANIC );
-			} 
+			}
 			inFile.close();
 			vectorwrapbuf< char > databuf( buffer );
 			std::istream isdata( &databuf );
@@ -267,30 +264,30 @@ public :
 			data[ i ].nz = k;
 		}
 		return ( rc );
-	}	
+	}
 
-	grb::RC read_vector( std::vector< std::string > & fname,
-						 vec_data< double > * data ) {
+	grb::RC read_vector( std::vector< std::string > &fname,
+						 vec_data< double > *data ) {
 		grb::RC rc = SUCCESS;
 		for ( size_t i = 0; i < fname.size(); i++ ){
 #ifdef DEBUG
 			std::cout << " Reading " << fname[ i ].c_str() << ".\n";
 #endif
-			std::ifstream inFile(fname[ i ], std::ios::binary | std::ios::ate);
+			std::ifstream inFile( fname[ i ], std::ios::binary | std::ios::ate );
 			if( ! inFile.is_open() ) {
 				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
 				return( PANIC );
 			}
 			std::streamsize size = inFile.tellg();
-			inFile.seekg(0, std::ios::beg);
-			std::vector<char> buffer(size);
-			if (inFile.read(buffer.data(), size))	{
-				/* alles gut! */
+			inFile.seekg( 0, std::ios::beg );
+			std::vector< char > buffer( size );
+			if ( inFile.read( buffer.data(), size ) ) {
+				// all fine
 			}
 			else {
 				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
 				return( PANIC );
-			} 
+			}
 			inFile.close();
 			vectorwrapbuf< char > databuf( buffer );
 			std::istream isdata( &databuf );
@@ -303,7 +300,7 @@ public :
 			std::stringstream ss( headder );
 			ss >> n >> m;
 			std::cout << " Reading from" << fname[ i ] << " dense matrix with dimensions: "
-					  << " n x m = " << n << " x " << m << "\n";
+			          << " n x m = " << n << " x " << m << "\n";
 			data[ i ].resize( n );
 			for ( size_t j = 0; j < n; j++ ) {
 				isdata >> data[ i ].v_data[ j ];
@@ -311,7 +308,7 @@ public :
 		}
 		return( rc );
 	}
-	
+
 	grb::RC read_vec_matrics(){
 		grb::RC rc = SUCCESS;
 
@@ -324,7 +321,7 @@ public :
 		matMbuffer = new vec_data< double > [ nzMmt ];
 		matPbuffer = new mat_data< double > [ nzPmt ];
 		matRbuffer = new mat_data< double > [ nzRmt ];
-		
+
 		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
 		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
 		// rc = rc ? rc : read_matrix( matPfiles, matPbuffer );
@@ -339,7 +336,7 @@ public :
 preloaded_matrices inputData;
 
 /**
- * @brief Containers for test outputs.
+ * Containers for test outputs.
  */
 struct output {
 	RC error_code;
@@ -358,22 +355,6 @@ struct output {
 	}
 };
 
-/**
- * @brief Returns the closets power of 2 bigger or equal to \p n .
- */
-template< typename T = std::size_t >
-T static next_pow_2( T n ) {
-	static_assert( std::is_integral< T >::value, "Integral required." );
-	--n;
-	n |= ( n >> 1 );
-	for( unsigned i = 1; i <= sizeof( T ) * 4; i *= 2 ) {
-		const unsigned shift = static_cast< T >( 1U ) << i;
-		n |= ( n >> shift );
-	}
-	return n + 1;
-}
-
-
 #ifdef AMG_PRINT_SYSTEM
 static void print_system( const amg_data< double, double, double > & data ) {
 	print_matrix( data.A, 70, "A" );
@@ -387,11 +368,16 @@ static void print_system( const amg_data< double, double, double > & data ) {
 #endif
 
 #ifdef AMG_PRINT_STEPS
-template< typename T,
-		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
+template<
+	typename T,
+	class Ring = Semiring<
+		grb::operators::add< T >,
+		grb::operators::mul< T >,
+		grb::identities::zero,
+		grb::identities::one >
 	>
-void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm=0;
+void print_norm( const grb::Vector< T > &r, const char *head, const Ring &ring ) {
+	T norm = 0;
 	RC ret = grb::dot( norm, r, r, ring ); // residual = r' * r;
 	(void)ret;
 	assert( ret == SUCCESS );
@@ -404,17 +390,16 @@ void print_norm( const grb::Vector< T > & r, const char * head, const Ring & rin
 #endif
 
 /**
- * @brief Main test, building an AMG problem and running the simulation closely following the
- * parameters in the reference AMG test.
+ * Main test, building an AMG problem and running the simulation closely
+ *  following the parameters in the reference AMG test.
  */
-void grbProgram( const simulation_input & in, struct output & out ) {
+void grbProgram( const simulation_input &in, struct output &out ) {
 	grb::utils::Timer timer;
 	timer.reset();
 
 	// get user process ID
 	assert( spmd<>::pid() < spmd<>::nprocs() );
 
-
 	// assume successful run
 	out.error_code = SUCCESS;
 	RC rc { SUCCESS };
@@ -451,23 +436,29 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
-	Matrix< double > & A { amg_state->A };
-	Vector< double > & x { amg_state->x };
-	Vector< double > & b { amg_state->b };
+	Matrix< double > &A = amg_state->A;
+	Vector< double > &x = amg_state->x;
+	Vector< double > &b = amg_state->b;
 
 	// set vectors as from standard AMG benchmark
 	set( x, 1.0 );
 	set( b, 0.0 );
-	rc = grb::mxv( b, A, x, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
+	rc = grb::mxv( b, A, x,
+		grb::Semiring< grb::operators::add< double >,
+			grb::operators::mul< double >,
+			grb::identities::zero,
+			grb::identities::one >() );
 	set( x, 0.0 );
 
-	double norm_b=0;
-	rc = grb::dot( norm_b, b, b, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
+	double norm_b = 0;
+	rc = grb::dot( norm_b, b, b,
+		grb::Semiring< grb::operators::add< double >,
+			grb::operators::mul< double >,
+			grb::identities::zero,
+			grb::identities::one >() );
 	(void)rc;
 	assert( rc == SUCCESS );
 
-	
-
 #ifdef AMG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {
 		print_vector( x, 50, " ---> X(1)" );
@@ -475,8 +466,6 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	}
 #endif
 
-	std::cout  << " ---> before solver \n";
-
 	out.times.preamble = timer.time();
 	timer.reset();
 
@@ -484,7 +473,8 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	out.test_repetitions = 0;
 	if( in.evaluation_run ) {
 		double single_time_start = timer.time();
-		rc = amg( *amg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+		rc = amg( *amg_state, with_preconditioning, in.smoother_steps,
+			in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
 		double single_time = timer.time() - single_time_start;
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
@@ -497,8 +487,9 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 		for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
 			rc = set( x, 0.0 );
 			assert( rc == SUCCESS );
-			rc = amg( *amg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-			
+			rc = amg( *amg_state, with_preconditioning, in.smoother_steps,
+				in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+
 			out.test_repetitions++;
 			if( rc != SUCCESS ) {
 				break;
@@ -511,7 +502,7 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 #ifdef AMG_PRINT_SYSTEM
 		rc = rc ? rc : grb::eWiseLambda( [ &x ]( const size_t i ) { x[ i ] -= 1;}, x );
-		print_norm (x, " ---> norm(x)");
+		print_norm (x, " norm(x)");
 #endif
 		// sleep( 1 );
 	}
@@ -519,19 +510,24 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 
 #ifdef AMG_PRINT_SYSTEM
 	if( spmd<>::pid() == 0 ) {
-		print_vector( x, 50, " ---> X(check out)" );
-		print_vector( b, 50, " ---> B(check out)" );
+		print_vector( x, 50, " x(first 50 elements)" );
+		print_vector( b, 50, " b(first 50 elements)" );
 	}
 #endif
 
 	if( spmd<>::pid() == 0 ) {
 		if( rc == SUCCESS ) {
 			if( in.evaluation_run ) {
-				std::cout << "Info: cold AMG completed within " << out.performed_iterations << " iterations. Last computed residual is " << out.residual << ". Time taken was " << out.times.useful
-						  << " ms. Deduced inner repetitions parameter of " << out.test_repetitions << " to take 1 second or more per inner benchmark." << std::endl;
+				std::cout << "Info: cold AMG completed within " << out.performed_iterations
+				          << " iterations. Last computed residual is " << out.residual
+				          << ". Time taken was " << out.times.useful
+				          << " ms. Deduced inner repetitions parameter of " << out.test_repetitions
+				          << " to take 1 second or more per inner benchmark." << std::endl;
 			} else {
-				std::cout << "Final residual= "<< out.residual << " relative error= " <<  out.residual/sqrt(norm_b) << "\n";
-				std::cout << "Average time taken for each of " << out.test_repetitions << " AMG calls (hot start): " << out.times.useful << std::endl;
+				std::cout << "Final residual= "<< out.residual << " relative error= "
+				          <<  out.residual/sqrt(norm_b) << "\n";
+				std::cout << "Average time taken for each of " << out.test_repetitions
+				          << " AMG calls (hot start): " << out.times.useful << std::endl;
 			}
 		} else {
 			std::cerr << "Failure: call to AMG did not succeed (" << toString( rc ) << ")." << std::endl;
@@ -543,7 +539,10 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 	// set error code
 	out.error_code = rc;
 
-	Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one > ring;
+	Semiring< grb::operators::add< double >,
+		grb::operators::mul< double >,
+		grb::identities::zero,
+		grb::identities::one > ring;
 	grb::set( b, 1.0 );
 	out.square_norm_diff = 0.0;
 	grb::eWiseMul( b, -1.0, x, ring );
@@ -557,9 +556,10 @@ void grbProgram( const simulation_input & in, struct output & out ) {
 }
 
 /**
- * @brief Parser the command-line arguments to extract the simulation information and checks they are valid.
+ * Parser the command-line arguments to extract the simulation information
+ * and checks they are valid.
  */
-static void parse_arguments( simulation_input &, std::size_t &, double &, int, char ** );
+static void parse_arguments( simulation_input &, size_t &, double &, int, char ** );
 
 int main( int argc, char ** argv ) {
 	simulation_input sim_in;
@@ -580,7 +580,7 @@ int main( int argc, char ** argv ) {
 	struct output out;
 
 	// set standard exit code
-	grb::RC rc { SUCCESS };
+	grb::RC rc = SUCCESS;
 
 	// launch estimator (if requested)
 	if( sim_in.evaluation_run ) {
@@ -598,12 +598,13 @@ int main( int argc, char ** argv ) {
 	grb::Benchmarker< AUTOMATIC > benchmarker;
 	rc = benchmarker.exec( &grbProgram, sim_in, out, 1, test_outer_iterations, true );
 	ASSERT_RC_SUCCESS( rc );
-	thcout << "Benchmark completed successfully and took " << out.performed_iterations << " iterations to converge with residual " << out.residual << std::endl;
+	thcout << "Benchmark completed successfully and took " << out.performed_iterations
+	       << " iterations to converge with residual " << out.residual << std::endl;
 
 	if( ! out.pinnedVector ) {
 		thcerr << "no output vector to inspect" << std::endl;
 	} else {
-		const PinnedVector< double > &solution { *out.pinnedVector };
+		const PinnedVector< double > &solution = *out.pinnedVector;
 		thcout << "Size of x is " << solution.size() << std::endl;
 		if( solution.size() > 0 ) {
 			print_vector( solution, 30, "SOLUTION" );
@@ -614,7 +615,7 @@ int main( int argc, char ** argv ) {
 
 	ASSERT_RC_SUCCESS( out.error_code );
 
-	double residual_norm { sqrt( out.square_norm_diff ) };
+	double residual_norm = sqrt( out.square_norm_diff );
 	thcout << "Residual norm: " << residual_norm << std::endl;
 
 	ASSERT_LT( residual_norm, max_residual_norm );
@@ -623,69 +624,70 @@ int main( int argc, char ** argv ) {
 	return 0;
 }
 
-static void parse_arguments( simulation_input & sim_in, std::size_t & outer_iterations, double & max_residual_norm, int argc, char ** argv ) {
+static void parse_arguments( simulation_input &sim_in, size_t &outer_iterations,
+	double &max_residual_norm, int argc, char **argv ) {
 
 	argument_parser parser;
-	parser
-		.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
-								"maximum level for coarsening; 0 means no coarsening; note: actual "
-								"level may be limited by the minimum system dimension" )
-		.add_optional_argument( "--mat_files_pattern", sim_in.matAfile_c_str, "","file pattern for files contining matrices A, M_diag, P, R "
-		 						"i.e. '--mat_a_file_names /path/to/dir/level_  --max_coarse-levels 2' will read /path/to/dir/level_0_A.mtx,  /path/to/dir/level_1_A.mtx, /path/to/dir/level_2_A.mtx ... " )
-		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
-		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
-		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of AMG iterations" )
+	parser.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
+			"maximum level for coarsening; 0 means no coarsening; note: actual "
+			"level may be limited by the minimum system dimension" )
+		.add_optional_argument( "--mat_files_pattern", sim_in.matAfile_c_str,
+			"file pattern for files contining matrices A, M_diag, P, R "
+			"i.e. '--mat_a_file_names /path/to/dir/level_  --max_coarse-levels 2' will read "
+			"/path/to/dir/level_0_A.mtx,  /path/to/dir/level_1_A.mtx, /path/to/dir/level_2_A.mtx ... " )
+		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(),
+			"consecutive test repetitions before benchmarking" )
+		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(),
+			"test repetitions with complete initialization" )
+		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF,
+			"maximum number of AMG iterations" )
 		.add_optional_argument( "--max-residual-norm", max_residual_norm, MAX_NORM,
 			"maximum norm for the residual to be acceptable (does NOT limit "
 			"the execution of the algorithm)" )
-		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF, "number of pre/post-smoother steps; 0 disables smoothing" )
+		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps,
+			SMOOTHER_STEPS_DEF, "number of pre/post-smoother steps; 0 disables smoothing" )
 		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
-			"launch single run directly, without benchmarker (ignore "
-			"repetitions)" )
-		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" );
-	
+			"launch single run directly, without benchmarker (ignore repetitions)" )
+		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false,
+			"do not apply pre-conditioning via multi-grid V cycle" );
 
 	parser.parse( argc, argv );
-	
-	std::string matAfile=sim_in.matAfile_c_str;
-	if ( !matAfile.empty() ) {
+
+	std::string matAfile = sim_in.matAfile_c_str;
+	if ( ! matAfile.empty() ) {
 		std::cout << "Using  <<" << matAfile << ">> pattern to read matrices " << std::endl;
-		for ( size_t i=0 ; i < sim_in.max_coarsening_levels ; i++ ){
+		for ( size_t i = 0 ; i < sim_in.max_coarsening_levels ; i++ ){
 			std::string fnamebase = matAfile + std::to_string(  i );
 			std::string fnameA = fnamebase + "_A.mtx";
 			std::string fnameM = fnamebase + "_M_diag.mtx";
 			std::string fnameP = fnamebase + "_P.mtx";
 			std::string fnameR = fnamebase + "_R.mtx";
-			sim_in.matAfiles.push_back (fnameA);
-			sim_in.matMfiles.push_back (fnameM);
-			sim_in.matPfiles.push_back (fnameP);
-			sim_in.matRfiles.push_back (fnameR);
-			
+			sim_in.matAfiles.push_back( fnameA );
+			sim_in.matMfiles.push_back( fnameM );
+			sim_in.matPfiles.push_back( fnameP );
+			sim_in.matRfiles.push_back( fnameR );
 		}
 		{
 			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
-
 			std::string fnameA = fnamebase + "_A.mtx";
 			sim_in.matAfiles.push_back (fnameA);
-
 			std::string fnameM = fnamebase + "_M_diag.mtx";
 			sim_in.matMfiles.push_back (fnameM);
-		}		
-
+		}
 		std::cout << "files to read matrices: " << std::endl;
-		for (std::string fname: sim_in.matAfiles) {
+		for ( std::string fname: sim_in.matAfiles ) {
 			std::cout << fname << " \n";
 		}
 
-		for (std::string fname: sim_in.matMfiles) {
+		for ( std::string fname: sim_in.matMfiles ) {
 			std::cout << fname << " \n";
 		}
 
-		for (std::string fname: sim_in.matPfiles) {
+		for ( std::string fname: sim_in.matPfiles ) {
 			std::cout << fname << " \n";
 		}
 
-		for (std::string fname: sim_in.matRfiles) {
+		for ( std::string fname: sim_in.matRfiles ) {
 			std::cout << fname << " \n";
 		}
 
@@ -696,7 +698,8 @@ static void parse_arguments( simulation_input & sim_in, std::size_t & outer_iter
 
 	// check for valid values
 	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
-		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
+		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS
+		          << " instead of " << sim_in.max_coarsening_levels << std::endl;
 		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;
 	}
 	if( sim_in.test_repetitions == 0 ) {