diff --git a/include/graphblas/algorithms/amg/amg.hpp b/include/graphblas/algorithms/amg/amg.hpp
new file mode 100644
index 000000000..660b0e63a
--- /dev/null
+++ b/include/graphblas/algorithms/amg/amg.hpp
@@ -0,0 +1,251 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Algebraic Multi-grid algorithm relying on level_ matrices from AMGCL.
+ * @file amg.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-08-10
+ */
+
+#ifndef _H_GRB_ALGORITHMS_AMG
+#define _H_GRB_ALGORITHMS_AMG
+#ifdef _DEBUG
+ #define AMG_PRINT_STEPS
+#endif
+
+#include <graphblas.hpp>
+#include "amg_data.hpp"
+#include "multigrid_v_cycle.hpp"
+#include <utils/print_vec_mat.hpp>
+
+namespace grb {
+
+	namespace algorithms {
+
+		/**
+		 * Algebraic Multi-grid algorithm relying on level_ matrices from AMGCL.
+		 *
+		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running
+		 * the AMG algorithm.  AMG implementation (as the standard one) couples a
+		 * standard CG algorithm with a V-cycle  multi-grid solver to initially
+		 * refine the tentative solution. This refinement step depends on the
+		 * availability of coarsening information, which should be stored inside
+		 * \p data; otherwise, the refinement is not performed and only the CG
+		 * algorithm is run.
+		 *
+		 * This implementation assumes that the vectors and matrices inside \p data
+		 * are all correctly initialized and populated with the proper values;
+		 * in particular
+		 * - amg_data#x with the initial tentative solution
+		 *              (iterative solutions are also stored here)
+		 * - amg_data#A with the system matrix
+		 * - amg_data#b with the right-hand side vector \f$ b \f$
+		 * - amg_data#A_diagonal with the diagonal values of the matrix
+		 * - amg_data#coarser_level with the information for
+		 *                the coarser multi-grid run (if any)
+		 * The other vectors are assumed to be inizialized
+		 * (via the usual grb::Vector#Vector(size_t) constructor)
+		 * but not necessarily populated with values, as they are internally populated
+		 * when needed; hence, any previous values are overwritten.
+		 *
+		 * Failuers of GraphBLAS operations are handled by immediately stopping the
+		 * execution and by returning the failure code.
+		 *
+		 * @tparam IOType type of result and intermediate vectors used during computation
+		 * @tparam ResidualType type of the residual norm
+		 * @tparam NonzeroType type of matrix values
+		 * @tparam InputType type of values of the right-hand side vector b
+		 * @tparam Ring the ring of algebraic operators zero-values
+		 * @tparam Minus the minus operator for subtractions
+		 *
+		 * @param[in,out] data \ref amg_data object storing inputs, outputs and temporary
+		 *                   vectors used for the computation, as long as the information
+		 *                   for the recursive multi-grid runs
+		 * @param[in] with_preconditioning whether to use pre-conditioning,
+		 *                     i.e. to perform multi-grid runs
+		 * @param[in] presmoother_steps number of pre-smoother steps, for multi-grid runs
+		 * @param[in] postsmoother_steps nomber of post-smoother steps,
+		 *                          for multi-grid runs
+		 * @param[in] max_iterations maximum number if iterations the simulation may run
+		 *                       for; once reached, the simulation stops even if the
+		 *                       residual norm is above \p tolerance
+		 * @param[in] tolerance the tolerance over the residual norm, i.e. the value of
+		 *                      the residual norm to stop  the simulation at
+		 * @param[out] iterations numbers of iterations performed
+		 * @param[out] norm_residual norm of the final residual
+		 * @param[in] ring the ring to perform the operations on
+		 * @param[in] minus the \f$ - \f$ operator for vector subtractions
+		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error
+		 *                          code of the first unsuccessful operation otherwise
+		 */
+		template< typename IOType,
+			typename ResidualType,
+			typename NonzeroType,
+			typename InputType,
+			class Ring = Semiring<
+				grb::operators::add< IOType >,
+				grb::operators::mul< IOType >,
+				grb::identities::zero,
+				grb::identities::one
+			>,
+			class Minus = operators::subtract< IOType > >
+		grb::RC amg(
+			amg_data< IOType, NonzeroType, InputType > &data,
+			bool with_preconditioning,
+			const size_t presmoother_steps,
+			const size_t postsmoother_steps,
+			const size_t max_iterations,
+			const ResidualType tolerance,
+			size_t &iterations,
+			ResidualType &norm_residual,
+			const Ring &ring = Ring(),
+			const Minus &minus = Minus()
+		) {
+			ResidualType alpha;
+			const grb::Matrix< NonzeroType > &A = data.A;
+			grb::Vector< IOType > &x = data.x;
+			const grb::Vector< InputType > &b = data.b;
+			grb::Vector< IOType > &r = data.r;  // residual vector
+			grb::Vector< IOType > &p = data.p;  // direction vector
+			grb::Vector< IOType > &Ap = data.u; // temp vector
+			grb::Vector< IOType > &z = data.z;  // pre-conditioned residual vector
+			grb::RC ret = SUCCESS;
+
+			ret = ret ? ret : grb::set( Ap, 0 );
+			ret = ret ? ret : grb::set( r, 0 );
+			ret = ret ? ret : grb::set( p, 0 );
+
+			ret = ret ? ret : grb::set( p, x );
+			ret = ret ? ret : grb::mxv( Ap, A, x, ring ); // Ap = A * x
+			assert( ret == SUCCESS );
+
+			ret = ret ? ret : grb::eWiseApply( r, b, Ap, minus ); // r = b - Ap;
+			assert( ret == SUCCESS );
+
+			norm_residual = ring.template getZero< ResidualType >();
+			ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // norm_residual = r' * r;
+			assert( ret == SUCCESS );
+
+			// compute sqrt to avoid underflow
+			norm_residual = std::sqrt( norm_residual );
+
+			// initial norm of residual
+			const ResidualType norm_residual_initial = norm_residual;
+			ResidualType old_r_dot_z = 0.0;
+			ResidualType r_dot_z = 0.0;
+			ResidualType beta = 0.0;
+			size_t iter = 0;
+
+#ifdef AMG_PRINT_STEPS
+			print_norm( p, "start p" );
+			print_norm( Ap, "start Ap" );
+			print_norm( r, "start r" );
+#endif
+
+			do {
+#ifdef AMG_PRINT_STEPS
+				DBG_println( "========= iteration " << iter << " =========" );
+#endif
+				if( with_preconditioning ) {
+					//ret = ret ? ret : grb::set( z, r ); // z = r;
+					ret = ret ? ret : internal::multi_grid(
+						data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus
+					);
+					//ret = ret ? ret : grb::set( x, z );
+					assert( ret == SUCCESS );
+				} else {
+					ret = ret ? ret : grb::set( z, r ); // z = r;
+					assert( ret == SUCCESS );
+				}
+#ifdef AMG_PRINT_STEPS
+				print_norm( z, "initial z" );
+#endif
+
+				ResidualType pAp;
+
+				if( iter == 0 ) {
+					ret = ret ? ret : grb::set( p, z ); //  p = z;
+					assert( ret == SUCCESS );
+
+					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
+					assert( ret == SUCCESS );
+				} else {
+					old_r_dot_z = r_dot_z;
+
+					r_dot_z = ring.template getZero< ResidualType >();
+					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
+					assert( ret == SUCCESS );
+
+					beta = r_dot_z / old_r_dot_z;
+					ret = ret ? ret : grb::clear( Ap );                         // Ap  = 0;
+					ret = ret ? ret : grb::eWiseMulAdd( Ap, beta, p, z, ring ); // Ap += beta * p + z;
+					std::swap( Ap, p );                                         // p = Ap;
+					assert( ret == SUCCESS );
+				}
+#ifdef AMG_PRINT_STEPS
+				print_norm( p, "middle p" );
+#endif
+				ret = ret ? ret : grb::set( Ap, 0 );
+				ret = ret ? ret : grb::mxv( Ap, A, p, ring ); // Ap = A * p;
+
+				assert( ret == SUCCESS );
+#ifdef AMG_PRINT_STEPS
+				print_norm( Ap, "middle Ap" );
+#endif
+				pAp = static_cast< ResidualType >( 0.0 );
+				ret = ret ? ret : grb::dot( pAp, Ap, p, ring ); // pAp = p' * Ap
+				assert( ret == SUCCESS );
+
+				alpha = r_dot_z / pAp;
+
+				ret = ret ? ret : grb::eWiseMul( x, alpha, p, ring ); // x += alpha * p;
+				assert( ret == SUCCESS );
+#ifdef AMG_PRINT_STEPS
+				print_norm( x, "end x" );
+#endif
+				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, ring ); // r += - alpha * Ap;
+				assert( ret == SUCCESS );
+#ifdef AMG_PRINT_STEPS
+				print_norm( r, "end r" );
+#endif
+
+				norm_residual = static_cast< ResidualType >( 0.0 );
+				ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // residual = r' * r;
+				assert( ret == SUCCESS );
+
+				norm_residual = std::sqrt( norm_residual );
+
+#ifdef AMG_PRINT_STEPS
+				std::cout << " ---> norm_residual=" << norm_residual << "\n";
+#endif
+				++iter;
+			} while(
+				iter < max_iterations &&
+				norm_residual / norm_residual_initial > tolerance &&
+				ret == SUCCESS
+				);
+			iterations = iter;
+			return ret;
+		}
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_AMG
diff --git a/include/graphblas/algorithms/amg/amg_data.hpp b/include/graphblas/algorithms/amg/amg_data.hpp
new file mode 100644
index 000000000..2be2524db
--- /dev/null
+++ b/include/graphblas/algorithms/amg/amg_data.hpp
@@ -0,0 +1,201 @@
+
+/*
+ *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Data structures to store AMG input/output data.
+ * @file amg_data.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-10-08
+ */
+
+#ifndef _H_GRB_ALGORITHMS_AMG_DATA
+#define _H_GRB_ALGORITHMS_AMG_DATA
+
+#include <vector>
+#include <cstddef>
+#include <graphblas.hpp>
+
+namespace grb {
+
+	namespace algorithms {
+
+		/**
+		 * basic data container for the AMG algorithm, storing \b only the
+		 * data in common between the full CG run and the V-cycle multi-grid solver.
+		 * Additional data are stored in inheriting data structures.
+		 *
+		 * @tparam IOType type of values of the vectors for intermediate results
+		 * @tparam NonzeroType type of the values stored inside the system matrix #A
+		 */
+		template< typename IOType, typename NonzeroType >
+		struct system_data {
+			const std::size_t system_size; ///< size of the system, i.e. side of the #A
+			grb::Matrix< NonzeroType > A;                   ///< system matrix
+			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
+			grb::Vector< IOType > z;                        ///< multi-grid solution
+			grb::Vector< IOType > r;                        ///< residual
+			grb::Vector< IOType > smoother_temp;            ///< for smoother's intermediate results
+			std::vector< grb::Vector< bool > > color_masks; ///< for color masks
+
+			/**
+			 * Constructor building all the stored vectors and matrices.
+			 *
+			 * Stored vectors and matrices are constructed according to
+			 * \p sys_size but \b not initialized
+			 * to any value internally, as initialization is up to users's code.
+			 *
+			 * @param[in] sys_size the size of the underlying physical system,
+			 *                      i.e. the size of vectors and the number
+			 *                      of rows and columns of the #A matrix.
+			 */
+			system_data( std::size_t sys_size ) :
+				system_size( sys_size ), A( sys_size, sys_size ), A_diagonal( sys_size ), z( sys_size ), r( sys_size ),
+				smoother_temp( sys_size ) {}
+			// for safety, disable copy semantics
+			system_data( const system_data & o ) = delete;
+			system_data & operator=( const system_data & ) = delete;
+		};
+
+		/**
+		 * Data container for all multi-grid inputs and outputs.
+		 *
+		 * @tparam IOType Type of values of the vectors for intermediate results
+		 * @tparam NonzeroType Type of the values stored inside the system matrix \p A
+		 *                     and the coarsening matrix #Ax_finer
+		 *
+		 * This data structure stores information for a full multi-grid V cycle, i.e.
+		 * - input and output vectors for solution, residual and temporary vectors
+		 * - coarsening information, in particular the #coarsening_matrix that
+		 *   coarsens a larger system of size #finer_size to the current system
+		 *   of size #system_size
+		 * - the next level of coarsening, pointed to by #coarser_level,
+		 *        possibly being \c nullptr if no further coarsening is desired;
+		 *        note that this information is automatically
+		 *        destructed on object destruction (if any)
+		 *
+		 * Vectors stored here refer to the \b coarsened system
+		 * (with the exception of #Ax_finer),
+		 * thus having size #system_size; this also holds for the system matrix #A,
+		 * while #coarsening_matrix has size #system_size \f$ \times \f$ #finer_size.
+		 * Hence, the typical usage of this data structure is to coarsen \b external
+		 * vectors, e.g. vectors coming from another
+		 * \code multi_grid_data<IOType, NonZeroType>
+		 * \endcode object whose #system_size equals
+		 * \code this-> \endcode #fines_size, via
+		 * \code this-> \endcode #coarsening_matrix and store the coarsened
+		 * vectors internally. Mimicing the recursive behavior of standard multi-grid
+		 * simulations, the information for a further coarsening is stored inside
+		 * #coarser_level, so that the  hierarchy of coarsened levels is reflected
+		 * inside this data structure.
+		 *
+		 * As for \ref system_data, internal vectors and matrices are initialized to the
+		 * proper size, but their values are \b not initialized.
+		 */
+		template< typename IOType, typename NonzeroType >
+		struct multi_grid_data : public system_data< IOType, NonzeroType > {
+
+			const std::size_t finer_size; ///< ssize of the finer system to coarse from;
+			grb::Vector< IOType > Ax_finer; ///< finer vector for intermediate computations, of size #finer_size
+			grb::Matrix< NonzeroType > coarsening_matrix; ///< matrix of size #system_size \f$ \times \f$ #finer_size
+			                                              ///< to coarsen an input vector of size #finer_size
+			                                              ///< into a vector of size #system_size
+			struct multi_grid_data< IOType, NonzeroType > *coarser_level; ///< pointer to next coarsening level, for recursive
+			                                                               ///< multi-grid V cycle implementations
+
+			/**
+			 * Construct a new \c multi_grid_data_object by initializing internal
+			 *     data structures and setting #coarser_level to \c nullptr.
+			 * @param[in] coarser_size size of the current system,
+			 *            i.e. size \b after coarsening
+			 * @param[in] _finer_size  size of the finer system,
+			 *            i.e. size of external objects \b before coarsening
+			 */
+			multi_grid_data( std::size_t coarser_size, std::size_t _finer_size ) :
+				system_data< IOType, NonzeroType >( coarser_size ), finer_size( _finer_size ),
+				Ax_finer( finer_size ), coarsening_matrix( coarser_size, finer_size ) {
+				coarser_level = nullptr;
+			}
+
+			/**
+			 * @brief Destroys the \c multi_grid_data_object object
+			 * by destroying #coarser_level.
+			 */
+			virtual ~multi_grid_data() {
+				if( coarser_level != nullptr ) {
+					delete coarser_level;
+				}
+			}
+		};
+
+		/**
+		 * @brief Data stucture to store the data for a full AMG run:
+		 * system vectors and matrix, coarsening information and temporary vectors.
+		 *
+		 * This data structures contains all the needed vectors and matrices to
+		 * solve a linear system \f$ A x = b \f$. As for \ref system_data,
+		 * internal elements are built and their sizes properly initialized
+		 * to #system_size, but internal values are \b not initialized, as
+		 * they are left to user's logic. Similarly, the coarsening information
+		 * in #coarser_level is to be initialized by users by properly
+		 * building a \code multi_grid_data<IOType, NonzeroType> \endcode object
+		 * and storing its pointer into #coarser_level; on destruction, #coarser_level
+		 * will also be properly destroyed without user's intervention.
+		 *
+		 * @tparam IOType type of values of the vectors for intermediate results
+		 * @tparam NonzeroType type of the values stored inside the system matrix #A
+		 * @tparam InputType type of the values of the right-hand side vector #b
+		 */
+		template< typename IOType, typename NonzeroType, typename InputType >
+		struct amg_data : public system_data< IOType, NonzeroType > {
+
+			grb::Vector< InputType > b; ///< right-side vector of known values
+			grb::Vector< IOType > u;    ///< temporary vectors (typically for CG exploration directions)
+			grb::Vector< IOType > p;    ///< temporary vector (typically for x refinements coming from the multi-grid run)
+			grb::Vector< IOType > x;    // system solution being refined over the iterations: it us up to the user
+			///< to set the initial solution value
+
+			struct multi_grid_data< IOType, NonzeroType > *coarser_level; ///< information about the coarser system, for
+			                                                               ///< the multi-grid run
+
+			/**
+			 * Construct a new \c amg_data object by building vectors and matrices
+			 * and by setting #coarser_level to \c nullptr (i.e. no coarser level is assumed).
+			 *
+			 * @param[in] sys_size the size of the simulated system,
+			 *            i.e. of all the internal vectors and matrices
+			 */
+			amg_data( std::size_t sys_size ) : system_data< IOType, NonzeroType >( sys_size ), b( sys_size ), u( sys_size ), p( sys_size ), x( sys_size ) {
+				coarser_level = nullptr;
+			}
+
+			/**
+			 * Destroy the \c amg_data object by destroying the #coarser_level informartion,
+			 *  if any.
+			 */
+			virtual ~amg_data() {
+				if( coarser_level != nullptr ) {
+					delete coarser_level;
+				}
+			}
+		};
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_AMG_DATA
diff --git a/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
new file mode 100644
index 000000000..e19da1026
--- /dev/null
+++ b/include/graphblas/algorithms/amg/multigrid_v_cycle.hpp
@@ -0,0 +1,327 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ * This file contains the routines for multi-grid solution refinement,
+ *        including the main routine and those for coarsening and refinement
+ *        of the tentative solution.
+ * @file multigrid_v_cycle.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-08-10
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+#define _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+
+#include <cassert>
+#include <vector>
+#include <graphblas.hpp>
+#include "amg_data.hpp"
+#include <utils/print_vec_mat.hpp>
+
+#define DBG_println( args ) std::cout << args << std::endl;
+
+namespace grb {
+
+	namespace algorithms {
+
+		/**
+		 * Namespace for interfaces that should not be used outside of the algorithm
+		 * namespace.
+		 */
+		namespace internal {
+
+			/**
+			 * computes the coarser residual vector \p coarsening_data.r by coarsening
+			 *        \p coarsening_data.Ax_finer - \p r_fine via
+			 * \p coarsening_data.coarsening_matrix.
+			 *
+			 * The coarsening information are stored inside \p coarsening_data.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during
+			 *         computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 * @tparam Minus the minus operator for subtractions
+			 *
+			 * @param[in] r_fine fine residual vector
+			 * @param[in,out] coarsening_data \ref multi_grid_data data structure storing
+			 *                the information for coarsening
+			 * @param[in] ring the ring to perform the operations on
+			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the
+			 *                 error code of the first unsuccessful operation otherwise
+			 */
+			template< typename IOType,
+				typename NonzeroType,
+				class Ring,
+				class Minus >
+			grb::RC compute_coarsening(
+				const grb::Vector< IOType > &r_fine, // fine residual
+				struct multi_grid_data< IOType, NonzeroType > &coarsening_data,
+				const Ring &ring,
+				const Minus &minus
+				) {
+				RC ret = SUCCESS;
+				ret = ret ? ret : grb::eWiseApply(
+					coarsening_data.Ax_finer,
+					r_fine, coarsening_data.Ax_finer,
+					minus
+				); // Ax_finer = r_fine - Ax_finer
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : grb::set( coarsening_data.r, 0 );
+				ret = ret ? ret : grb::mxv(
+					coarsening_data.r, coarsening_data.coarsening_matrix,
+					coarsening_data.Ax_finer, ring
+				); // r = coarsening_matrix * Ax_finer
+				return ret;
+			}
+
+			/**
+			 * computes the prolongation of the coarser solution \p coarsening_data.z and
+			 *  stores it into \p x_fine.
+			 *
+			 * For prolongation, this function uses the matrix
+			 *   \p coarsening_data.coarsening_matrix by transposing it.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used
+			 * during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 *
+			 * @param[out] x_fine the solution vector to store the
+			 *             prolonged solution into
+			 * @param[in,out] coarsening_data information for coarsening
+			 * @param[in] ring the ring to perform the operations on
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate,
+			 *              the error code of the first unsuccessful operation otherwise
+			 */
+			template< typename IOType,
+				typename NonzeroType,
+				class Ring >
+			grb::RC compute_prolongation(
+				grb::Vector< IOType > &x_fine, // fine residual
+				struct multi_grid_data< IOType, NonzeroType > &coarsening_data,
+				const Ring &ring
+				) {
+				RC ret = SUCCESS;
+				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
+
+				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix >(
+					coarsening_data.Ax_finer,
+					coarsening_data.coarsening_matrix,
+					coarsening_data.z,
+					ring
+				);
+				assert( ret == SUCCESS );
+				// x_fine += Ax_finer;
+				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() );
+				assert( ret == SUCCESS );
+				return ret;
+			}
+
+			template< typename IOType, typename NonzeroType, class Ring >
+			grb::RC spai0_smoother( system_data< IOType, NonzeroType > &data, const Ring &ring ) {
+				RC ret = SUCCESS;
+
+				NonzeroType alpha = 1.;
+				ret = ret ? ret : grb::eWiseMulAdd( data.z, alpha, data.A_diagonal, data.r, ring );
+				assert( ret == SUCCESS );
+				return ret;
+			}
+
+			/**
+			 * Runs \p smoother_steps iteration of the SPAI0 smoother,
+			 *        with inputs and outputs stored inside \p data.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 *
+			 * @param[in,out] data \ref system_data data structure with relevant inpus
+			 *                 and outputs: system matrix, initial solution, residual,
+			 *                 system matrix colors, temporary vectors
+			 * @param[in] smoother_steps how many smoothing steps to run
+			 * @param[in] ring the ring to perform the operations on
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate,
+			 *         the error code of the first unsuccessful operation otherwise
+			 */
+			template< typename IOType, typename NonzeroType, class Ring, class Minus >
+			grb::RC run_spai0_smoother( system_data< IOType, NonzeroType > &data,
+								  const size_t smoother_steps, const Ring &ring, const Minus &minus ) {
+				RC ret = SUCCESS;
+
+				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
+					ret = ret ? ret : grb::set( data.smoother_temp, 0 );
+					ret = ret ? ret : grb::mxv( data.smoother_temp, data.A, data.z, ring );
+					ret = ret ? ret : grb::eWiseApply( data.smoother_temp, data.r, data.smoother_temp,
+													   minus );
+
+#ifdef HPCG_PRINT_STEPS
+					std::cout << " data.A(spai0): " << nrows(data.A) << " x " << ncols(data.A) << " \n";
+					print_norm( data.A_diagonal, " data.A_diagonal" );
+					print_norm( data.smoother_temp, " data.smoother_temp" );
+					print_norm( data.z, " data.z" );
+#endif
+
+					ret = ret ? ret :
+						grb::eWiseLambda(
+							[ &data ]( const size_t i ) {
+#ifdef HPCG_PRINT_STEPS
+								if( i < 10 || i + 10 > size( data.A_diagonal ) ) {
+									std::cout << " i= " << i
+									          << " data.z[i]= " << data.z[ i ]
+									          << " data.A_diagonal[i]= " << data.A_diagonal[ i ]
+									          << " data.smoother_temp[i]= " << data.smoother_temp[ i ]
+									          << "\n";
+								}
+								std::cout << "\n";
+#endif
+								data.z[ i ] += data.A_diagonal[ i ] * data.smoother_temp[ i ];
+							},
+							data.A_diagonal, data.z, data.smoother_temp );
+
+					assert( ret == SUCCESS );
+				}
+
+				return ret;
+			}
+
+			/**
+			 * @brief Multi-grid V cycle implementation to refine a given solution.
+			 *
+			 * A full multi-grid run goes through the following steps:
+			 * -# if \p presmoother_steps \f$ > 0 \f$, \p presmoother_steps of the SPAI0
+			 *    smoother, run to improve on the initial solution stored into \p data.z
+			 * -# the coarsening of \f$ r - A*z \f$ is computed to find the
+			 *      coarser residual vector
+			 * -# a multi-grid run is recursively performed on the coarser system
+			 * -# the tentative solution from the coarser multi-grid run is prolonged
+			 *     and added to the current tentative solution into \p data.z
+			 * -# this solution is further smoothed for \p postsmoother_steps steps
+			 *
+			 * If coarsening information is not available, the multi-grid run
+			 * consists in a single smmothing run.
+			 *
+			 * Failuers of GraphBLAS operations are handled by immediately
+			 * stopping the execution and by returning the failure code.
+			 *
+			 * @tparam IOType type of result and intermediate vectors used during computation
+			 * @tparam NonzeroType type of matrix values
+			 * @tparam Ring the ring of algebraic operators zero-values
+			 * @tparam Minus the minus operator for subtractions
+			 *
+			 * @param[in,out] data \ref multi_grid_data object storing the relevant data
+			 *            for the multi-grid run of the current clevel
+			 * @param[in,out] coarsening_data pointer to information for the
+			 *                coarsening/refinement operations and
+			 *                for the recursive multi-grid run on the coarsened system;
+			 *                if \c nullptr, no coarsening/refinement occurs
+			 *                and only smoothing occurs on the current solution
+			 * @param[in] presmoother_steps number of pre-smoother steps
+			 * @param[in] postsmoother_steps number of post-smoother steps
+			 * @param[in] ring the ring to perform the operations on
+			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate,
+			 *         the error code of the first unsuccessful operation otherwise
+			 */
+			template< typename IOType, typename NonzeroType, class Ring, class Minus >
+			grb::RC multi_grid( system_data< IOType, NonzeroType > &data,
+				struct multi_grid_data< IOType, NonzeroType > *const coarsening_data,
+				const size_t presmoother_steps,
+				const size_t postsmoother_steps,
+				const Ring &ring,
+				const Minus &minus ) {
+				RC ret = SUCCESS;
+
+#ifdef HPCG_PRINT_STEPS
+				DBG_println( "mg BEGINNING {" );
+#endif
+
+				// clean destination vector
+				//ret = ret ? ret : grb::set( data.z, data.r );
+				ret = ret ? ret : grb::set( data.z, 0 );
+
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "first print smoothed z" );
+				print_norm( data.r, "initial r" );
+#endif
+				if( coarsening_data == nullptr ) {
+					//compute one round of smoother
+					ret = ret ? ret : run_spai0_smoother( data, 1, ring, minus );
+					assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+					print_norm( data.z, "smoothed z" );
+					DBG_println( "} mg END" );
+#endif
+					return ret;
+				}
+
+				struct multi_grid_data< IOType, NonzeroType > &cd = *coarsening_data;
+
+				// pre-smoother
+				ret = ret ? ret : run_spai0_smoother( data, presmoother_steps, ring, minus );
+				assert( ret == SUCCESS );
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "pre-smoothed z" );
+#endif
+				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
+				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
+				assert( ret == SUCCESS );
+
+#ifdef HPCG_PRINT_STEPS
+				std::cout << "  data.A: " << nrows(data.A) << " x " << ncols(data.A) << " \n";
+				print_norm( cd.r, "before coarse cd.r" );
+#endif
+				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
+				assert( ret == SUCCESS );
+
+#ifdef HPCG_PRINT_STEPS
+				print_norm( cd.z, "after cd.coarse z" );
+				print_norm( cd.r, "after cd.coarse r" );
+#endif
+				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : compute_prolongation( data.z, cd, ring );
+				assert( ret == SUCCESS );
+
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "prolonged z" );
+#endif
+				// post-smoother
+				ret = ret ? ret : run_spai0_smoother( data, postsmoother_steps, ring, minus );
+				assert( ret == SUCCESS );
+
+#ifdef HPCG_PRINT_STEPS
+				print_norm( data.z, "post-smoothed z" );
+				DBG_println( "} mg END" );
+#endif
+
+				return ret;
+			}
+
+		} // namespace internal
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
diff --git a/include/graphblas/algorithms/amg/system_building_utils.hpp b/include/graphblas/algorithms/amg/system_building_utils.hpp
new file mode 100644
index 000000000..f2a431ec9
--- /dev/null
+++ b/include/graphblas/algorithms/amg/system_building_utils.hpp
@@ -0,0 +1,193 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Utilities to build an antire system for AMG simulations.
+ * @file amg_system_building_utils.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ * @date 2022-08-10
+ */
+
+#ifndef _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <graphblas.hpp>
+#include <graphblas/utils/parser.hpp>
+#include "amg_data.hpp"
+
+namespace grb {
+
+	namespace algorithms {
+
+		/**
+		 * Generates an entire AMG problem, storing it in \p holder.
+		 *
+		 * @tparam DIMS dimensions of the system
+		 * @tparam T type of matrix values
+		 * @param holder std::unique_ptr to store the AMG problem into
+		 * @param params parameters container to build the AMG problem
+		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors
+		 *          and matrices) succeeded, otherwise the first unsuccessful return value
+		 */
+		template< typename T = double, typename SYSINP >
+		grb::RC build_amg_system(
+				std::unique_ptr< grb::algorithms::amg_data< T, T, T > > &holder,
+				const std::size_t max_levels,
+				SYSINP &in
+			) {
+			grb::RC rc = grb::SUCCESS;
+			std::size_t coarsening_level = 0UL;
+			const size_t n_A = in.matAbuffer[ coarsening_level ].get_n();
+			grb::algorithms::amg_data< T, T, T > *data = new grb::algorithms::amg_data< T, T, T >( n_A );
+			rc = buildMatrixUnique(
+				data->A,
+				in.matAbuffer[ coarsening_level ].i_data,
+				in.matAbuffer[ coarsening_level ].j_data,
+				in.matAbuffer[ coarsening_level ].v_data,
+				in.matAbuffer[ coarsening_level ].size(),
+				SEQUENTIAL
+			);
+
+			if( rc != SUCCESS ) {
+				std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+						  << "(" << toString( rc ) << ")." << std::endl;
+				return rc;
+			}
+#ifdef AMG_PRINT_STEPS
+			std::cout << " buildMatrixUnique: constructed data->A " << nrows( data->A )
+			          << " x " << ncols( data->A ) << " matrix \n";
+#endif
+			assert( ! holder ); // should be empty
+			holder = std::unique_ptr< grb::algorithms::amg_data< T, T, T > >( data );
+
+			{
+				RC rc = grb::buildVector(
+					data->A_diagonal,
+					in.matMbuffer[ coarsening_level ].v_data,
+					in.matMbuffer[ coarsening_level ].v_data + in.matMbuffer[ coarsening_level ].size(),
+					SEQUENTIAL
+				);
+				if ( rc != SUCCESS ) {
+					std::cerr << " buildVector failed!\n ";
+					return rc;
+				}
+#ifdef AMG_PRINT_STEPS
+				std::cout << " buildVector: data->A_diagonal "
+				          << size( data->A_diagonal ) << " vector \n";
+#endif
+			}
+
+			std::size_t coarser_size;
+			std::size_t previous_size = n_A;
+
+			// initialize coarsening with additional pointers and
+			// dimensions copies to iterate and divide
+			grb::algorithms::multi_grid_data< T, T > **coarser = &data->coarser_level;
+			assert( *coarser == nullptr );
+
+			// generate linked list of hierarchical coarseners
+			while( coarsening_level  < max_levels ) {
+				assert( *coarser == nullptr );
+
+				coarser_size = in.matAbuffer[ coarsening_level + 1 ].get_n();
+
+				// build data structures for new level
+				grb::algorithms::multi_grid_data< double, double > *new_coarser =
+					new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size );
+
+				// install coarser level immediately to cleanup in case of build error
+				*coarser = new_coarser;
+
+				// initialize coarsener matrix, system matrix and
+				// diagonal vector for the coarser level
+				{
+					rc = buildMatrixUnique(
+						new_coarser->coarsening_matrix,
+						in.matRbuffer[ coarsening_level ].i_data,
+						in.matRbuffer[ coarsening_level ].j_data,
+						in.matRbuffer[ coarsening_level ].v_data,
+						in.matRbuffer[ coarsening_level ].size(),
+						SEQUENTIAL
+					);
+
+					if( rc != SUCCESS ) {
+						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+								  << "(" << toString( rc ) << ")." << std::endl;
+						return rc;
+					}
+#ifdef AMG_PRINT_STEPS
+					std::cout << " buildMatrixUnique: constructed new_coarser->coarsening_matrix "
+					          << nrows(new_coarser->coarsening_matrix) << " x "
+					          << ncols(new_coarser->coarsening_matrix) << " matrix \n";
+#endif
+				}
+				{
+					rc = buildMatrixUnique(
+						new_coarser->A,
+						in.matAbuffer[ coarsening_level + 1 ].i_data,
+						in.matAbuffer[ coarsening_level + 1 ].j_data,
+						in.matAbuffer[ coarsening_level + 1 ].v_data,
+						in.matAbuffer[ coarsening_level + 1 ].size(),
+						SEQUENTIAL
+					);
+
+					if( rc != SUCCESS ) {
+						std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+						          << "(" << toString( rc ) << ")." << std::endl;
+						return rc;
+					}
+#ifdef AMG_PRINT_STEPS
+					std::cout << " buildMatrixUnique: constructed new_coarser->A "
+					          << nrows(new_coarser->A) << " x " << ncols(new_coarser->A) << " matrix \n";
+#endif
+				}
+
+				RC rc = grb::buildVector(
+					new_coarser->A_diagonal,
+					in.matMbuffer[ coarsening_level + 1 ].v_data,
+					in.matMbuffer[ coarsening_level + 1 ].v_data + in.matMbuffer[ coarsening_level + 1 ].size(),
+					SEQUENTIAL
+				);
+
+				if ( rc != SUCCESS ) {
+					std::cerr << " buildVector failed!\n ";
+					return rc;
+				}
+#ifdef AMG_PRINT_STEPS
+				std::cout << " buildVector: new_coarser->A_diagonal "
+				          << size(new_coarser->A_diagonal) << " vector \n";
+#endif
+
+				// prepare for new iteration
+				coarser = &new_coarser->coarser_level;
+				previous_size = coarser_size;
+				coarsening_level++;
+			}
+
+			return rc;
+		}
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index dbd34c199..01bb250ce 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -334,6 +334,9 @@ namespace grb {
 
 	} // namespace internal
 
+	extern "C" void foo();
+
+
 	/**
 	 * Sets all elements of a vector to the given value.
 	 *
diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index fac6ceee8..21f97586a 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -98,7 +98,11 @@ add_grb_executables( knn knn.cpp ../unit/parser.cpp
 )
 
 add_grb_executables( hpcg hpcg.cpp
+	BACKENDS reference reference_omp bsp1d hybrid
+	ADDITIONAL_LINK_LIBRARIES test_utils
+)
 
+add_grb_executables( amg amg.cpp
 	BACKENDS reference reference_omp bsp1d hybrid
 	ADDITIONAL_LINK_LIBRARIES test_utils
 )
diff --git a/tests/smoke/amg.cpp b/tests/smoke/amg.cpp
new file mode 100644
index 000000000..380e614f8
--- /dev/null
+++ b/tests/smoke/amg.cpp
@@ -0,0 +1,713 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Test for AMG solver, levels provided by AMGCL.
+ * @file amg.cpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * @author Denis Jelovina (denis.jelovina@huawei.com)
+ *
+ *
+ * @date 2022-10-08
+ */
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <type_traits>
+#include <string>
+#include <graphblas.hpp>
+#include <graphblas/algorithms/amg/amg.hpp>
+#include <graphblas/algorithms/amg/system_building_utils.hpp>
+#include <graphblas/utils/Timer.hpp>
+#include <utils/argument_parser.hpp>
+#include <utils/assertions.hpp>
+#include <utils/print_vec_mat.hpp>
+
+// forward declaration for the tracing facility
+template< typename T,
+	class Ring = grb::Semiring<
+		grb::operators::add< T >,
+		grb::operators::mul< T >,
+		grb::identities::zero,
+		grb::identities::one
+	>
+>
+void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
+
+#ifdef AMG_PRINT_STEPS
+
+// AMG_PRINT_STEPS requires defining the following symbols
+
+/**
+ * simply prints \p args on a dedicated line.
+ */
+#define DBG_println( args ) std::cout << args << std::endl;
+
+/**
+ * prints \p head and the norm of \p r.
+ */
+#define DBG_print_norm( vec, head ) print_norm( vec, head )
+#endif
+
+//========== MAIN PROBLEM PARAMETERS =========
+// values modifiable via cmd line args: default set as in reference AMG
+constexpr size_t DEF_COARSENING_LEVELS{ 1U };
+constexpr size_t MAX_COARSENING_LEVELS{ 4U };
+constexpr size_t MAX_ITERATIONS_DEF{ 56UL };
+constexpr size_t SMOOTHER_STEPS_DEF{ 1 };
+//============================================
+
+constexpr double MAX_NORM { 4.0e-14 };
+
+using namespace grb;
+using namespace algorithms;
+
+static const char * const TEXT_HIGHLIGHT = "===> ";
+#define thcout ( std::cout << TEXT_HIGHLIGHT )
+#define thcerr ( std::cerr << TEXT_HIGHLIGHT )
+
+/**
+ * Container to store matrices loaded from a file.
+ */
+template< typename T = double >
+struct mat_data {
+	size_t *i_data;
+	size_t *j_data;
+	T *v_data;
+	size_t nz, n, m;
+
+	void resize( size_t innz, size_t inn, size_t inm ){
+		nz=innz;
+		n=inn;
+		m=inm;
+		i_data = new size_t [ nz ];
+		j_data = new size_t [ nz ];
+		v_data = new T [ nz ];
+	};
+
+	mat_data( ){
+		nz = 0;
+	};
+
+	mat_data( size_t in ){
+		resize( in );
+	};
+
+	size_t size(){
+		return nz;
+	};
+
+	size_t get_n(){
+		return n;
+	};
+
+	size_t get_m(){
+		return m;
+	};
+
+	~mat_data(){
+		delete [] i_data;
+		delete [] i_data;
+		delete [] v_data;
+	};
+};
+
+/**
+ * Container to store vectors loaded from a file.
+ */
+template< typename T = double >
+struct vec_data {
+	T *v_data;
+	size_t n;
+
+	void resize( size_t in ){
+		n=in;
+		v_data = new T [ n ];
+	};
+
+	vec_data( ){
+		n = 0;
+	};
+
+	vec_data( size_t in ){
+		resize( in );
+	};
+
+	size_t size(){
+		return n;
+	};
+
+	~vec_data(){
+		delete [] v_data;
+	};
+};
+
+static bool matloaded = false;
+
+/**
+ * Container for the parameters for the AMG simulation.
+ */
+struct simulation_input {
+	size_t max_coarsening_levels;
+	size_t test_repetitions;
+	size_t max_iterations;
+	size_t smoother_steps;
+	const char * matAfile_c_str;
+	// vectors of input matrix filenames
+	std::vector< std::string > matAfiles;
+	std::vector< std::string > matMfiles;
+	std::vector< std::string > matPfiles;
+	std::vector< std::string > matRfiles;
+	bool evaluation_run;
+	bool no_preconditioning;
+};
+
+template<typename CharT, typename TraitsT = std::char_traits< CharT > >
+class vectorwrapbuf :
+	public std::basic_streambuf<CharT, TraitsT> {
+public:
+	vectorwrapbuf(  std::vector<CharT> &vec) {
+		this->setg( vec.data(), vec.data(), vec.data() + vec.size() );
+    }
+};
+std::istream& operator>>(std::istream& is, std::string& s){
+	std::getline(is, s);
+	return is;
+}
+
+/**
+ * @brief Container to store all data for AMG hierarchy.
+ */
+class preloaded_matrices : public simulation_input {
+
+public :
+	size_t nzAmt, nzMmt, nzPmt, nzRmt;
+	mat_data< double > *matAbuffer;
+	vec_data< double > *matMbuffer;
+	mat_data< double > *matPbuffer;
+	mat_data< double > *matRbuffer;
+
+	grb::RC read_matrix(
+		std::vector< std::string > &fname,
+		mat_data<double> *data
+	) {
+		grb::RC rc = SUCCESS;
+		for ( size_t i = 0; i < fname.size(); i++ ) {
+			grb::utils::MatrixFileReader<
+				double, std::conditional<
+					( sizeof( grb::config::RowIndexType ) > sizeof( grb::config::ColIndexType ) ),
+							grb::config::RowIndexType,
+							grb::config::ColIndexType >::type
+				> parser_mat( fname[ i ].c_str(), true );
+#ifdef DEBUG
+			std::cout << " ---> parser_mat.filename()=" << parser_mat.filename() << "\n";
+			std::cout << " ---> parser_mat.nz()=" << parser_mat.nz() << "\n";
+			std::cout << " ---> parser_mat.n()=" << parser_mat.n() << "\n";
+			std::cout << " ---> parser_mat.m()=" << parser_mat.m() << "\n";
+			std::cout << " ---> parser_mat.entries()=" << parser_mat.entries() << "\n";
+#endif
+			// very poor choice and temp solution size = 2 x nz
+			data[i].resize( parser_mat.entries()*2, parser_mat.n(), parser_mat.m() );
+			std::ifstream inFile( fname[ i ], std::ios::binary | std::ios::ate );
+			if( ! inFile.is_open() ) {
+				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			}
+			std::streamsize size = inFile.tellg();
+			inFile.seekg( 0, std::ios::beg );
+			std::vector< char > buffer( size );
+			if ( inFile.read( buffer.data(), size ) ) {
+				// all fine
+			}
+			else {
+				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			}
+			inFile.close();
+			vectorwrapbuf< char > databuf( buffer );
+			std::istream isdata( &databuf );
+			std::string headder;
+			isdata >> headder;
+			while( headder.at( 0 ) == '%' ) {
+				isdata >> headder;
+			}
+			std::stringstream ss( headder );
+			size_t n, m, nz;
+			ss >> n >> m >> nz ;
+			size_t k = 0;
+			for ( size_t j = 0; j < nz; j++ ) {
+				size_t itmp, jtmp;
+				double vtmp;
+				isdata >> itmp >> jtmp >> vtmp;
+				data[ i ].i_data[ k ] = itmp - 1;
+				data[ i ].j_data[ k ] = jtmp - 1;
+				data[ i ].v_data[ k ] = vtmp;
+				k += 1;
+			}
+			data[ i ].nz = k;
+		}
+		return ( rc );
+	}
+
+	grb::RC read_vector( std::vector< std::string > &fname,
+						 vec_data< double > *data ) {
+		grb::RC rc = SUCCESS;
+		for ( size_t i = 0; i < fname.size(); i++ ){
+#ifdef DEBUG
+			std::cout << " Reading " << fname[ i ].c_str() << ".\n";
+#endif
+			std::ifstream inFile( fname[ i ], std::ios::binary | std::ios::ate );
+			if( ! inFile.is_open() ) {
+				std::cout << " Cannot open "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			}
+			std::streamsize size = inFile.tellg();
+			inFile.seekg( 0, std::ios::beg );
+			std::vector< char > buffer( size );
+			if ( inFile.read( buffer.data(), size ) ) {
+				// all fine
+			}
+			else {
+				std::cout << " Cannot read "<< fname[ i ].c_str() <<  "\n";
+				return( PANIC );
+			}
+			inFile.close();
+			vectorwrapbuf< char > databuf( buffer );
+			std::istream isdata( &databuf );
+			std::string headder;
+			size_t n, m;
+			std::getline( isdata, headder ); // skip the first line
+			while( headder.at( 0 ) == '%' ) {
+				std::getline( isdata, headder );
+			}
+			std::stringstream ss( headder );
+			ss >> n >> m;
+			std::cout << " Reading from" << fname[ i ] << " dense matrix with dimensions: "
+			          << " n x m = " << n << " x " << m << "\n";
+			data[ i ].resize( n );
+			for ( size_t j = 0; j < n; j++ ) {
+				isdata >> data[ i ].v_data[ j ];
+			}
+		}
+		return( rc );
+	}
+
+	grb::RC read_vec_matrics(){
+		grb::RC rc = SUCCESS;
+
+		nzAmt = matAfiles.size() ;
+		nzMmt = matMfiles.size() ;
+		nzPmt = matPfiles.size() ;
+		nzRmt = matRfiles.size() ;
+
+		matAbuffer = new mat_data< double > [ nzAmt ];
+		matMbuffer = new vec_data< double > [ nzMmt ];
+		matPbuffer = new mat_data< double > [ nzPmt ];
+		matRbuffer = new mat_data< double > [ nzRmt ];
+
+		rc = rc ? rc : read_matrix( matAfiles, matAbuffer );
+		rc = rc ? rc : read_matrix( matRfiles, matRbuffer );
+		// rc = rc ? rc : read_matrix( matPfiles, matPbuffer );
+		rc = rc ? rc : read_vector( matMfiles, matMbuffer );
+
+		return rc;
+	}
+};
+
+
+
+preloaded_matrices inputData;
+
+/**
+ * Containers for test outputs.
+ */
+struct output {
+	RC error_code;
+	size_t test_repetitions;
+	size_t performed_iterations;
+	double residual;
+	grb::utils::TimerResults times;
+	std::unique_ptr< PinnedVector< double > > pinnedVector;
+	double square_norm_diff;
+
+	output() {
+		error_code = SUCCESS;
+		test_repetitions = 0;
+		performed_iterations = 0;
+		residual = 0.0;
+	}
+};
+
+#ifdef AMG_PRINT_SYSTEM
+static void print_system( const amg_data< double, double, double > & data ) {
+	print_matrix( data.A, 70, "A" );
+	multi_grid_data< double, double > * coarser = data.coarser_level;
+	while( coarser != nullptr ) {
+		print_matrix( coarser->coarsening_matrix, 50, "COARSENING MATRIX" );
+		print_matrix( coarser->A, 50, "COARSER SYSTEM MATRIX" );
+		coarser = coarser->coarser_level;
+	}
+}
+#endif
+
+#ifdef AMG_PRINT_STEPS
+template<
+	typename T,
+	class Ring = Semiring<
+		grb::operators::add< T >,
+		grb::operators::mul< T >,
+		grb::identities::zero,
+		grb::identities::one >
+	>
+void print_norm( const grb::Vector< T > &r, const char *head, const Ring &ring ) {
+	T norm = 0;
+	RC ret = grb::dot( norm, r, r, ring ); // residual = r' * r;
+	(void)ret;
+	assert( ret == SUCCESS );
+	std::cout << ">>> ";
+	if( head != nullptr ) {
+		std::cout << head << ": ";
+	}
+	std::cout << norm << std::endl;
+}
+#endif
+
+/**
+ * Main test, building an AMG problem and running the simulation closely
+ *  following the parameters in the reference AMG test.
+ */
+void grbProgram( const simulation_input &in, struct output &out ) {
+	grb::utils::Timer timer;
+	timer.reset();
+
+	// get user process ID
+	assert( spmd<>::pid() < spmd<>::nprocs() );
+
+	// assume successful run
+	out.error_code = SUCCESS;
+	RC rc { SUCCESS };
+
+	if( ! matloaded ) {
+		//preloaded_matrices inputData;
+		inputData.matAfiles = in.matAfiles;
+		inputData.matMfiles = in.matMfiles;
+		inputData.matPfiles = in.matPfiles;
+		inputData.matRfiles = in.matRfiles;
+
+		rc = inputData.read_vec_matrics();
+		if( rc != SUCCESS ) {
+			std::cerr << "Failure to read data" << std::endl;
+		}
+		matloaded = true ;
+	}
+
+	out.times.io = timer.time();
+	timer.reset();
+
+	// wrap amg_data inside a unique_ptr to forget about cleaning chores
+	std::unique_ptr< amg_data< double, double, double > > amg_state;
+	rc = build_amg_system< double >( amg_state, in.max_coarsening_levels, inputData );
+
+	if( rc != SUCCESS ) {
+		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
+		out.error_code = rc;
+		return;
+	}
+#ifdef AMG_PRINT_SYSTEM
+	if( spmd<>::pid() == 0 ) {
+		print_system( *amg_state );
+	}
+#endif
+
+	Matrix< double > &A = amg_state->A;
+	Vector< double > &x = amg_state->x;
+	Vector< double > &b = amg_state->b;
+
+	// set vectors as from standard AMG benchmark
+	set( x, 1.0 );
+	set( b, 0.0 );
+	rc = grb::mxv( b, A, x,
+		grb::Semiring< grb::operators::add< double >,
+			grb::operators::mul< double >,
+			grb::identities::zero,
+			grb::identities::one >() );
+	set( x, 0.0 );
+
+	double norm_b = 0;
+	rc = grb::dot( norm_b, b, b,
+		grb::Semiring< grb::operators::add< double >,
+			grb::operators::mul< double >,
+			grb::identities::zero,
+			grb::identities::one >() );
+	(void)rc;
+	assert( rc == SUCCESS );
+
+#ifdef AMG_PRINT_SYSTEM
+	if( spmd<>::pid() == 0 ) {
+		print_vector( x, 50, " ---> X(1)" );
+		print_vector( b, 50, " ---> B(1)" );
+	}
+#endif
+
+	out.times.preamble = timer.time();
+	timer.reset();
+
+	const bool with_preconditioning = ! in.no_preconditioning;
+	out.test_repetitions = 0;
+	if( in.evaluation_run ) {
+		double single_time_start = timer.time();
+		rc = amg( *amg_state, with_preconditioning, in.smoother_steps,
+			in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+		double single_time = timer.time() - single_time_start;
+		if( rc == SUCCESS ) {
+			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
+		}
+		out.times.useful = single_time;
+		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
+	} else {
+		// do benchmark
+		double time_start = timer.time();
+		for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
+			rc = set( x, 0.0 );
+			assert( rc == SUCCESS );
+			rc = amg( *amg_state, with_preconditioning, in.smoother_steps,
+				in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
+
+			out.test_repetitions++;
+			if( rc != SUCCESS ) {
+				break;
+			}
+
+		}
+		double time_taken = timer.time() - time_start;
+		rc = rc ? rc : collectives<>::reduce( time_taken, 0, operators::max< double >() );
+		out.times.useful = time_taken / static_cast< double >( out.test_repetitions );
+
+#ifdef AMG_PRINT_SYSTEM
+		rc = rc ? rc : grb::eWiseLambda( [ &x ]( const size_t i ) { x[ i ] -= 1;}, x );
+		print_norm (x, " norm(x)");
+#endif
+		// sleep( 1 );
+	}
+	timer.reset();
+
+#ifdef AMG_PRINT_SYSTEM
+	if( spmd<>::pid() == 0 ) {
+		print_vector( x, 50, " x(first 50 elements)" );
+		print_vector( b, 50, " b(first 50 elements)" );
+	}
+#endif
+
+	if( spmd<>::pid() == 0 ) {
+		if( rc == SUCCESS ) {
+			if( in.evaluation_run ) {
+				std::cout << "Info: cold AMG completed within " << out.performed_iterations
+				          << " iterations. Last computed residual is " << out.residual
+				          << ". Time taken was " << out.times.useful
+				          << " ms. Deduced inner repetitions parameter of " << out.test_repetitions
+				          << " to take 1 second or more per inner benchmark." << std::endl;
+			} else {
+				std::cout << "Final residual= "<< out.residual << " relative error= "
+				          <<  out.residual/sqrt(norm_b) << "\n";
+				std::cout << "Average time taken for each of " << out.test_repetitions
+				          << " AMG calls (hot start): " << out.times.useful << std::endl;
+			}
+		} else {
+			std::cerr << "Failure: call to AMG did not succeed (" << toString( rc ) << ")." << std::endl;
+		}
+	}
+
+	// start postamble
+	timer.reset();
+	// set error code
+	out.error_code = rc;
+
+	Semiring< grb::operators::add< double >,
+		grb::operators::mul< double >,
+		grb::identities::zero,
+		grb::identities::one > ring;
+	grb::set( b, 1.0 );
+	out.square_norm_diff = 0.0;
+	grb::eWiseMul( b, -1.0, x, ring );
+	grb::dot( out.square_norm_diff, b, b, ring );
+
+	// output
+	out.pinnedVector = std::unique_ptr< PinnedVector< double > >( new PinnedVector< double >( x, SEQUENTIAL ) );
+	// finish timing
+	const double time_taken { timer.time() };
+	out.times.postamble = time_taken;
+}
+
+/**
+ * Parser the command-line arguments to extract the simulation information
+ * and checks they are valid.
+ */
+static void parse_arguments( simulation_input &, size_t &, double &, int, char ** );
+
+int main( int argc, char ** argv ) {
+	simulation_input sim_in;
+	size_t test_outer_iterations;
+	double max_residual_norm;
+
+	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
+	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
+	thcout << "Test repetitions: " << sim_in.test_repetitions << std::endl;
+	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
+	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run << std::noboolalpha << std::endl;
+	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning << std::noboolalpha << std::endl;
+	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
+	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
+	thcout << "Maximum norm for residual: " << max_residual_norm << std::endl;
+
+	// the output struct
+	struct output out;
+
+	// set standard exit code
+	grb::RC rc = SUCCESS;
+
+	// launch estimator (if requested)
+	if( sim_in.evaluation_run ) {
+		grb::Launcher< AUTOMATIC > launcher;
+		rc = launcher.exec( &grbProgram, sim_in, out, true );
+		if( rc == SUCCESS ) {
+			sim_in.test_repetitions = out.test_repetitions;
+		} else {
+			thcout << "launcher.exec returns with non-SUCCESS error code " << grb::toString( rc ) << std::endl;
+			std::exit( -1 );
+		}
+	}
+
+	// launch full benchmark
+	grb::Benchmarker< AUTOMATIC > benchmarker;
+	rc = benchmarker.exec( &grbProgram, sim_in, out, 1, test_outer_iterations, true );
+	ASSERT_RC_SUCCESS( rc );
+	thcout << "Benchmark completed successfully and took " << out.performed_iterations
+	       << " iterations to converge with residual " << out.residual << std::endl;
+
+	if( ! out.pinnedVector ) {
+		thcerr << "no output vector to inspect" << std::endl;
+	} else {
+		const PinnedVector< double > &solution = *out.pinnedVector;
+		thcout << "Size of x is " << solution.size() << std::endl;
+		if( solution.size() > 0 ) {
+			print_vector( solution, 30, "SOLUTION" );
+		} else {
+			thcerr << "ERROR: solution contains no values" << std::endl;
+		}
+	}
+
+	ASSERT_RC_SUCCESS( out.error_code );
+
+	double residual_norm = sqrt( out.square_norm_diff );
+	thcout << "Residual norm: " << residual_norm << std::endl;
+
+	ASSERT_LT( residual_norm, max_residual_norm );
+
+	thcout << "Test OK" << std::endl;
+	return 0;
+}
+
+static void parse_arguments( simulation_input &sim_in, size_t &outer_iterations,
+	double &max_residual_norm, int argc, char **argv ) {
+
+	argument_parser parser;
+	parser.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
+			"maximum level for coarsening; 0 means no coarsening; note: actual "
+			"level may be limited by the minimum system dimension" )
+		.add_optional_argument( "--mat_files_pattern", sim_in.matAfile_c_str,
+			"file pattern for files contining matrices A, M_diag, P, R "
+			"i.e. '--mat_a_file_names /path/to/dir/level_  --max_coarse-levels 2' will read "
+			"/path/to/dir/level_0_A.mtx,  /path/to/dir/level_1_A.mtx, /path/to/dir/level_2_A.mtx ... " )
+		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(),
+			"consecutive test repetitions before benchmarking" )
+		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(),
+			"test repetitions with complete initialization" )
+		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF,
+			"maximum number of AMG iterations" )
+		.add_optional_argument( "--max-residual-norm", max_residual_norm, MAX_NORM,
+			"maximum norm for the residual to be acceptable (does NOT limit "
+			"the execution of the algorithm)" )
+		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps,
+			SMOOTHER_STEPS_DEF, "number of pre/post-smoother steps; 0 disables smoothing" )
+		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
+			"launch single run directly, without benchmarker (ignore repetitions)" )
+		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false,
+			"do not apply pre-conditioning via multi-grid V cycle" );
+
+	parser.parse( argc, argv );
+
+	std::string matAfile = sim_in.matAfile_c_str;
+	if ( ! matAfile.empty() ) {
+		std::cout << "Using  <<" << matAfile << ">> pattern to read matrices " << std::endl;
+		for ( size_t i = 0 ; i < sim_in.max_coarsening_levels ; i++ ){
+			std::string fnamebase = matAfile + std::to_string(  i );
+			std::string fnameA = fnamebase + "_A.mtx";
+			std::string fnameM = fnamebase + "_M_diag.mtx";
+			std::string fnameP = fnamebase + "_P.mtx";
+			std::string fnameR = fnamebase + "_R.mtx";
+			sim_in.matAfiles.push_back( fnameA );
+			sim_in.matMfiles.push_back( fnameM );
+			sim_in.matPfiles.push_back( fnameP );
+			sim_in.matRfiles.push_back( fnameR );
+		}
+		{
+			std::string fnamebase = matAfile + std::to_string( sim_in.max_coarsening_levels );
+			std::string fnameA = fnamebase + "_A.mtx";
+			sim_in.matAfiles.push_back (fnameA);
+			std::string fnameM = fnamebase + "_M_diag.mtx";
+			sim_in.matMfiles.push_back (fnameM);
+		}
+		std::cout << "files to read matrices: " << std::endl;
+		for ( std::string fname: sim_in.matAfiles ) {
+			std::cout << fname << " \n";
+		}
+
+		for ( std::string fname: sim_in.matMfiles ) {
+			std::cout << fname << " \n";
+		}
+
+		for ( std::string fname: sim_in.matPfiles ) {
+			std::cout << fname << " \n";
+		}
+
+		for ( std::string fname: sim_in.matRfiles ) {
+			std::cout << fname << " \n";
+		}
+
+	}
+	else {
+		std::cout << "No pattern to read matrices provided" << std::endl;
+	}
+
+	// check for valid values
+	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
+		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS
+		          << " instead of " << sim_in.max_coarsening_levels << std::endl;
+		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;
+	}
+	if( sim_in.test_repetitions == 0 ) {
+		std::cerr << "ERROR no test runs selected: set \"--test-rep >0\"" << std::endl;
+		std::exit( -1 );
+	}
+	if( sim_in.max_iterations == 0 ) {
+		std::cout << "Setting number of iterations to 1" << std::endl;
+		sim_in.max_iterations = 1;
+	}
+}