From f67b46346f6bf1aea3682239c53efc6e1dcf6275 Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Wed, 21 Feb 2024 14:52:16 +0100
Subject: [PATCH 01/39] masked set matrix to matrix

---
 include/graphblas/reference/io.hpp | 200 +++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index f4f1b0709..b3792528e 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2028,6 +2028,206 @@ namespace grb {
 		}
 	}
 
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename OutputType, typename MaskType, typename InputType,
+		typename RIT1, typename CIT1, typename NIT1,
+		typename RIT2, typename CIT2, typename NIT2
+	>
+	RC set(
+		Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C,
+		const Matrix< MaskType, reference, RIT2, CIT2, NIT2 > &Mask,
+		const Matrix< InputType, reference, RIT2, CIT2, NIT2 > &A,
+		const Phase &phase = EXECUTE
+	) noexcept {
+		static_assert( !std::is_void< OutputType >::value,
+			"grb::set (masked set to matrix): cannot have a pattern "
+			"matrix as output" );
+		static_assert( std::is_convertible< InputType, OutputType >::value,
+			"grb::set (masked set to matrix): input type cannot be "
+			"converted to output type"
+		);
+		static_assert(
+			! ( descr & descriptors::structural && descr & descriptors::invert_mask ),
+			"grb::set can not be called with both descriptors::structural "
+			"and descriptors::invert_mask in the masked variant"
+		);
+#ifdef _DEBUG
+		std::cout << "Called grb::set (matrix-to-matrix-masked, reference)\n";
+#endif
+		// static checks
+		NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+				std::is_same< InputType, OutputType >::value
+			), "grb::set",
+			"called with non-matching value types"
+		);
+
+		// dynamic checks
+		assert( phase != TRY );
+
+		const size_t nrows = grb::nrows( C );
+		const size_t ncols = grb::ncols( C );
+
+		const size_t m = grb::nrows( Mask );
+		const size_t n = grb::ncols( Mask );
+
+		/*grb::Monoid<
+			grb::operators::left_assign< OutputType >,
+			grb::identities::zero
+		> dummyMonoid;*/
+
+		if( m == 0 || n == 0 ) {
+			// If the mask has a null size, it will be ignored
+			return set< descr >( C, A, phase );
+		}
+
+		if( nrows != grb::nrows( A ) || nrows != m ) {
+			return MISMATCH;
+		}
+
+		if( ncols != grb::ncols( A ) || ncols != n ) {
+			return MISMATCH;
+		}
+
+		const auto &mask_raw = internal::getCRS( Mask );
+
+		const auto &A_raw = internal::getCRS( A );
+
+		size_t nzc = 0;
+
+		char * mask_arr = nullptr;
+		char * mask_buf = nullptr;
+		OutputType * mask_valbuf = nullptr;
+		internal::getMatrixBuffers( mask_arr, mask_buf, mask_valbuf, 1, Mask );
+
+		internal::Coordinates< reference > mask_coors;
+		mask_coors.set( mask_arr, false, mask_buf, ncols );
+
+		for( size_t i = 0; i < nrows; ++i ) {
+			mask_coors.clear();
+			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+				const size_t k_col = mask_raw.row_index[ k ];
+				if( utils::interpretMask< descr, MaskType >( true, mask_raw.values, k ) ) {
+					mask_coors.assign( k_col );
+				}
+			}
+			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+				const size_t k_col = A_raw.row_index[ k ];
+				if( mask_coors.assigned( k_col ) ) {
+					nzc++;
+				}
+			}
+		}
+
+		if( phase == RESIZE ) {
+			return resize( C, nzc );
+		}
+
+		assert( phase == EXECUTE );
+
+		if( capacity( C ) < nzc ) {
+#ifdef _DEBUG
+			std::cout << "\t insufficient capacity to complete "
+				"requested masked set matrix to matrix computation\n";
+#endif
+			const RC clear_rc = clear( C );
+			if( clear_rc != SUCCESS ) {
+				return PANIC;
+			} else {
+				return FAILED;
+			}
+		}
+
+		auto &CRS_raw = internal::getCRS( C );
+		auto &CCS_raw = internal::getCCS( C );
+
+		config::NonzeroIndexType * C_col_index = internal::template
+			getReferenceBuffer< typename config::NonzeroIndexType >( ncols + 1 );
+
+		CRS_raw.col_start[ 0 ] = 0;
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+			#pragma omp parallel for simd
+#endif
+		for( size_t j = 0; j <= ncols; ++j ) {
+			CCS_raw.col_start[ j ] = 0;
+		}
+
+
+		nzc = 0;
+		for( size_t i = 0; i < nrows; ++i ) {
+			mask_coors.clear();
+			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+				const size_t k_col = mask_raw.row_index[ k ];
+				if( utils::interpretMask< descr, MaskType >( true, mask_raw.values, k ) ) {
+					mask_coors.assign( k_col );
+				}
+			}
+			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+				const size_t k_col = A_raw.row_index[ k ];
+				if( mask_coors.assigned( k_col ) ) {
+					nzc++;
+					CCS_raw.col_start[ k_col + 1 ]++;
+				}
+			}
+			CRS_raw.col_start[ i + 1 ] = nzc;
+		}
+
+		for( size_t j = 1; j < ncols; ++j ) {
+			CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ];
+		}
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+		#pragma omp parallel for simd
+#endif
+		for( size_t j = 0; j < ncols; ++j ) {
+			C_col_index[ j ] = 0;
+		}
+
+
+		const size_t old_nzc = nzc;
+
+		// use previously computed CCS offset array to update CCS during the
+		// computational phase
+		nzc = 0;
+		for( size_t i = 0; i < nrows; ++i ) {
+			mask_coors.clear();
+			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+				const size_t k_col = mask_raw.row_index[ k ];
+				if( utils::interpretMask< descr, MaskType >( true, mask_raw.values, k ) ) {
+					mask_coors.assign( k_col );
+				}
+			}
+			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+				const size_t k_col = A_raw.row_index[ k ];
+				if( mask_coors.assigned( k_col ) ) {
+					OutputType val = A_raw.getValue( k, *( (OutputType*) nullptr ) );
+					CRS_raw.row_index[ nzc ] = k_col;
+					CRS_raw.setValue( nzc, val );
+					const size_t CCS_index = C_col_index[ k_col ]++ + CCS_raw.col_start[ k_col ];
+					CCS_raw.row_index[ CCS_index ] = i;
+					CCS_raw.setValue( CCS_index, val );
+					nzc++;
+				}
+			}
+		}
+
+		for( size_t j = 0; j < ncols; ++j ) {
+			assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] ==
+				C_col_index[ j ] );
+		}
+		assert( old_nzc == nzc );
+
+		internal::setCurrentNonzeroes( C, nzc );
+
+		return SUCCESS;
+	}
+
+
+
+
+	
+
 	/**
 	 * Ingests raw data into a GraphBLAS vector.
 	 *

From 969ddd9c25e0c15d7da4e6e6c4017e98bf656ba3 Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Wed, 21 Feb 2024 14:52:25 +0100
Subject: [PATCH 02/39] tests

---
 tests/unit/matrixSet.cpp | 120 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index c6ab6739d..02eecfee2 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -224,6 +224,126 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		}
 	}
 	if( rc != SUCCESS ) { return; }
+
+	//check masked matrix set
+	grb::Matrix< int > Mask( n, n );
+	grb::Matrix< int > Output( n, n );
+	grb::Matrix< int > Input( n, n );
+	size_t I[ 2 * n - 1 ], J[ 2 * n - 1 ];
+	int mask_vals [ 2 * n - 1 ];
+	int input_vals [ 2 * n - 1 ];
+
+	for( size_t k = 0; k < n; ++k ) {
+		I[ k ] = J[ k ] = k;
+		mask_vals[ k ] = 1;
+		input_vals[ k ] = k;
+		if( k < n - 1 ) {
+			I[ n + k ] = k;
+			J[ n + k ] = k + 1;
+			mask_vals [ n + k ] = 0;
+			input_vals[ n + k ] = k;
+		}
+	}
+
+	rc = grb::buildMatrixUnique( Mask, I, J, mask_vals, 2 * n - 1, SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
+		return;
+	}
+
+	rc = grb::buildMatrixUnique( Input, I, J, input_vals, 2 * n - 1, SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
+		return;
+	}
+
+	rc = grb::resize(Output, 2 * n - 1 );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::resize of output matrix for the matrix to matrix masked set FAILED\n";
+		return;
+	}
+
+	rc = grb::set< descriptors::structural >( Output, Mask, Input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
+		return;
+	}
+
+	if( grb::nnz( Output ) != 2 * n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( Output ) << " ), expected " << 2 * n - 1 <<".\n";
+		rc = FAILED;
+	}
+
+	for( const auto & triplet : Output ) {
+		if( triplet.first.first != triplet.first.second && triplet.first.first != triplet.first.second - 1 ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
+			rc = FAILED;
+		} if( triplet.first.first != triplet.second ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
+		}
+	}
+
+	if( rc != SUCCESS ) {
+		return;
+	}
+
+	
+
+	rc = grb::set( Output, Mask, Input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
+		return;
+	}
+
+	if( grb::nnz( Output ) != n ) {
+		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( Output ) << " ), expected " << n <<".\n";
+		rc = FAILED;
+	}
+
+	for( const auto & triplet : Output ) {
+		if( triplet.first.first != triplet.first.second ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
+			rc = FAILED;
+		} if( triplet.first.first != triplet.second ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
+		}
+	}
+
+	if( rc != SUCCESS ) {
+		return;
+	}
+
+
+
+	rc = grb::set< descriptors::invert_mask >( Output, Mask, Input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
+		return;
+	}
+
+	if( grb::nnz( Output ) != n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( Output ) << " ), expected " << n - 1 <<".\n";
+		rc = FAILED;
+	}
+
+	for( const auto & triplet : Output ) {
+		if( triplet.first.first != triplet.first.second - 1 ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
+			rc = FAILED;
+		} if( triplet.first.first != triplet.second ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
+		}
+	}
+
+	if( rc != SUCCESS ) {
+		return;
+	}
 }
 
 int main( int argc, char ** argv ) {

From c7cb4f5a682d003c44c89873bfff0f44e7a67959 Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Wed, 21 Feb 2024 15:14:13 +0100
Subject: [PATCH 03/39] tests refactor

---
 tests/unit/matrixSet.cpp | 80 ++++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index 02eecfee2..f66600729 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -43,6 +43,12 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	grb::Matrix< void > C( n, n );
 	grb::Matrix< void > D( n, n );
 	grb::Matrix< unsigned int > E( n, n );
+	grb::Matrix< int > Mask( n, n );
+	grb::Matrix< int > Output( n, n );
+	grb::Matrix< int > Input( n, n );
+	
+
+	
 	rc = grb::resize( A, 15 );
 	if( rc == SUCCESS ) {
 		rc = grb::buildMatrixUnique( A, I, J, data1, 15, SEQUENTIAL );
@@ -64,6 +70,35 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 			}
 		}
 	}
+
+	size_t I_mask[ 2 * n - 1 ], J_mask[ 2 * n - 1 ];
+	int mask_vals [ 2 * n - 1 ];
+	int input_vals [ 2 * n - 1 ];
+
+	for( size_t k = 0; k < n; ++k ) {
+		I_mask[ k ] = J_mask[ k ] = k;
+		mask_vals[ k ] = 1;
+		input_vals[ k ] = k;
+		if( k < n - 1 ) {
+			I_mask[ n + k ] = k;
+			J_mask[ n + k ] = k + 1;
+			mask_vals [ n + k ] = 0;
+			input_vals[ n + k ] = k;
+		}
+	}
+
+	rc = grb::buildMatrixUnique( Mask, I_mask, J_mask, mask_vals, 2 * n - 1, SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
+		return;
+	}
+
+	rc = grb::buildMatrixUnique( Input, I_mask, J_mask, input_vals, 2 * n - 1, SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
+		return;
+	}
+
 	if( rc == SUCCESS ) {
 		rc = grb::resize( B, 15 );
 	}
@@ -76,6 +111,9 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	if( rc == SUCCESS ) {
 		rc = grb::resize( E, 15 );
 	}
+	if( rc == SUCCESS ) {
+		rc = grb::resize(Output, 2 * n - 1 );
+	}
 	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
 		std::cerr << "\tinitialisation FAILED\n";
 		return;
@@ -226,42 +264,6 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	if( rc != SUCCESS ) { return; }
 
 	//check masked matrix set
-	grb::Matrix< int > Mask( n, n );
-	grb::Matrix< int > Output( n, n );
-	grb::Matrix< int > Input( n, n );
-	size_t I[ 2 * n - 1 ], J[ 2 * n - 1 ];
-	int mask_vals [ 2 * n - 1 ];
-	int input_vals [ 2 * n - 1 ];
-
-	for( size_t k = 0; k < n; ++k ) {
-		I[ k ] = J[ k ] = k;
-		mask_vals[ k ] = 1;
-		input_vals[ k ] = k;
-		if( k < n - 1 ) {
-			I[ n + k ] = k;
-			J[ n + k ] = k + 1;
-			mask_vals [ n + k ] = 0;
-			input_vals[ n + k ] = k;
-		}
-	}
-
-	rc = grb::buildMatrixUnique( Mask, I, J, mask_vals, 2 * n - 1, SEQUENTIAL );
-	if( rc != SUCCESS ) {
-		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
-		return;
-	}
-
-	rc = grb::buildMatrixUnique( Input, I, J, input_vals, 2 * n - 1, SEQUENTIAL );
-	if( rc != SUCCESS ) {
-		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
-		return;
-	}
-
-	rc = grb::resize(Output, 2 * n - 1 );
-	if( rc != SUCCESS ) {
-		std::cerr << "\t grb::resize of output matrix for the matrix to matrix masked set FAILED\n";
-		return;
-	}
 
 	rc = grb::set< descriptors::structural >( Output, Mask, Input );
 	if( rc != SUCCESS ) {
@@ -278,7 +280,7 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		if( triplet.first.first != triplet.first.second && triplet.first.first != triplet.first.second - 1 ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
-		} if( triplet.first.first != triplet.second ) {
+		} if( (int) triplet.first.first != triplet.second ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
 			std::cerr << ", expected value "<< triplet.first.first <<".\n";
 			rc = FAILED;
@@ -306,7 +308,7 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		if( triplet.first.first != triplet.first.second ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
-		} if( triplet.first.first != triplet.second ) {
+		} if( (int) triplet.first.first != triplet.second ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
 			std::cerr << ", expected value "<< triplet.first.first <<".\n";
 			rc = FAILED;
@@ -334,7 +336,7 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		if( triplet.first.first != triplet.first.second - 1 ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
-		} if( triplet.first.first != triplet.second ) {
+		} if( (int) triplet.first.first != triplet.second ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
 			std::cerr << ", expected value "<< triplet.first.first <<".\n";
 			rc = FAILED;

From 34af750117f3d7579ad3ab82e95bc86e1025a9a4 Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Wed, 21 Feb 2024 15:14:42 +0100
Subject: [PATCH 04/39] nonblocking implementation

---
 include/graphblas/nonblocking/io.hpp | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp
index 0831fc564..1246e7c37 100644
--- a/include/graphblas/nonblocking/io.hpp
+++ b/include/graphblas/nonblocking/io.hpp
@@ -1227,6 +1227,41 @@ namespace grb {
 			internal::getRefMatrix( C ), internal::getRefMatrix( A ), val, phase );
 	}
 
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename OutputType, typename MaskType, typename InputType,
+		typename RIT1, typename CIT1, typename NIT1,
+		typename RIT2, typename CIT2, typename NIT2
+	>
+	RC set(
+		Matrix< OutputType, nonblocking, RIT1, CIT1, NIT1 > &C,
+		const Matrix< MaskType, nonblocking, RIT2, CIT2, NIT2 > &Mask,
+		const Matrix< InputType, nonblocking, RIT2, CIT2, NIT2 > &A,
+		const Phase &phase = EXECUTE
+	) noexcept {
+		if( internal::NONBLOCKING::warn_if_not_native &&
+			config::PIPELINE::warn_if_not_native
+		) {
+			std::cerr << "Warning: masked set (nonblocking) currently delegates "
+				<< "to a blocking implementation.\n"
+				<< "         Further similar such warnings will be suppressed.\n";
+			internal::NONBLOCKING::warn_if_not_native = false;
+		}
+
+		// nonblocking execution is not supported
+		// first, execute any computation that is not completed
+		internal::le.execution();
+
+		// second, delegate to the reference backend
+		return set< descr >(
+			internal::getRefMatrix( C ),
+			internal::getRefMatrix( Mask ),
+			internal::getRefMatrix( A ),
+			phase
+		);
+	}
+
 	template<
 		Descriptor descr = descriptors::no_operation,
 		typename InputType,

From 1611bc4c7dbc6f3a36e5cbf08cb3c82450747bd4 Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Wed, 21 Feb 2024 15:15:09 +0100
Subject: [PATCH 05/39] hyperdags implementation

---
 include/graphblas/hyperdags/hyperdags.hpp |  4 ++-
 include/graphblas/hyperdags/io.hpp        | 38 +++++++++++++++++++++++
 src/graphblas/hyperdags/hyperdags.cpp     |  3 ++
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
index 2ae951565..b4a650c0e 100644
--- a/include/graphblas/hyperdags/hyperdags.hpp
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -381,6 +381,8 @@ namespace grb {
 
 				SET_MATRIX_MATRIX_INPUT2,
 
+				SET_MATRIX_MATRIX_MASKED,
+
 				MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
 
 				MXM_MATRIX_MATRIX_MATRIX_MONOID,
@@ -503,7 +505,7 @@ namespace grb {
 			};
 
 			/** \internal How many operation vertex types exist. */
-			const constexpr size_t numOperationVertexTypes = 111;
+			const constexpr size_t numOperationVertexTypes = 112;
 
 			/** \internal An array of all operation vertex types. */
 			const constexpr enum OperationVertexType
diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp
index 4ec81917b..c9347adfd 100644
--- a/include/graphblas/hyperdags/io.hpp
+++ b/include/graphblas/hyperdags/io.hpp
@@ -440,6 +440,44 @@ namespace grb {
 		return ret;
 	}
 
+	/**
+	 * This function inherits the performance semantics of the underlying backend.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename OutputType, typename MaskType, typename InputType,
+		typename RIT1, typename CIT1, typename NIT1,
+		typename RIT2, typename CIT2, typename NIT2
+	>
+	RC set(
+		Matrix< OutputType, hyperdags, RIT1, CIT1, NIT1 > &C,
+		const Matrix< MaskType, hyperdags, RIT2, CIT2, NIT2 > &Mask,
+		const Matrix< InputType, hyperdags, RIT2, CIT2, NIT2 > &A,
+		const Phase &phase = EXECUTE
+	) {
+		const RC ret = set< descr >(
+			internal::getMatrix( C ), internal::getMatrix( Mask ),
+			internal::getMatrix( A ), phase
+		);
+		if( ret != SUCCESS ) { return ret; }
+		if( phase != EXECUTE ) { return ret; }
+		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		std::array< const void *, 0 > sourcesP{};
+		std::array< uintptr_t, 3 > sourcesC{
+			getID( internal::getMatrix(A) ),
+			getID( internal::getMatrix(Mask) ),
+			getID( internal::getMatrix(C) )
+		};
+		std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+		internal::hyperdags::generator.addOperation(
+			internal::hyperdags::SET_MATRIX_MATRIX_MASKED,
+			sourcesP.begin(), sourcesP.end(),
+			sourcesC.begin(), sourcesC.end(),
+			destinations.begin(), destinations.end()
+		);
+		return ret;
+	}
+
 	/**
 	 * This function inherits the performance semantics of the underlying backend.
 	 */
diff --git a/src/graphblas/hyperdags/hyperdags.cpp b/src/graphblas/hyperdags/hyperdags.cpp
index 256b31fba..320b522f5 100644
--- a/src/graphblas/hyperdags/hyperdags.cpp
+++ b/src/graphblas/hyperdags/hyperdags.cpp
@@ -215,6 +215,9 @@ std::string grb::internal::hyperdags::toString(
 		case SET_MATRIX_MATRIX_INPUT2:
 			return "set( matrix, matrix, scalar )";
 
+		case SET_MATRIX_MATRIX_MASKED:
+			return "set( matrix, matrix, matrix )";
+
 		case MXM_MATRIX_MATRIX_MATRIX_MONOID:
 			return "mxm( matrix, matrix, matrix, monoid, scalar, scalar )";
 

From d250c5463f38962e8ee3b4da48d3b6d2a72baf87 Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Mon, 26 Feb 2024 12:25:56 +0100
Subject: [PATCH 06/39] rename

---
 include/graphblas/hyperdags/io.hpp   |  6 ++---
 include/graphblas/nonblocking/io.hpp |  4 ++--
 include/graphblas/reference/io.hpp   | 13 ++++------
 tests/unit/matrixSet.cpp             | 36 ++++++++++++++--------------
 4 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp
index c9347adfd..6c93044b4 100644
--- a/include/graphblas/hyperdags/io.hpp
+++ b/include/graphblas/hyperdags/io.hpp
@@ -451,12 +451,12 @@ namespace grb {
 	>
 	RC set(
 		Matrix< OutputType, hyperdags, RIT1, CIT1, NIT1 > &C,
-		const Matrix< MaskType, hyperdags, RIT2, CIT2, NIT2 > &Mask,
+		const Matrix< MaskType, hyperdags, RIT2, CIT2, NIT2 > &M,
 		const Matrix< InputType, hyperdags, RIT2, CIT2, NIT2 > &A,
 		const Phase &phase = EXECUTE
 	) {
 		const RC ret = set< descr >(
-			internal::getMatrix( C ), internal::getMatrix( Mask ),
+			internal::getMatrix( C ), internal::getMatrix( M ),
 			internal::getMatrix( A ), phase
 		);
 		if( ret != SUCCESS ) { return ret; }
@@ -465,7 +465,7 @@ namespace grb {
 		std::array< const void *, 0 > sourcesP{};
 		std::array< uintptr_t, 3 > sourcesC{
 			getID( internal::getMatrix(A) ),
-			getID( internal::getMatrix(Mask) ),
+			getID( internal::getMatrix(M) ),
 			getID( internal::getMatrix(C) )
 		};
 		std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp
index 1246e7c37..022a50901 100644
--- a/include/graphblas/nonblocking/io.hpp
+++ b/include/graphblas/nonblocking/io.hpp
@@ -1236,7 +1236,7 @@ namespace grb {
 	>
 	RC set(
 		Matrix< OutputType, nonblocking, RIT1, CIT1, NIT1 > &C,
-		const Matrix< MaskType, nonblocking, RIT2, CIT2, NIT2 > &Mask,
+		const Matrix< MaskType, nonblocking, RIT2, CIT2, NIT2 > &M,
 		const Matrix< InputType, nonblocking, RIT2, CIT2, NIT2 > &A,
 		const Phase &phase = EXECUTE
 	) noexcept {
@@ -1256,7 +1256,7 @@ namespace grb {
 		// second, delegate to the reference backend
 		return set< descr >(
 			internal::getRefMatrix( C ),
-			internal::getRefMatrix( Mask ),
+			internal::getRefMatrix( M ),
 			internal::getRefMatrix( A ),
 			phase
 		);
diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index b3792528e..b0402cb05 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2036,7 +2036,7 @@ namespace grb {
 	>
 	RC set(
 		Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C,
-		const Matrix< MaskType, reference, RIT2, CIT2, NIT2 > &Mask,
+		const Matrix< MaskType, reference, RIT2, CIT2, NIT2 > &M,
 		const Matrix< InputType, reference, RIT2, CIT2, NIT2 > &A,
 		const Phase &phase = EXECUTE
 	) noexcept {
@@ -2068,8 +2068,8 @@ namespace grb {
 		const size_t nrows = grb::nrows( C );
 		const size_t ncols = grb::ncols( C );
 
-		const size_t m = grb::nrows( Mask );
-		const size_t n = grb::ncols( Mask );
+		const size_t m = grb::nrows( M );
+		const size_t n = grb::ncols( M );
 
 		/*grb::Monoid<
 			grb::operators::left_assign< OutputType >,
@@ -2089,7 +2089,7 @@ namespace grb {
 			return MISMATCH;
 		}
 
-		const auto &mask_raw = internal::getCRS( Mask );
+		const auto &mask_raw = internal::getCRS( M );
 
 		const auto &A_raw = internal::getCRS( A );
 
@@ -2098,7 +2098,7 @@ namespace grb {
 		char * mask_arr = nullptr;
 		char * mask_buf = nullptr;
 		OutputType * mask_valbuf = nullptr;
-		internal::getMatrixBuffers( mask_arr, mask_buf, mask_valbuf, 1, Mask );
+		internal::getMatrixBuffers( mask_arr, mask_buf, mask_valbuf, 1, M );
 
 		internal::Coordinates< reference > mask_coors;
 		mask_coors.set( mask_arr, false, mask_buf, ncols );
@@ -2185,8 +2185,6 @@ namespace grb {
 		}
 
 
-		const size_t old_nzc = nzc;
-
 		// use previously computed CCS offset array to update CCS during the
 		// computational phase
 		nzc = 0;
@@ -2216,7 +2214,6 @@ namespace grb {
 			assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] ==
 				C_col_index[ j ] );
 		}
-		assert( old_nzc == nzc );
 
 		internal::setCurrentNonzeroes( C, nzc );
 
diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index f66600729..6088a5927 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -43,9 +43,9 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	grb::Matrix< void > C( n, n );
 	grb::Matrix< void > D( n, n );
 	grb::Matrix< unsigned int > E( n, n );
-	grb::Matrix< int > Mask( n, n );
-	grb::Matrix< int > Output( n, n );
-	grb::Matrix< int > Input( n, n );
+	grb::Matrix< int > mask( n, n );
+	grb::Matrix< int > output( n, n );
+	grb::Matrix< int > input( n, n );
 	
 
 	
@@ -87,13 +87,13 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		}
 	}
 
-	rc = grb::buildMatrixUnique( Mask, I_mask, J_mask, mask_vals, 2 * n - 1, SEQUENTIAL );
+	rc = grb::buildMatrixUnique( mask, I_mask, J_mask, mask_vals, 2 * n - 1, SEQUENTIAL );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
 		return;
 	}
 
-	rc = grb::buildMatrixUnique( Input, I_mask, J_mask, input_vals, 2 * n - 1, SEQUENTIAL );
+	rc = grb::buildMatrixUnique( input, I_mask, J_mask, input_vals, 2 * n - 1, SEQUENTIAL );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
 		return;
@@ -112,7 +112,7 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		rc = grb::resize( E, 15 );
 	}
 	if( rc == SUCCESS ) {
-		rc = grb::resize(Output, 2 * n - 1 );
+		rc = grb::resize(output, 2 * n - 1 );
 	}
 	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
 		std::cerr << "\tinitialisation FAILED\n";
@@ -265,18 +265,18 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 
 	//check masked matrix set
 
-	rc = grb::set< descriptors::structural >( Output, Mask, Input );
+	rc = grb::set< descriptors::structural >( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
 		return;
 	}
 
-	if( grb::nnz( Output ) != 2 * n - 1 ) {
-		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( Output ) << " ), expected " << 2 * n - 1 <<".\n";
+	if( grb::nnz( output ) != 2 * n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
 		rc = FAILED;
 	}
 
-	for( const auto & triplet : Output ) {
+	for( const auto & triplet : output ) {
 		if( triplet.first.first != triplet.first.second && triplet.first.first != triplet.first.second - 1 ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
@@ -293,18 +293,18 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 
 	
 
-	rc = grb::set( Output, Mask, Input );
+	rc = grb::set( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
 		return;
 	}
 
-	if( grb::nnz( Output ) != n ) {
-		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( Output ) << " ), expected " << n <<".\n";
+	if( grb::nnz( output ) != n ) {
+		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( output ) << " ), expected " << n <<".\n";
 		rc = FAILED;
 	}
 
-	for( const auto & triplet : Output ) {
+	for( const auto & triplet : output ) {
 		if( triplet.first.first != triplet.first.second ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
@@ -321,18 +321,18 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 
 
 
-	rc = grb::set< descriptors::invert_mask >( Output, Mask, Input );
+	rc = grb::set< descriptors::invert_mask >( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
 		return;
 	}
 
-	if( grb::nnz( Output ) != n - 1 ) {
-		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( Output ) << " ), expected " << n - 1 <<".\n";
+	if( grb::nnz( output ) != n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( output ) << " ), expected " << n - 1 <<".\n";
 		rc = FAILED;
 	}
 
-	for( const auto & triplet : Output ) {
+	for( const auto & triplet : output ) {
 		if( triplet.first.first != triplet.first.second - 1 ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;

From cc7368960e6a921bd7a038c66f9f3c7f2591878f Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Tue, 27 Feb 2024 10:44:51 +0100
Subject: [PATCH 07/39] code cleanup

---
 include/graphblas/reference/io.hpp | 28 ++++++++++++++------------
 include/graphblas/utils.hpp        | 32 ++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index b0402cb05..2033b31ee 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2097,7 +2097,7 @@ namespace grb {
 
 		char * mask_arr = nullptr;
 		char * mask_buf = nullptr;
-		OutputType * mask_valbuf = nullptr;
+		MaskType * mask_valbuf = nullptr;
 		internal::getMatrixBuffers( mask_arr, mask_buf, mask_valbuf, 1, M );
 
 		internal::Coordinates< reference > mask_coors;
@@ -2106,13 +2106,13 @@ namespace grb {
 		for( size_t i = 0; i < nrows; ++i ) {
 			mask_coors.clear();
 			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
-				const size_t k_col = mask_raw.row_index[ k ];
-				if( utils::interpretMask< descr, MaskType >( true, mask_raw.values, k ) ) {
+				const auto k_col = mask_raw.row_index[ k ];
+				if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
 					mask_coors.assign( k_col );
 				}
 			}
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
-				const size_t k_col = A_raw.row_index[ k ];
+				const auto k_col = A_raw.row_index[ k ];
 				if( mask_coors.assigned( k_col ) ) {
 					nzc++;
 				}
@@ -2147,7 +2147,7 @@ namespace grb {
 		CRS_raw.col_start[ 0 ] = 0;
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-			#pragma omp parallel for simd
+		#pragma omp parallel for simd
 #endif
 		for( size_t j = 0; j <= ncols; ++j ) {
 			CCS_raw.col_start[ j ] = 0;
@@ -2158,13 +2158,13 @@ namespace grb {
 		for( size_t i = 0; i < nrows; ++i ) {
 			mask_coors.clear();
 			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
-				const size_t k_col = mask_raw.row_index[ k ];
-				if( utils::interpretMask< descr, MaskType >( true, mask_raw.values, k ) ) {
+				const auto k_col = mask_raw.row_index[ k ];
+				if( utils::interpretMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
 					mask_coors.assign( k_col );
 				}
 			}
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
-				const size_t k_col = A_raw.row_index[ k ];
+				const auto k_col = A_raw.row_index[ k ];
 				if( mask_coors.assigned( k_col ) ) {
 					nzc++;
 					CCS_raw.col_start[ k_col + 1 ]++;
@@ -2191,29 +2191,31 @@ namespace grb {
 		for( size_t i = 0; i < nrows; ++i ) {
 			mask_coors.clear();
 			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
-				const size_t k_col = mask_raw.row_index[ k ];
-				if( utils::interpretMask< descr, MaskType >( true, mask_raw.values, k ) ) {
+				const auto k_col = mask_raw.row_index[ k ];
+				if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
 					mask_coors.assign( k_col );
 				}
 			}
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
-				const size_t k_col = A_raw.row_index[ k ];
+				const auto k_col = A_raw.row_index[ k ];
 				if( mask_coors.assigned( k_col ) ) {
 					OutputType val = A_raw.getValue( k, *( (OutputType*) nullptr ) );
 					CRS_raw.row_index[ nzc ] = k_col;
 					CRS_raw.setValue( nzc, val );
-					const size_t CCS_index = C_col_index[ k_col ]++ + CCS_raw.col_start[ k_col ];
+					const size_t CCS_index = C_col_index[ k_col ] + CCS_raw.col_start[ k_col ];
+					C_col_index[ k_col ]++;
 					CCS_raw.row_index[ CCS_index ] = i;
 					CCS_raw.setValue( CCS_index, val );
 					nzc++;
 				}
 			}
 		}
-
+#ifndef NDEBUG
 		for( size_t j = 0; j < ncols; ++j ) {
 			assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] ==
 				C_col_index[ j ] );
 		}
+#endif
 
 		internal::setCurrentNonzeroes( C, nzc );
 
diff --git a/include/graphblas/utils.hpp b/include/graphblas/utils.hpp
index 3ab508eec..09c8d8b71 100644
--- a/include/graphblas/utils.hpp
+++ b/include/graphblas/utils.hpp
@@ -384,6 +384,38 @@ namespace grb {
 				return ret;
 			}
 		}
+		/** Specialisation for void-valued matrice's masks */
+		template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+		static bool interpretMatrixMask(
+			const bool &assigned,
+			const ValuesType * const values,
+			const size_t k,
+			typename std::enable_if<
+				!std::is_void< MatrixDataType >::value
+			>::type * = nullptr
+		) {
+			return interpretMask< descriptor, ValuesType >( assigned, values, k );
+		}
+
+		/** Specialisation for void-valued matrice's masks */
+		template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+		static bool interpretMatrixMask(
+			const bool &assigned,
+			const ValuesType * const,
+			const size_t,
+			typename std::enable_if<
+				std::is_void< MatrixDataType >::value
+			>::type * = nullptr
+		) {
+			// set default mask to false
+			bool ret = assigned;
+			// check whether we should return the inverted value
+			if( descriptor & descriptors::invert_mask ) {
+				return !ret;
+			} else {
+				return ret;
+			}
+		}
 
 		/** Specialisation for void-valued matrice's masks */
 		template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >

From 3e33029504a73d27dcc2f4b570082cbbc71ab7fa Mon Sep 17 00:00:00 2001
From: Aleksa Milisavljevic <aleksamilis@gmail.com>
Date: Tue, 27 Feb 2024 10:58:39 +0100
Subject: [PATCH 08/39] omp reductions

---
 include/graphblas/reference/io.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 2033b31ee..032027fb2 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2111,6 +2111,9 @@ namespace grb {
 					mask_coors.assign( k_col );
 				}
 			}
+#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+			#pragma omp parallel for reduction(+:nzc)
+#endif
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = A_raw.row_index[ k ];
 				if( mask_coors.assigned( k_col ) ) {

From 36849bdb5a75ee6a76812361f41909e953b78ba7 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sun, 17 Aug 2025 16:51:20 +0200
Subject: [PATCH 09/39] Fixes after rebase

---
 include/graphblas/base/io.hpp | 31 +++++++++++++++
 include/graphblas/utils.hpp   | 43 ++++-----------------
 tests/unit/matrixSet.cpp      | 71 ++++++++++++++++-------------------
 3 files changed, 72 insertions(+), 73 deletions(-)

diff --git a/include/graphblas/base/io.hpp b/include/graphblas/base/io.hpp
index 493dfcfea..4e493332b 100644
--- a/include/graphblas/base/io.hpp
+++ b/include/graphblas/base/io.hpp
@@ -1320,6 +1320,37 @@ namespace grb {
 		return UNSUPPORTED;
 	}
 
+	/** TODO add specification */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename OutputType, typename MaskType, typename ValueType,
+		typename RIT1, typename CIT1, typename NIT1,
+		typename RIT2, typename CIT2, typename NIT2,
+		typename RIT3, typename CIT3, typename NIT3,
+		Backend backend
+	>
+	RC set(
+		Matrix< OutputType, backend, RIT1, CIT1, NIT1 > &C,
+		const Matrix< MaskType, backend, RIT2, CIT2, NIT2 > &mask,
+		const Matrix< ValueType, backend, RIT3, CIT3, NIT3 > &A,
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			!grb::is_object< OutputType >::value &&
+			!grb::is_object< ValueType >::value &&
+			!grb::is_object< MaskType >::value,
+		void >::type * const = nullptr
+	) noexcept {
+#ifndef NDEBUG
+		const bool should_not_call_base_matrix_masked_matrix_set = false;
+		assert( should_not_call_base_matrix_masked_matrix_set );
+#endif
+		(void) C;
+		(void) mask;
+		(void) A;
+		(void) phase;
+		return UNSUPPORTED;
+	}
+
 	/**
 	 * Sets the element of a given vector at a given position to a given value.
 	 *
diff --git a/include/graphblas/utils.hpp b/include/graphblas/utils.hpp
index 09c8d8b71..440a835f0 100644
--- a/include/graphblas/utils.hpp
+++ b/include/graphblas/utils.hpp
@@ -385,7 +385,10 @@ namespace grb {
 			}
 		}
 		/** Specialisation for void-valued matrice's masks */
-		template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+		template<
+			Descriptor descriptor,
+			typename MatrixDataType, typename ValuesType
+		>
 		static bool interpretMatrixMask(
 			const bool &assigned,
 			const ValuesType * const values,
@@ -398,40 +401,10 @@ namespace grb {
 		}
 
 		/** Specialisation for void-valued matrice's masks */
-		template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
-		static bool interpretMatrixMask(
-			const bool &assigned,
-			const ValuesType * const,
-			const size_t,
-			typename std::enable_if<
-				std::is_void< MatrixDataType >::value
-			>::type * = nullptr
-		) {
-			// set default mask to false
-			bool ret = assigned;
-			// check whether we should return the inverted value
-			if( descriptor & descriptors::invert_mask ) {
-				return !ret;
-			} else {
-				return ret;
-			}
-		}
-
-		/** Specialisation for void-valued matrice's masks */
-		template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
-		static bool interpretMatrixMask(
-			const bool &assigned,
-			const ValuesType * const values,
-			const size_t k,
-			typename std::enable_if<
-				!std::is_void< MatrixDataType >::value
-			>::type * = nullptr
-		) {
-			return interpretMask< descriptor, ValuesType >( assigned, values, k );
-		}
-
-		/** Specialisation for void-valued matrice's masks */
-		template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+		template<
+			Descriptor descriptor,
+			typename MatrixDataType, typename ValuesType
+		>
 		static bool interpretMatrixMask(
 			const bool &assigned,
 			const ValuesType * const,
diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index 6088a5927..617d4af8a 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -270,82 +270,77 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
 		return;
 	}
-
 	if( grb::nnz( output ) != 2 * n - 1 ) {
-		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+		std::cerr << "\t unexpected number of output elements ( "
+			<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
 		rc = FAILED;
 	}
-
-	for( const auto & triplet : output ) {
-		if( triplet.first.first != triplet.first.second && triplet.first.first != triplet.first.second - 1 ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
+	for( const auto &triplet : output ) {
+		if(
+			triplet.first.first != triplet.first.second &&
+			triplet.first.first != triplet.first.second - 1
+		) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
-		} if( (int) triplet.first.first != triplet.second ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
+		} if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
 			std::cerr << ", expected value "<< triplet.first.first <<".\n";
 			rc = FAILED;
 		}
 	}
-
-	if( rc != SUCCESS ) {
-		return;
-	}
-
-	
+	if( rc != SUCCESS ) { return; }
 
 	rc = grb::set( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
 		return;
 	}
-
 	if( grb::nnz( output ) != n ) {
-		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( output ) << " ), expected " << n <<".\n";
+		std::cerr << "\t unexpected number of output elements ( "
+			<< grb::nnz( output ) << " ), expected " << n <<".\n";
 		rc = FAILED;
 	}
-
-	for( const auto & triplet : output ) {
+	for( const auto &triplet : output ) {
 		if( triplet.first.first != triplet.first.second ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
-		} if( (int) triplet.first.first != triplet.second ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
+		} if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
 			std::cerr << ", expected value "<< triplet.first.first <<".\n";
 			rc = FAILED;
 		}
 	}
+	if( rc != SUCCESS ) { return; }
 
-	if( rc != SUCCESS ) {
-		return;
-	}
-
-
-
+	/* TODO check: what are the GraphBLAS semantics for this operation?
 	rc = grb::set< descriptors::invert_mask >( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
 		return;
 	}
-
 	if( grb::nnz( output ) != n - 1 ) {
-		std::cerr << "\t unexpected number of output elements ( " << grb::nnz( output ) << " ), expected " << n - 1 <<".\n";
+		std::cerr << "\t unexpected number of output elements ( "
+			<< grb::nnz( output ) << " ), expected " << n - 1 <<".\n";
 		rc = FAILED;
 	}
-
-	for( const auto & triplet : output ) {
+	for( const auto &triplet : output ) {
 		if( triplet.first.first != triplet.first.second - 1 ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n";
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
-		} if( (int) triplet.first.first != triplet.second ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second;
+		} if( triplet.first.first != triplet.second ) {
+			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
 			std::cerr << ", expected value "<< triplet.first.first <<".\n";
 			rc = FAILED;
 		}
 	}
-
-	if( rc != SUCCESS ) {
-		return;
-	}
+	if( rc != SUCCESS ) { return; }
+	*/
 }
 
 int main( int argc, char ** argv ) {

From 969ce5c80565f6b6844361626dceae726315a9e3 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sun, 17 Aug 2025 19:52:49 +0200
Subject: [PATCH 10/39] The code in the unit test was correct re invert_mask,
 even though presently it trips an internal static_assert. Fixing that would
 be a first TODO

---
 include/graphblas/descriptors.hpp | 24 +++++++++++++++++++++++-
 tests/unit/matrixSet.cpp          | 23 ++++++++++++-----------
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/include/graphblas/descriptors.hpp b/include/graphblas/descriptors.hpp
index cd0a5ab25..2c0a793ae 100644
--- a/include/graphblas/descriptors.hpp
+++ b/include/graphblas/descriptors.hpp
@@ -62,7 +62,20 @@ namespace grb {
 		 */
 		static constexpr Descriptor no_operation = 0;
 
-		/** Inverts the mask prior to applying it. */
+		/**
+		 * Inverts the mask prior to applying it.
+		 *
+		 * Applying this descriptor to a sparse mask may still only generate output
+		 * where the mask has elements. To decide whether an output will be generated
+		 * at a given element in the mask, its value will be inverted.
+		 *
+		 * If instead the structural complement is to be taken as a mask, this
+		 * descriptor must be combined with #grb::descriptors::structural.
+		 * ALP/GraphBLAS forbids taking the structural inverse of matrix masks
+		 * (because then either the output matrix or the mask matrix has
+		 * \f$ \mathcal{mn} \f$ values, which defeats any useful application of
+		 * GraphBLAS as this signifies one of the containers is, in fact, not sparse).
+		 */
 		static constexpr Descriptor invert_mask = 1;
 
 		/**
@@ -98,6 +111,11 @@ namespace grb {
 		 * i-th index, regardless of how that value evaluates. It evaluates false
 		 * if there was no value assigned.
 		 *
+		 * These semantics are inverted when this descriptor is combined with
+		 * #grb::descriptors::invert_mask: in that case, the mask evaluates true at
+		 * index \f$ i \f$ if the mask had no value at that index; and evaluates false
+		 * otherwise.
+		 *
 		 * @see structural_complement
 		 */
 		static constexpr Descriptor structural = 8;
@@ -113,6 +131,10 @@ namespace grb {
 		 * This ignores the actual values of the mask argument. The i-th element of
 		 * the mask now evaluates true if the mask has \em no value assigned to its
 		 * i-th index, and evaluates false otherwise.
+		 *
+		 * The application of this descriptor is forbidden for matrix mask arguments,
+		 * as otherwise either the output or the mask is dense or is (too) close to
+		 * being dense.
 		 */
 		static constexpr Descriptor structural_complement = structural | invert_mask;
 
diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index 617d4af8a..891b91c85 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -47,8 +47,6 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	grb::Matrix< int > output( n, n );
 	grb::Matrix< int > input( n, n );
 	
-
-	
 	rc = grb::resize( A, 15 );
 	if( rc == SUCCESS ) {
 		rc = grb::buildMatrixUnique( A, I, J, data1, 15, SEQUENTIAL );
@@ -71,14 +69,19 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		}
 	}
 
+	// initialise data for masked-set tests
+	//  - mask will be an n by n matrix with its diagonal set to 1 and its
+	//    superdiagonal set to 0.
+	//  - input will be an n by n matrix with each element on its diagonal and
+	//    superdiagonal set to its row index (meaning the entries on row 0 have
+	//    value 0).
 	size_t I_mask[ 2 * n - 1 ], J_mask[ 2 * n - 1 ];
 	int mask_vals [ 2 * n - 1 ];
 	int input_vals [ 2 * n - 1 ];
-
 	for( size_t k = 0; k < n; ++k ) {
 		I_mask[ k ] = J_mask[ k ] = k;
 		mask_vals[ k ] = 1;
-		input_vals[ k ] = k;
+		input_vals[ k ] = static_cast< int >( k );
 		if( k < n - 1 ) {
 			I_mask[ n + k ] = k;
 			J_mask[ n + k ] = k + 1;
@@ -86,14 +89,14 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 			input_vals[ n + k ] = k;
 		}
 	}
-
-	rc = grb::buildMatrixUnique( mask, I_mask, J_mask, mask_vals, 2 * n - 1, SEQUENTIAL );
+	rc = grb::buildMatrixUnique( mask, I_mask, J_mask, mask_vals, 2 * n - 1,
+		SEQUENTIAL );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
 		return;
 	}
-
-	rc = grb::buildMatrixUnique( input, I_mask, J_mask, input_vals, 2 * n - 1, SEQUENTIAL );
+	rc = grb::buildMatrixUnique( input, I_mask, J_mask, input_vals, 2 * n - 1,
+		SEQUENTIAL );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
 		return;
@@ -316,7 +319,6 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	}
 	if( rc != SUCCESS ) { return; }
 
-	/* TODO check: what are the GraphBLAS semantics for this operation?
 	rc = grb::set< descriptors::invert_mask >( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
@@ -332,7 +334,7 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
 				<< triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
-		} if( triplet.first.first != triplet.second ) {
+		} if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
 			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
 				<< triplet.first.second << " ) with value " << triplet.second;
 			std::cerr << ", expected value "<< triplet.first.first <<".\n";
@@ -340,7 +342,6 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		}
 	}
 	if( rc != SUCCESS ) { return; }
-	*/
 }
 
 int main( int argc, char ** argv ) {

From 17868d8995c2e75cc0044a24234de23db2eb174a Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sun, 17 Aug 2025 21:08:09 +0200
Subject: [PATCH 11/39] Code review on implementations for indirect backends

---
 include/graphblas/hyperdags/io.hpp   | 5 +++--
 include/graphblas/nonblocking/io.hpp | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp
index 6c93044b4..4ac5b4635 100644
--- a/include/graphblas/hyperdags/io.hpp
+++ b/include/graphblas/hyperdags/io.hpp
@@ -447,12 +447,13 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		typename OutputType, typename MaskType, typename InputType,
 		typename RIT1, typename CIT1, typename NIT1,
-		typename RIT2, typename CIT2, typename NIT2
+		typename RIT2, typename CIT2, typename NIT2,
+		typename RIT3, typename CIT3, typename NIT3
 	>
 	RC set(
 		Matrix< OutputType, hyperdags, RIT1, CIT1, NIT1 > &C,
 		const Matrix< MaskType, hyperdags, RIT2, CIT2, NIT2 > &M,
-		const Matrix< InputType, hyperdags, RIT2, CIT2, NIT2 > &A,
+		const Matrix< InputType, hyperdags, RIT3, CIT3, NIT3 > &A,
 		const Phase &phase = EXECUTE
 	) {
 		const RC ret = set< descr >(
diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp
index 022a50901..963846251 100644
--- a/include/graphblas/nonblocking/io.hpp
+++ b/include/graphblas/nonblocking/io.hpp
@@ -1232,12 +1232,13 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		typename OutputType, typename MaskType, typename InputType,
 		typename RIT1, typename CIT1, typename NIT1,
-		typename RIT2, typename CIT2, typename NIT2
+		typename RIT2, typename CIT2, typename NIT2,
+		typename RIT3, typename CIT3, typename NIT3
 	>
 	RC set(
 		Matrix< OutputType, nonblocking, RIT1, CIT1, NIT1 > &C,
 		const Matrix< MaskType, nonblocking, RIT2, CIT2, NIT2 > &M,
-		const Matrix< InputType, nonblocking, RIT2, CIT2, NIT2 > &A,
+		const Matrix< InputType, nonblocking, RIT3, CIT3, NIT3 > &A,
 		const Phase &phase = EXECUTE
 	) noexcept {
 		if( internal::NONBLOCKING::warn_if_not_native &&

From 6c7d9db84dcda4e2d56e06a4f2970af5f7c67854 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sun, 17 Aug 2025 21:10:15 +0200
Subject: [PATCH 12/39] Code review utils

---
 include/graphblas/utils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/utils.hpp b/include/graphblas/utils.hpp
index 440a835f0..4a2235501 100644
--- a/include/graphblas/utils.hpp
+++ b/include/graphblas/utils.hpp
@@ -384,7 +384,7 @@ namespace grb {
 				return ret;
 			}
 		}
-		/** Specialisation for void-valued matrice's masks */
+		/** Specialisation for void-valued matrix masks */
 		template<
 			Descriptor descriptor,
 			typename MatrixDataType, typename ValuesType
@@ -400,7 +400,7 @@ namespace grb {
 			return interpretMask< descriptor, ValuesType >( assigned, values, k );
 		}
 
-		/** Specialisation for void-valued matrice's masks */
+		/** Specialisation for void-valued matrix masks */
 		template<
 			Descriptor descriptor,
 			typename MatrixDataType, typename ValuesType

From 009ce240fd02507ccff216ffb98f57dc171aedb2 Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 18 Aug 2025 13:15:17 +0200
Subject: [PATCH 13/39] Wrote spec for set(matrix,matrix,matrix)

---
 include/graphblas/base/io.hpp | 136 ++++++++++++++++++++++++++++------
 1 file changed, 114 insertions(+), 22 deletions(-)

diff --git a/include/graphblas/base/io.hpp b/include/graphblas/base/io.hpp
index 4e493332b..c904570fc 100644
--- a/include/graphblas/base/io.hpp
+++ b/include/graphblas/base/io.hpp
@@ -1218,15 +1218,17 @@ namespace grb {
 	 *   -# #grb::descriptors::invert_mask, and
 	 *   -# #grb::descriptors::structural.
 	 *
-	 * However, and differently from most ALP primtivies, the
+	 * However, and differently from most ALP/GraphBLAS primitivies, the
 	 * #grb::descriptors::invert_mask and #grb::descriptors::structural are
 	 * mutually exclusive for this primitive.
 	 * \endparblock
 	 *
 	 * @tparam OutputType The type of each element in the given matrix.
 	 * @tparam MaskType   The type of each element in the given mask.
-	 * @tparam ValueType  The type of the given value. Should be convertible
-	 *                    to \a OutputType.
+	 * @tparam ValueType  The type of the given value.
+	 *
+	 * The given \a ValueType must be convertible to \a OutputType.
+	 *
 	 * @tparam RIT       The integer type for encoding row indices.
 	 * @tparam CIT       The integer type for encoding column indices.
 	 * @tparam NIT       The integer type for encoding nonzero indices.
@@ -1244,51 +1246,50 @@ namespace grb {
 	 *                  the default is #grb::EXECUTE.
 	 *
 	 * In #grb::RESIZE mode:
-	 * @returns #grb::SUCCESS  When the capacity of \a C has been (made) sufficient
-	 *                         to store the requested output.
+	 *
+	 * @returns #grb::SUCCESS  When the capacity of \a C (has been made or already
+	 *                         was) sufficient to store the requested output.
 	 * @returns #grb::OUTOFMEM When out-of-memory conditions have been met while
-	 *                         executing. If this error code is returned, \a C
+	 *                         resizing \a C. If this error code is returned, \a C
 	 *                         shall be unmodified compared to its state at
 	 *                         function entry.
 	 *
 	 * In #grb::EXECUTE mode:
-	 * @returns #grb::SUCCESS When the call completes successfully.
+	 *
+	 * @returns #grb::SUCCESS When the computation has completed or will execute
+	 *                        successfully.
 	 * @returns #grb::ILLEGAL When \a C did not have enough capacity to store the
 	 *                        output of the requested computation.
 	 *
 	 * Either mode may additionally return:
-	 * @returns #grb::ILLEGAL  In case the given \a mask was empty.
+	 *
 	 * @returns #grb::MISMATCH In case \a C and \a mask have mismatching sizes.
 	 * @returns #grb::PANIC    In case an unmitigable error was encountered. The
-	 *                         caller is suggested to exit gracefully, and in any
-	 *                         case to not make any further calls to ALP.
+	 *                         caller, when encountering this return code, is
+	 *                         suggested to exit gracefully and to not make any
+	 *                         further calls to ALP.
 	 *
 	 * When \a descr includes #grb::descriptors::no_casting then code shall not
 	 * compile if one of the following conditions are met:
 	 *  -# \a ValueType does not match \a OutputType; or
 	 *  -# \a MaskType does not match <tt>bool</tt>.
 	 *
-	 * In these cases, the code shall not compile: implementations must throw
-	 * a static assertion failure in this case.
-	 *
 	 * Similarly, it is forbidden to call this function with both following
 	 * descriptors simultaneously:
 	 *  - #grb::descriptors::invert_mask \em and #grb::descriptors::structural.
 	 *
 	 * The use of the #grb::descriptors::structural_complement descriptor hence is
-	 * is forbidden also. Implementations shall throw a static assertion failure
-	 * if the user nonetheless asks for structural mask inversion.
+	 * is forbidden also. These conditions, when encountered, should lead to
+	 * compile-time errors also.
+	 *
+	 * \note One vehicle to ensure compilation does not occur in these cases is via
+	 *       <tt>static_assert</tt>.
 	 *
 	 * \parblock
 	 * \par Performance semantics
 	 * Each backend must define performance semantics for this primitive.
 	 *
 	 * @see perfSemantics
-	 *
-	 * \warning Generally, if \a mask equals \a C and the mask is non-structural,
-	 *          then optimised implementations will assign higher costs than when
-	 *          \a mask does not equal \a C. This is because the nonzero structure
-	 *          update cannot be done in-place.
 	 * \endparblock
 	 */
 	template<
@@ -1320,7 +1321,98 @@ namespace grb {
 		return UNSUPPORTED;
 	}
 
-	/** TODO add specification */
+	/**
+	 * Sets all values of a matrix to that of a given source matrix, if and only if
+	 * the corresponding value coordinates evaluate <tt>true</tt> at the given mask
+	 * matrix.
+	 *
+	 * @tparam descr The descriptor used for this operation.
+	 *
+	 * \parblock
+	 * \par Accepted descriptors
+	 *   -# #grb::descriptors::no_operation,
+	 *   -# #grb::descriptors::no_casting,
+	 *   -# #grb::descriptors::invert_mask,
+	 *   -# #grb::descriptors::structural
+	 *
+	 * However, and differently from most ALP primtivies, the
+	 * #grb::descriptors::invert_mask and #grb::descriptors::structural are
+	 * mutually exclusive for this primitive.
+	 * \endparblock
+	 *
+	 * @tparam OutputType The type of each element in the destination matrix.
+	 * @tparam MaskType   The type of each element in the output mask.
+	 * @tparam ValueType  The type of each element in the source matrix.
+	 *
+	 * The given \a ValueType must be convertible to \a OutputType.
+	 *
+	 * \internal
+	 * @tparam RIT1       The integer type for encoding row indices in \a C.
+	 * @tparam CIT1       The integer type for encoding column indices in \a C.
+	 * @tparam NIT1       The integer type for encoding nonzero indices in \a C.
+	 * @tparam RIT2       The integer type for encoding row indices in \a mask.
+	 * @tparam CIT2       The integer type for encoding column indices in \a mask.
+	 * @tparam NIT2       The integer type for encoding nonzero indices in \a mask.
+	 * @tparam RIT3       The integer type for encoding row indices in \a A.
+	 * @tparam CIT3       The integer type for encoding column indices in \a A.
+	 * @tparam NIT3       The integer type for encoding nonzero indices in \a A.
+	 * @tparam backend    The backend selected for executing this primitive.
+	 * \endinternal
+	 *
+	 * @param[out] C     The matrix that will be a masked copy of \a A.
+	 * @param[in]  mask  Matrix that acts as output mask on \a C.
+	 * @param[in]  A     The source matrix which will (partially) be copied to
+	 *                   \a A.
+	 * @param[in]  phase Which #grb::Phase of the operation is requested. Optional;
+	 *                   the default is #grb::EXECUTE.
+	 *
+	 * In #grb::RESIZE mode:
+	 *
+	 * @returns #grb::SUCCESS  When the capacity of \a C (has been made or already
+	 *                         was) sufficient to store the requested output.
+	 * @returns #grb::OUTOFMEM When out-of-memory conditions were met while
+	 *                         resizing \a C. If this error code is returned, \a C
+	 *                         shall be left unmodified compared to its state at
+	 *                         function entry.
+	 *
+	 * In #grb::EXECUTE mode:
+	 *
+	 * @returns #grb::SUCCESS When the computation has completed or will execute
+	 *                        successfully.
+	 * @returns #grb::ILLEGAL When \a C did not have enough capacity to store the
+	 *                        output of the requested computation.
+	 *
+	 * Either mode may additionally return:
+	 *
+	 * @returns #grb::MISMATCH When \a A and \a C have mismatching sizes.
+	 * @returns #grb::MISMATCH When \a C and \a mask have mismatching sizes.
+	 * @returns #grb::PANIC    In case an unmitigable error was encountered. The
+	 *                         caller, when encountering this return code, is
+	 *                         suggested to exit the program gracefully and to not
+	 *                         make any further calls to ALP.
+	 *
+	 * When \a descr includes #grb::descriptors::no_casting, then code shall not
+	 * comile if one of the following conditions are met:
+	 *  -# \a ValueType does not match \a OutputType; or
+	 *  -# \a MaskType does not match <tt>bool</tt>.
+	 *
+	 * Similarly, it is forbidden to call this function with both following
+	 * descriptors simultaneously:
+	 *  - #grb::descriptors::invert_mask \em and #grb::descriptors::structural.
+	 *
+	 * The use of the #grb::descriptors::structural_complement descriptor hence is
+	 * is forbidden also. These conditions should lead to compile-time errors also.
+	 *
+	 * \note One vehicle to ensure compilation does not occur in these cases is via
+	 *       <tt>static_assert</tt>.
+	 *
+	 * \parblock
+	 * \par Performance semantics
+	 * Each backend must define performance semantics for this primitive.
+	 *
+	 * @see perfSemantics
+	 * \endparblock
+	 */
 	template<
 		Descriptor descr = descriptors::no_operation,
 		typename OutputType, typename MaskType, typename ValueType,
@@ -1794,7 +1886,7 @@ namespace grb {
 	 * successfully, then a call to this function shall return #grb::SUCCESS.
 	 *
 	 * The are several other cases in which the computation of nonblocking
-	 * primtives is forced:
+	 * primitives is forced:
 	 *   -# whenever an output iterator of an output container of any of the non-
 	 *      blocking primitives is requested; and
 	 *   -# whenever an output container of any of the non-blocking primitives is

From 473e70018daed8661c0a5cafbad947aba2e7ff4e Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Mon, 18 Aug 2025 14:02:18 +0200
Subject: [PATCH 14/39] Initial code review. Main TODO: parallelise this
 operation using the same strategy as for the mxm, i.e., get as many SPAs as
 possible and use those to parallelise

---
 include/graphblas/phase.hpp        |   2 +-
 include/graphblas/reference/io.hpp | 114 +++++++++++++++--------------
 2 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/include/graphblas/phase.hpp b/include/graphblas/phase.hpp
index 1f12b8f05..044395ac9 100644
--- a/include/graphblas/phase.hpp
+++ b/include/graphblas/phase.hpp
@@ -236,7 +236,7 @@ namespace grb {
 		 * execute phase.
 		 *
 		 * If, instead, the output container capacity was found to be insufficient,
-		 * then the requested operation may return #grb::FAILED, in which case the
+		 * then the requested operation may return #grb::ILLEGAL, in which case the
 		 * contents of output containers shall be cleared.
 		 *
 		 * \note That on failure a primitive called using the execute phase may
diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 032027fb2..447a2d9b9 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2032,30 +2032,32 @@ namespace grb {
 		Descriptor descr = descriptors::no_operation,
 		typename OutputType, typename MaskType, typename InputType,
 		typename RIT1, typename CIT1, typename NIT1,
-		typename RIT2, typename CIT2, typename NIT2
+		typename RIT2, typename CIT2, typename NIT2,
+		typename RIT3, typename CIT3, typename NIT3
 	>
 	RC set(
 		Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C,
 		const Matrix< MaskType, reference, RIT2, CIT2, NIT2 > &M,
-		const Matrix< InputType, reference, RIT2, CIT2, NIT2 > &A,
+		const Matrix< InputType, reference, RIT3, CIT3, NIT3 > &A,
 		const Phase &phase = EXECUTE
 	) noexcept {
-		static_assert( !std::is_void< OutputType >::value,
-			"grb::set (masked set to matrix): cannot have a pattern "
-			"matrix as output" );
-		static_assert( std::is_convertible< InputType, OutputType >::value,
+		// static checks
+		static_assert(
+			!std::is_void< InputType >::value ||
+				std::is_void< OutputType >::value, "grb::set( masked set to matrix ): "
+			"cannot have a pattern matrix as input unless the output is also a pattern "
+			"matrix"
+		);
+		static_assert(
+			std::is_convertible< InputType, OutputType >::value,
 			"grb::set (masked set to matrix): input type cannot be "
 			"converted to output type"
 		);
 		static_assert(
-			! ( descr & descriptors::structural && descr & descriptors::invert_mask ),
-			"grb::set can not be called with both descriptors::structural "
-			"and descriptors::invert_mask in the masked variant"
+			!(descr & descriptors::structural && (descr & descriptors::invert_mask)),
+			"grb::set (masked set to matrix) may not be called with both the structural "
+			"and invert_mask descriptors set"
 		);
-#ifdef _DEBUG
-		std::cout << "Called grb::set (matrix-to-matrix-masked, reference)\n";
-#endif
-		// static checks
 		NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
 				std::is_same< InputType, OutputType >::value
 			), "grb::set",
@@ -2063,24 +2065,23 @@ namespace grb {
 		);
 
 		// dynamic checks
+#ifdef _DEBUG
+		std::cout << "Called grb::set (matrix-to-matrix-masked, reference)\n";
+#endif
 		assert( phase != TRY );
-
 		const size_t nrows = grb::nrows( C );
 		const size_t ncols = grb::ncols( C );
-
 		const size_t m = grb::nrows( M );
 		const size_t n = grb::ncols( M );
 
-		/*grb::Monoid<
-			grb::operators::left_assign< OutputType >,
-			grb::identities::zero
-		> dummyMonoid;*/
-
+		// check for trivial dispatch first (otherwise the below checks fail when they
+		// should not)
 		if( m == 0 || n == 0 ) {
-			// If the mask has a null size, it will be ignored
+			// If the mask is empty, ignore it
 			return set< descr >( C, A, phase );
 		}
 
+		// dynamic checks, continued
 		if( nrows != grb::nrows( A ) || nrows != m ) {
 			return MISMATCH;
 		}
@@ -2089,20 +2090,21 @@ namespace grb {
 			return MISMATCH;
 		}
 
-		const auto &mask_raw = internal::getCRS( M );
-
-		const auto &A_raw = internal::getCRS( A );
-
+		// go for implementation, preliminaries:
 		size_t nzc = 0;
-
 		char * mask_arr = nullptr;
 		char * mask_buf = nullptr;
 		MaskType * mask_valbuf = nullptr;
-		internal::getMatrixBuffers( mask_arr, mask_buf, mask_valbuf, 1, M );
-
+		const auto &A_raw = internal::getCRS( A );
+		const auto &mask_raw = internal::getCRS( M );
 		internal::Coordinates< reference > mask_coors;
+		internal::getMatrixBuffers( mask_arr, mask_buf, mask_valbuf, 1, M );
 		mask_coors.set( mask_arr, false, mask_buf, ncols );
 
+		// we now have one (guaranteed) SPA, which is mask_coors. We now are going to
+		// check how many more SPAs ideally we would like (for reference_omp), and
+		// then go about trying to get those. If, finally, we get just this one SPA,
+		// we will go into this mostly-sequential code (essentially, big-Omega nrows):
 		for( size_t i = 0; i < nrows; ++i ) {
 			mask_coors.clear();
 			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
@@ -2112,22 +2114,24 @@ namespace grb {
 				}
 			}
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
-			#pragma omp parallel for reduction(+:nzc)
+			#pragma omp parallel for reduction( +: nzc ) \
+				schedule( dynamic, config::CACHE_LINE_SIZE::value() )
 #endif
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = A_raw.row_index[ k ];
 				if( mask_coors.assigned( k_col ) ) {
-					nzc++;
+					(void) nzc++;
 				}
 			}
 		}
 
+		// we now have a count. If we're in the resize phase that means we're done:
 		if( phase == RESIZE ) {
 			return resize( C, nzc );
 		}
 
+		// otherwise, we now compute the output. We start with checking capacity
 		assert( phase == EXECUTE );
-
 		if( capacity( C ) < nzc ) {
 #ifdef _DEBUG
 			std::cout << "\t insufficient capacity to complete "
@@ -2137,26 +2141,29 @@ namespace grb {
 			if( clear_rc != SUCCESS ) {
 				return PANIC;
 			} else {
-				return FAILED;
+				return ILLEGAL;
 			}
 		}
 
+		// get output CRS and CCS structures
+		// TODO: check for crs_only descriptor
 		auto &CRS_raw = internal::getCRS( C );
 		auto &CCS_raw = internal::getCCS( C );
-
 		config::NonzeroIndexType * C_col_index = internal::template
 			getReferenceBuffer< typename config::NonzeroIndexType >( ncols + 1 );
-
 		CRS_raw.col_start[ 0 ] = 0;
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
+		// TODO ALPify the below
 		#pragma omp parallel for simd
 #endif
 		for( size_t j = 0; j <= ncols; ++j ) {
 			CCS_raw.col_start[ j ] = 0;
+			C_col_index[ j ] = 0;
 		}
 
-
+		// do counting sort, phase 1 -- also this loop should employ the same
+		// parallelisation strategy during counting
 		nzc = 0;
 		for( size_t i = 0; i < nrows; ++i ) {
 			mask_coors.clear();
@@ -2169,33 +2176,30 @@ namespace grb {
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = A_raw.row_index[ k ];
 				if( mask_coors.assigned( k_col ) ) {
-					nzc++;
-					CCS_raw.col_start[ k_col + 1 ]++;
+					(void) nzc++;
+					(void) (CCS_raw.col_start[ k_col + 1 ])++;
 				}
 			}
 			CRS_raw.col_start[ i + 1 ] = nzc;
 		}
 
+		// TODO this is a prefix sum -- use the OMP utility function here to
+		//      parallelise it
 		for( size_t j = 1; j < ncols; ++j ) {
 			CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ];
 		}
 
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-		#pragma omp parallel for simd
-#endif
-		for( size_t j = 0; j < ncols; ++j ) {
-			C_col_index[ j ] = 0;
-		}
-
-
-		// use previously computed CCS offset array to update CCS during the
-		// computational phase
+		// do counting sort, phase 2 -- use previously computed CCS offset array to
+		// update CCS during the computational phase. Also this loop should employ
+		// the same (multiple-SPA) parallelisation strategy as above
 		nzc = 0;
 		for( size_t i = 0; i < nrows; ++i ) {
 			mask_coors.clear();
 			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = mask_raw.row_index[ k ];
-				if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+				if( utils::interpretMatrixMask< descr, MaskType >(
+					true, mask_raw.getValues(), k )
+				) {
 					mask_coors.assign( k_col );
 				}
 			}
@@ -2206,30 +2210,28 @@ namespace grb {
 					CRS_raw.row_index[ nzc ] = k_col;
 					CRS_raw.setValue( nzc, val );
 					const size_t CCS_index = C_col_index[ k_col ] + CCS_raw.col_start[ k_col ];
-					C_col_index[ k_col ]++;
+					(void) C_col_index[ k_col ]++;
 					CCS_raw.row_index[ CCS_index ] = i;
 					CCS_raw.setValue( CCS_index, val );
-					nzc++;
+					(void) nzc++;
 				}
 			}
 		}
 #ifndef NDEBUG
+ #ifdef _H_GRB_REFERENCE_OMP_BLAS3
+		#pragma omp parallel schedule( static, config::CACHE_LINE_SIZE::value() )
 		for( size_t j = 0; j < ncols; ++j ) {
 			assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] ==
 				C_col_index[ j ] );
 		}
+ #endif
 #endif
-
 		internal::setCurrentNonzeroes( C, nzc );
 
+		// done
 		return SUCCESS;
 	}
 
-
-
-
-	
-
 	/**
 	 * Ingests raw data into a GraphBLAS vector.
 	 *

From 777f64018ec2816ad800f237ff434bcbb47b9a27 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 14:23:22 +0200
Subject: [PATCH 15/39] Fix static error

---
 include/graphblas/reference/io.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 447a2d9b9..301829187 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -956,9 +956,9 @@ namespace grb {
 				"internal error. Please submit a bug report."
 			);
 			static_assert(
-				!(descr & descriptors::invert_mask), "internal::grb::set_copy called with "
-				"the invert_mask descriptor. This is an internal error; please submit a "
-				"bug report."
+				!A_is_mask || !(descr & descriptors::invert_mask),
+				"internal::grb::set_copy called with the invert_mask descriptor. This is "
+				"an internal error; please submit a bug report."
 			);
 
 			// run-time checks

From 3daf10a1650b139b130a58dc3c4cb1505233f9f4 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 14:24:09 +0200
Subject: [PATCH 16/39] Fix silly snafu my side

---
 include/graphblas/reference/io.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 301829187..cab931989 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2220,11 +2220,11 @@ namespace grb {
 #ifndef NDEBUG
  #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 		#pragma omp parallel schedule( static, config::CACHE_LINE_SIZE::value() )
+ #endif
 		for( size_t j = 0; j < ncols; ++j ) {
 			assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] ==
 				C_col_index[ j ] );
 		}
- #endif
 #endif
 		internal::setCurrentNonzeroes( C, nzc );
 

From a0c297e1a32d921eb488a988a7708220447dd1be Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 15:39:03 +0200
Subject: [PATCH 17/39] Made masked tests generic, and add test for empty mask.
 Passes OK

---
 tests/unit/matrixSet.cpp | 338 +++++++++++++++++++++++++++------------
 1 file changed, 233 insertions(+), 105 deletions(-)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index 891b91c85..27eeb75c9 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -19,6 +19,7 @@
 #include <iostream>
 
 #include <graphblas.hpp>
+#include <graphblas/algorithms/matrix_factory.hpp>
 
 
 using namespace grb;
@@ -27,36 +28,214 @@ static const int data1[ 15 ] = { 4, 7, 4, 6, 4, 7, 1, 7, 3, 6, 7, 5, 1, 8, 7 };
 static const size_t I[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 8, 7, 6 };
 static const size_t J[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 5, 7, 5, 1 };
 
-void grb_program( const size_t &n, grb::RC &rc ) {
-	// initialize test
-	int chk[ 10 ][ 10 ];
-	for( size_t i = 0; i < 10; ++i ) {
-		for( size_t j = 0; j < 10; ++j ) {
-			chk[ i ][ j ] = 0;
+/** Generic implementation of masked tests */
+template< typename Tout, typename Tmask, typename Tin >
+RC masked_tests_generic_impl(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
+
+	std::cout << "\t\t with structural descriptor\n";
+	rc = grb::set< descriptors::structural >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( grb::nnz( output ) != 2 * n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( "
+			<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+		rc = FAILED;
+	}
+	for( const auto &triplet : output ) {
+		if(
+			triplet.first.first != triplet.first.second &&
+			triplet.first.first != triplet.first.second - 1
+		) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ), value " << triplet.second << ".\n";
+			rc = FAILED;
+		}
+		if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
 		}
 	}
-	for( size_t k = 0; k < 15; ++k ) {
-		chk[ I[ k ] ][ J[ k ] ] = data1[ k ];
+	if( rc != SUCCESS ) { return rc; }
+
+	std::cout << "\t\t without descriptor\n";
+	rc = grb::set( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( emptyMask ) {
+		if( grb::nnz( output ) != 2 * n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+			rc = FAILED;
+		}
+	} else {
+		if( grb::nnz( output ) != n ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << n <<".\n";
+			rc = FAILED;
+		}
+	}
+	for( const auto &triplet : output ) {
+		if( emptyMask ) {
+			if( triplet.first.first != triplet.first.second &&
+				triplet.first.first != triplet.first.second - 1
+			) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+					<< triplet.first.second << " ), value " << triplet.second << ".\n";
+				rc = FAILED;
+			}
+		} else {
+			if( triplet.first.first != triplet.first.second ) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+					<< triplet.first.second << " ), value " << triplet.second << ".\n";
+				rc = FAILED;
+			}
+		}
+		if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
+		}
 	}
+	if( rc != SUCCESS ) { return rc; }
+
+	// done
+	return rc;
+}
+
+/** Implementation of masked tests for non-void masks (nvm). */
+template< typename Tout, typename Tmask, typename Tin >
+RC masked_tests_nvm_impl(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
+
+	std::cout << "\t\t with invert_mask descriptor\n";
+	rc = grb::set< descriptors::invert_mask >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( emptyMask ) {
+		if( grb::nnz( output ) != 2 * n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+			rc = FAILED;
+		}
+	} else {
+		if( grb::nnz( output ) != n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << n - 1 <<".\n";
+			rc = FAILED;
+		}
+	}
+	for( const auto &triplet : output ) {
+		if( emptyMask ) {
+			if( triplet.first.first != triplet.first.second &&
+				triplet.first.first != triplet.first.second - 1
+			) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+					<< triplet.first.second << " ), value " << triplet.second << ".\n";
+				rc = FAILED;
+			}
+		} else {
+			if( triplet.first.first != triplet.first.second - 1 ) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+					<< triplet.first.second << " ), value " << triplet.second << ".\n";
+				rc = FAILED;
+			}
+		}
+		if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
+		}
+	}
+	if( rc != SUCCESS ) { return rc; }
+
+	// done
+	return rc;
+}
+
+/** Dispatch for generic masked tests */
+template< typename Tout, typename Tmask, typename Tin >
+RC masked_tests(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	if( masked_tests_generic_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	return masked_tests_nvm_impl( rc, output, mask, input, n );
+}
+
+/** Specialised dispatched for masked tests with void masks */
+template< typename Tout, typename Tin >
+RC masked_tests(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< void > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const grb::RC ret = masked_tests_generic_impl( rc, output, mask, input, n );
+	std::cout << "\t\t invert_mask descriptor SKIPPED\n";
+	return ret;
+}
+
+void grb_program( const size_t &n, grb::RC &rc ) {
+	// initialize non-masked test containers
 	grb::Matrix< double > A( n, n );
 	grb::Matrix< double > B( n, n );
 	grb::Matrix< void > C( n, n );
 	grb::Matrix< void > D( n, n );
 	grb::Matrix< unsigned int > E( n, n );
+
+	// initalize masked test containers
 	grb::Matrix< int > mask( n, n );
-	grb::Matrix< int > output( n, n );
+	grb::Matrix< bool > maskBool( n, n );
+	grb::Matrix< void > maskVoid( n, n );
+	grb::Matrix< double > maskEmpty( 0, 0 );
 	grb::Matrix< int > input( n, n );
+	grb::Matrix< void > inputVoid( n, n );
+	grb::Matrix< float > inputFloat( n, n );
+	grb::Matrix< int > output( n, n );
+	grb::Matrix< void > outputVoid( n, n );
 	
+	// initialise non-masked test data
+	int chk[ 10 ][ 10 ];
+	for( size_t i = 0; i < 10; ++i ) {
+		for( size_t j = 0; j < 10; ++j ) {
+			chk[ i ][ j ] = 0;
+		}
+	}
+	for( size_t k = 0; k < 15; ++k ) {
+		chk[ I[ k ] ][ J[ k ] ] = data1[ k ];
+	}
+
 	rc = grb::resize( A, 15 );
 	if( rc == SUCCESS ) {
 		rc = grb::buildMatrixUnique( A, I, J, data1, 15, SEQUENTIAL );
 		for( const auto &triplet : A ) {
 			if( triplet.first.first >= 10 || triplet.first.second >= 10 ) {
-				std::cerr << "\tunexpected entry at A( " << triplet.first.first << ", "
+				std::cerr << "\t unexpected entry at A( " << triplet.first.first << ", "
 					<< triplet.first.second << " ).\n";
 				rc = FAILED;
 			} else if( chk[ triplet.first.first ][ triplet.first.second ] != triplet.second ) {
-				std::cerr << "\tunexpected entry at A( " << triplet.first.first << ", "
+				std::cerr << "\t unexpected entry at A( " << triplet.first.first << ", "
 					<< triplet.first.second << " ) with value " << triplet.second;
 				if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) {
 					std::cerr << ", expected no entry here.\n";
@@ -68,6 +247,15 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 			}
 		}
 	}
+	rc = rc ? rc : grb::resize( B, 15 );
+	rc = rc ? rc : grb::resize( C, 15 );
+	rc = rc ? rc : grb::resize( D, 15 );
+	rc = rc ? rc : grb::resize( E, 15 );
+	rc = rc ? rc : grb::resize(output, 2 * n - 1 );
+	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
+		std::cerr << "\tinitialisation FAILED\n";
+		return;
+	}
 
 	// initialise data for masked-set tests
 	//  - mask will be an n by n matrix with its diagonal set to 1 and its
@@ -95,32 +283,37 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
 		return;
 	}
-	rc = grb::buildMatrixUnique( input, I_mask, J_mask, input_vals, 2 * n - 1,
+	rc = grb::buildMatrixUnique( maskBool, I_mask, J_mask, mask_vals, 2 * n - 1,
 		SEQUENTIAL );
 	if( rc != SUCCESS ) {
-		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
+		std::cerr << "\t buildMatrixUnique of maskBool matrix FAILED\n";
 		return;
 	}
-
-	if( rc == SUCCESS ) {
-		rc = grb::resize( B, 15 );
-	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( C, 15 );
-	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( D, 15 );
+	try {
+		maskVoid = grb::algorithms::matrices< void >::identity( n );
+	} catch( ... ) {
+		std::cerr << "\t constructing maskVoid FAILED\n";
+		return;
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( E, 15 );
+	rc = grb::buildMatrixUnique( input, I_mask, J_mask, input_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
+		return;
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize(output, 2 * n - 1 );
+	rc = grb::buildMatrixUnique( inputFloat, I_mask, J_mask, input_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of inputFloat matrix FAILED\n";
+		return;
 	}
-	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
-		std::cerr << "\tinitialisation FAILED\n";
+	try {
+		inputVoid = grb::algorithms::matrices< void >::identity( n );
+	} catch( ... ) {
+		std::cerr << "\t constructing inputVoid FAILED\n";
 		return;
 	}
+
 	std::cout << "\t test initialisation complete\n";
 
 	// check grb::set for non-voids
@@ -137,11 +330,11 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	}
 	for( const auto &triplet : B ) {
 		if( triplet.first.first >= 10 || triplet.first.second >= 10 ) {
-			std::cerr << "\tunexpected entry at B( " << triplet.first.first << ", "
+			std::cerr << "\t unexpected entry at B( " << triplet.first.first << ", "
 				<< triplet.first.second << " ).\n";
 			rc = FAILED;
 		} else if( chk[ triplet.first.first ][ triplet.first.second ] != triplet.second ) {
-			std::cerr << "\tunexpected entry at B( " << triplet.first.first << ", "
+			std::cerr << "\t unexpected entry at B( " << triplet.first.first << ", "
 				<< triplet.first.second << " ) with value " << triplet.second;
 			if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) {
 				std::cerr << ", expected no entry here.\n";
@@ -216,14 +409,14 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	}
 	for( const auto &triplet : E ) {
 		if( triplet.first.first >= 10 || triplet.first.second >= 10 ) {
-			std::cerr << "\tunexpected entry at E( " << triplet.first.first << ", "
+			std::cerr << "\t unexpected entry at E( " << triplet.first.first << ", "
 				<< triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
 		} else if( static_cast< unsigned int >(
 				chk[ triplet.first.first ][ triplet.first.second ]
 			) != triplet.second
 		) {
-			std::cerr << "\tunexpected entry at E( " << triplet.first.first << ", "
+			std::cerr << "\t unexpected entry at E( " << triplet.first.first << ", "
 				<< triplet.first.second << " ) with value " << triplet.second;
 			if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) {
 				std::cerr << ", expected no entry here.\n";
@@ -250,11 +443,11 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	}
 	for( const auto &triplet : E ) {
 		if( triplet.first.first >= 10 || triplet.first.second >= 10 ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
 				<< triplet.first.second << " ), value " << triplet.second << ".\n";
 			rc = FAILED;
 		} else if( 117 != triplet.second ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
 				<< triplet.first.second << " ) with value " << triplet.second;
 			if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) {
 				std::cerr << ", expected no entry here.\n";
@@ -268,80 +461,15 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 
 	//check masked matrix set
 
-	rc = grb::set< descriptors::structural >( output, mask, input );
-	if( rc != SUCCESS ) {
-		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
+	std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, "
+		<< "empty mask\n";
+	if( masked_tests( rc, output, maskEmpty, input, n ) != grb::SUCCESS ) {
 		return;
 	}
-	if( grb::nnz( output ) != 2 * n - 1 ) {
-		std::cerr << "\t unexpected number of output elements ( "
-			<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
-		rc = FAILED;
-	}
-	for( const auto &triplet : output ) {
-		if(
-			triplet.first.first != triplet.first.second &&
-			triplet.first.first != triplet.first.second - 1
-		) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
-				<< triplet.first.second << " ), value " << triplet.second << ".\n";
-			rc = FAILED;
-		} if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
-				<< triplet.first.second << " ) with value " << triplet.second;
-			std::cerr << ", expected value "<< triplet.first.first <<".\n";
-			rc = FAILED;
-		}
-	}
-	if( rc != SUCCESS ) { return; }
-
-	rc = grb::set( output, mask, input );
-	if( rc != SUCCESS ) {
-		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
-		return;
-	}
-	if( grb::nnz( output ) != n ) {
-		std::cerr << "\t unexpected number of output elements ( "
-			<< grb::nnz( output ) << " ), expected " << n <<".\n";
-		rc = FAILED;
-	}
-	for( const auto &triplet : output ) {
-		if( triplet.first.first != triplet.first.second ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
-				<< triplet.first.second << " ), value " << triplet.second << ".\n";
-			rc = FAILED;
-		} if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
-				<< triplet.first.second << " ) with value " << triplet.second;
-			std::cerr << ", expected value "<< triplet.first.first <<".\n";
-			rc = FAILED;
-		}
-	}
-	if( rc != SUCCESS ) { return; }
 
-	rc = grb::set< descriptors::invert_mask >( output, mask, input );
-	if( rc != SUCCESS ) {
-		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
-		return;
-	}
-	if( grb::nnz( output ) != n - 1 ) {
-		std::cerr << "\t unexpected number of output elements ( "
-			<< grb::nnz( output ) << " ), expected " << n - 1 <<".\n";
-		rc = FAILED;
-	}
-	for( const auto &triplet : output ) {
-		if( triplet.first.first != triplet.first.second - 1 ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
-				<< triplet.first.second << " ), value " << triplet.second << ".\n";
-			rc = FAILED;
-		} if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
-			std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", "
-				<< triplet.first.second << " ) with value " << triplet.second;
-			std::cerr << ", expected value "<< triplet.first.first <<".\n";
-			rc = FAILED;
-		}
-	}
-	if( rc != SUCCESS ) { return; }
+	std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, "
+		<< "non-empty mask\n";
+	if( masked_tests( rc, output, mask, input, n ) != grb::SUCCESS ) { return; }
 }
 
 int main( int argc, char ** argv ) {

From 4f5fedd61039fe5b5bcca376e61de92319beb422 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 15:46:46 +0200
Subject: [PATCH 18/39] Enable casting tests, use Boolean and non-Boolean masks

---
 tests/unit/matrixSet.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index 27eeb75c9..d1943d647 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -470,6 +470,30 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, "
 		<< "non-empty mask\n";
 	if( masked_tests( rc, output, mask, input, n ) != grb::SUCCESS ) { return; }
+
+	std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, "
+		<< "non-empty Boolean mask\n";
+	if( masked_tests( rc, output, maskBool, input, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from "
+		<< "float to int, empty mask\n";
+	if( masked_tests( rc, output, maskEmpty, inputFloat, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from "
+		<< "float to int, non-empty mask\n";
+	if( masked_tests( rc, output, mask, inputFloat, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from "
+		<< "float to int, non-empty Boolean mask\n";
+	if( masked_tests( rc, output, maskBool, inputFloat, n ) != grb::SUCCESS ) {
+		return;
+	}
 }
 
 int main( int argc, char ** argv ) {

From 47029c5fc8a544c402039982479c14bcb2b556b6 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 16:12:04 +0200
Subject: [PATCH 19/39] Bugfix for no_casting set-matrix-to-matrix

---
 include/graphblas/reference/io.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index cab931989..36a5c3ff6 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -951,7 +951,8 @@ namespace grb {
 			);
 			static_assert(
 				( !(descr & descriptors::no_casting) ||
-					( A_is_mask && std::is_same< InputType2, OutputType >::value ) ),
+					( A_is_mask && std::is_same< InputType2, OutputType >::value ) ||
+					( !A_is_mask && std::is_same< InputType1, OutputType >::value ) ),
 				"grb::internal::set_copy called with non-matching value types. This is an "
 				"internal error. Please submit a bug report."
 			);

From dff5ec278e3ce1754b5e0168c5c08ed7eb4babbb Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 16:12:31 +0200
Subject: [PATCH 20/39] Add no-casting tests

---
 tests/unit/matrixSet.cpp | 43 ++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index d1943d647..b1079a7db 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -29,7 +29,10 @@ static const size_t I[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 8, 7, 6 };
 static const size_t J[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 5, 7, 5, 1 };
 
 /** Generic implementation of masked tests */
-template< typename Tout, typename Tmask, typename Tin >
+template<
+	Descriptor descr = descriptors::no_operation,
+	typename Tout, typename Tmask, typename Tin
+>
 RC masked_tests_generic_impl(
 	RC &rc, grb::Matrix< Tout > &output,
 	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
@@ -38,7 +41,7 @@ RC masked_tests_generic_impl(
 	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
 
 	std::cout << "\t\t with structural descriptor\n";
-	rc = grb::set< descriptors::structural >( output, mask, input );
+	rc = grb::set< descr | descriptors::structural >( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
 		return rc;
@@ -67,7 +70,7 @@ RC masked_tests_generic_impl(
 	if( rc != SUCCESS ) { return rc; }
 
 	std::cout << "\t\t without descriptor\n";
-	rc = grb::set( output, mask, input );
+	rc = grb::set< descr >( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
 		return rc;
@@ -115,7 +118,10 @@ RC masked_tests_generic_impl(
 }
 
 /** Implementation of masked tests for non-void masks (nvm). */
-template< typename Tout, typename Tmask, typename Tin >
+template<
+	Descriptor descr = descriptors::no_operation,
+	typename Tout, typename Tmask, typename Tin
+>
 RC masked_tests_nvm_impl(
 	RC &rc, grb::Matrix< Tout > &output,
 	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
@@ -124,7 +130,7 @@ RC masked_tests_nvm_impl(
 	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
 
 	std::cout << "\t\t with invert_mask descriptor\n";
-	rc = grb::set< descriptors::invert_mask >( output, mask, input );
+	rc = grb::set< descr | descriptors::invert_mask >( output, mask, input );
 	if( rc != SUCCESS ) {
 		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
 		return rc;
@@ -184,7 +190,7 @@ RC masked_tests(
 	return masked_tests_nvm_impl( rc, output, mask, input, n );
 }
 
-/** Specialised dispatched for masked tests with void masks */
+/** Specialised dispatch for masked tests with void masks */
 template< typename Tout, typename Tin >
 RC masked_tests(
 	RC &rc, grb::Matrix< Tout > &output,
@@ -196,6 +202,31 @@ RC masked_tests(
 	return ret;
 }
 
+/** Specialised dispatch for masked tests with no-cast domains */
+template< typename T >
+RC masked_tests(
+	RC &rc, grb::Matrix< T > &output,
+	const grb::Matrix< bool > &mask, const grb::Matrix< T > &input,
+	const size_t n
+) {
+	if( masked_tests_generic_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	if( masked_tests_nvm_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	std::cout << "\t re-running previous tests with no_casting descriptor\n";
+	if(
+		masked_tests_generic_impl< descriptors::no_casting >(
+			rc, output, mask, input, n
+		) != SUCCESS
+	) {
+		return rc;
+	}
+	return masked_tests_nvm_impl< descriptors::no_casting >(
+			rc, output, mask, input, n );
+}
+
 void grb_program( const size_t &n, grb::RC &rc ) {
 	// initialize non-masked test containers
 	grb::Matrix< double > A( n, n );

From ebbd717c01fc4c312f21279e3d4d649cc4d6f83b Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 16:59:05 +0200
Subject: [PATCH 21/39] Bugfix

---
 include/graphblas/reference/io.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 36a5c3ff6..bcce52b34 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2207,13 +2207,13 @@ namespace grb {
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = A_raw.row_index[ k ];
 				if( mask_coors.assigned( k_col ) ) {
-					OutputType val = A_raw.getValue( k, *( (OutputType*) nullptr ) );
+					constexpr int zero = 0;
 					CRS_raw.row_index[ nzc ] = k_col;
-					CRS_raw.setValue( nzc, val );
+					CRS_raw.setValue( nzc, A_raw.getValue( k, zero ) );
 					const size_t CCS_index = C_col_index[ k_col ] + CCS_raw.col_start[ k_col ];
 					(void) C_col_index[ k_col ]++;
 					CCS_raw.row_index[ CCS_index ] = i;
-					CCS_raw.setValue( CCS_index, val );
+					CCS_raw.setValue( CCS_index, A_raw.getValue( k, zero ) );
 					(void) nzc++;
 				}
 			}

From 1525dbe567c86e48eb58985efa57ed0bcc498145 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 17:09:55 +0200
Subject: [PATCH 22/39] Better debug tracing

---
 include/graphblas/reference/io.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index bcce52b34..f201daae4 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2066,7 +2066,7 @@ namespace grb {
 		);
 
 		// dynamic checks
-#ifdef _DEBUG
+#ifdef _DEBUG_REFERENCE_IO
 		std::cout << "Called grb::set (matrix-to-matrix-masked, reference)\n";
 #endif
 		assert( phase != TRY );
@@ -2078,6 +2078,9 @@ namespace grb {
 		// check for trivial dispatch first (otherwise the below checks fail when they
 		// should not)
 		if( m == 0 || n == 0 ) {
+#ifdef _DEBUG_REFERENCE_IO
+			std::cout << "\t delegating to unmasked matrix-to-matrix set, reference\n";
+#endif
 			// If the mask is empty, ignore it
 			return set< descr >( C, A, phase );
 		}

From 86b156bc51a986e1b3b693a3b8397439f64a09e8 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 17:10:13 +0200
Subject: [PATCH 23/39] Enable masked void-to-void matrix sets

---
 tests/unit/matrixSet.cpp | 171 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 165 insertions(+), 6 deletions(-)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index b1079a7db..7ab40f262 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -117,6 +117,83 @@ RC masked_tests_generic_impl(
 	return rc;
 }
 
+/** Specialisation for void output */
+template<
+	Descriptor descr = descriptors::no_operation,
+	typename Tmask, typename Tin
+>
+RC masked_tests_generic_impl(
+	RC &rc, grb::Matrix< void > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
+
+	std::cout << "\t\t with structural descriptor\n";
+	rc = grb::set< descr | descriptors::structural >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( grb::nnz( output ) != 2 * n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( "
+			<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+		rc = FAILED;
+	}
+	for( const auto &triplet : output ) {
+		if(
+			triplet.first != triplet.second &&
+			triplet.first != triplet.second - 1
+		) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first << ", "
+				<< triplet.second << " ), no value (pattern matrix).\n";
+			rc = FAILED;
+		}
+	}
+	if( rc != SUCCESS ) { return rc; }
+
+	std::cout << "\t\t without descriptor\n";
+	rc = grb::set< descr >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( emptyMask ) {
+		if( grb::nnz( output ) != 2 * n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+			rc = FAILED;
+		}
+	} else {
+		if( grb::nnz( output ) != n ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << n <<".\n";
+			rc = FAILED;
+		}
+	}
+	for( const auto &triplet : output ) {
+		if( emptyMask ) {
+			if( triplet.first != triplet.second &&
+				triplet.first != triplet.second - 1
+			) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first << ", "
+					<< triplet.second << " ), no value (pattern matrix).\n";
+				rc = FAILED;
+			}
+		} else {
+			if( triplet.first != triplet.second ) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first << ", "
+					<< triplet.second << " ), no value (pattern matrix).\n";
+				rc = FAILED;
+			}
+		}
+	}
+	if( rc != SUCCESS ) { return rc; }
+
+	// done
+	return rc;
+}
+
 /** Implementation of masked tests for non-void masks (nvm). */
 template<
 	Descriptor descr = descriptors::no_operation,
@@ -177,6 +254,60 @@ RC masked_tests_nvm_impl(
 	return rc;
 }
 
+/** Specialisation for void output */
+template<
+	Descriptor descr = descriptors::no_operation,
+	typename Tmask, typename Tin
+>
+RC masked_tests_nvm_impl(
+	RC &rc, grb::Matrix< void > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
+
+	std::cout << "\t\t with invert_mask descriptor\n";
+	rc = grb::set< descr | descriptors::invert_mask >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( emptyMask ) {
+		if( grb::nnz( output ) != 2 * n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+			rc = FAILED;
+		}
+	} else {
+		if( grb::nnz( output ) != n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << n - 1 <<".\n";
+			rc = FAILED;
+		}
+	}
+	for( const auto &triplet : output ) {
+		if( emptyMask ) {
+			if( triplet.first != triplet.second &&
+				triplet.first != triplet.second - 1
+			) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first << ", "
+					<< triplet.second << " ), no value (pattern matrix).\n";
+				rc = FAILED;
+			}
+		} else {
+			if( triplet.first != triplet.second - 1 ) {
+				std::cerr << "\t unexpected entry at ( " << triplet.first << ", "
+					<< triplet.second << " ), no value (pattern matrix).\n";
+				rc = FAILED;
+			}
+		}
+	}
+	if( rc != SUCCESS ) { return rc; }
+
+	// done
+	return rc;
+}
+
 /** Dispatch for generic masked tests */
 template< typename Tout, typename Tmask, typename Tin >
 RC masked_tests(
@@ -282,7 +413,7 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	rc = rc ? rc : grb::resize( C, 15 );
 	rc = rc ? rc : grb::resize( D, 15 );
 	rc = rc ? rc : grb::resize( E, 15 );
-	rc = rc ? rc : grb::resize(output, 2 * n - 1 );
+	rc = rc ? rc : grb::resize( output, 15 );
 	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
 		std::cerr << "\tinitialisation FAILED\n";
 		return;
@@ -338,12 +469,14 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 		std::cerr << "\t buildMatrixUnique of inputFloat matrix FAILED\n";
 		return;
 	}
-	try {
-		inputVoid = grb::algorithms::matrices< void >::identity( n );
-	} catch( ... ) {
-		std::cerr << "\t constructing inputVoid FAILED\n";
+	rc = grb::resize( inputVoid, 2 * n - 1 );
+	rc = rc ? rc : grb::resize( outputVoid, 2 * n - 1 );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t error resizing matrices for masked tests\n";
 		return;
 	}
+	// postpone materialisation of inputVoid since it relies on unmasked grb::set
+	// (which is itself unit-tested later)
 
 	std::cout << "\t test initialisation complete\n";
 
@@ -490,7 +623,15 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	}
 	if( rc != SUCCESS ) { return; }
 
-	//check masked matrix set
+	// check masked matrix set
+	// first, finish initialisation
+	rc = grb::set( inputVoid, input );
+	rc = rc ? rc : grb::resize( output, 2 * n - 1 );
+	if( rc != SUCCESS || grb::nnz( inputVoid ) != 2 * n - 1 ) {
+		std::cerr << "\t error in inputVoid (an earlier test likely failed)\n";
+		if( rc == SUCCESS ) { rc = FAILED; }
+		return;
+	}
 
 	std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, "
 		<< "empty mask\n";
@@ -525,6 +666,24 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	if( masked_tests( rc, output, maskBool, inputFloat, n ) != grb::SUCCESS ) {
 		return;
 	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), "
+		<< "empty mask\n";
+	if( masked_tests( rc, outputVoid, maskEmpty, inputVoid, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), "
+		<< "non-empty mask\n";
+	if( masked_tests( rc, outputVoid, mask, inputVoid, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), "
+		<< "non-empty Boolean mask\n";
+	if( masked_tests( rc, outputVoid, maskBool, inputVoid, n ) != grb::SUCCESS ) {
+		return;
+	}
 }
 
 int main( int argc, char ** argv ) {

From 7975e5626edaf2b5af2dd8c9b6be667d2b1e4bca Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 17:18:51 +0200
Subject: [PATCH 24/39] Bugfix

---
 include/graphblas/reference/io.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index f201daae4..6ca510598 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2050,7 +2050,8 @@ namespace grb {
 			"matrix"
 		);
 		static_assert(
-			std::is_convertible< InputType, OutputType >::value,
+			std::is_convertible< InputType, OutputType >::value ||
+				std::is_void< OutputType >::value,
 			"grb::set (masked set to matrix): input type cannot be "
 			"converted to output type"
 		);

From 25fdef4fc5eee0b5b0d1e75dbb02263a2cdb8f2c Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 23 Aug 2025 17:19:12 +0200
Subject: [PATCH 25/39] Complete unit test by testing casting from non-voids to
 void

---
 tests/unit/matrixSet.cpp | 41 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index 7ab40f262..305f6f10d 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -684,6 +684,47 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	if( masked_tests( rc, outputVoid, maskBool, inputVoid, n ) != grb::SUCCESS ) {
 		return;
 	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting "
+		<< "(sort of), empty mask\n";
+	if( masked_tests( rc, outputVoid, maskEmpty, input, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting "
+		<< "(sort of), non-empty mask\n";
+	if( masked_tests( rc, outputVoid, mask, input, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting "
+		<< "(sort of), Boolean mask\n";
+	if( masked_tests( rc, outputVoid, maskBool, input, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting "
+		<< "(sort of), empty mask\n";
+	if(
+		masked_tests( rc, outputVoid, maskEmpty, inputFloat, n ) !=
+			grb::SUCCESS
+	) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting "
+		<< "(sort of), non-empty mask\n";
+	if( masked_tests( rc, outputVoid, mask, inputFloat, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting "
+		<< "(sort of), Boolean mask\n";
+	if( masked_tests( rc, outputVoid, maskBool, inputFloat, n ) != grb::SUCCESS ) {
+		return;
+	}
+
+	// done
 }
 
 int main( int argc, char ** argv ) {

From 72da7fd91dc3969095d557e44a62851a9743f2be Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sun, 24 Aug 2025 10:54:23 +0200
Subject: [PATCH 26/39] Unit test uncovered bug in hyperdags implementation,
 herewith fixed

---
 include/graphblas/hyperdags/io.hpp | 93 ++++++++++++++++++++++++++----
 1 file changed, 81 insertions(+), 12 deletions(-)

diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp
index 4ac5b4635..99711f0bb 100644
--- a/include/graphblas/hyperdags/io.hpp
+++ b/include/graphblas/hyperdags/io.hpp
@@ -28,6 +28,10 @@
 
 #include <array>
 
+#ifdef _DEBUG
+ #define _DEBUG_HYPERDAGS_IO
+#endif
+
 
 namespace grb {
 
@@ -456,26 +460,91 @@ namespace grb {
 		const Matrix< InputType, hyperdags, RIT3, CIT3, NIT3 > &A,
 		const Phase &phase = EXECUTE
 	) {
+#ifdef _DEBUG_HYPERDAGS_IO
+		std::cout << "Entering set(matrix, matrix, matrix), hyperdags backend\n";
+#endif
+		// first, check for dynamic errors
+		const size_t m = nrows( C );
+		const size_t n = ncols( C );
+		if( m != nrows( A ) || n != ncols( A ) ) {
+#ifdef _DEBUG_HYPERDAGS_IO
+			std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (I)\n";
+#endif
+			return MISMATCH;
+		}
+		if( m != nrows( M ) && nrows( M ) != 0 ) {
+#ifdef _DEBUG_HYPERDAGS_IO
+			std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (II)\n";
+#endif
+			return MISMATCH;
+		}
+		if( n != ncols( M ) && ncols( M ) != 0 ) {
+#ifdef _DEBUG_HYPERDAGS_IO
+			std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (III)\n";
+#endif
+			return MISMATCH;
+		}
+		// second, check for trivial op
+		if( m == 0 || n == 0 ) {
+#ifdef _DEBUG_HYPERDAGS_IO
+			std::cerr << "\t WARNING set( matrix, matrix, matrix ), hyperdags: "
+				<< "trivial op detected (all containers empty). No operation will be "
+				<< "recorded\n";
+#endif
+			return SUCCESS;
+		}
+		// third, execute
+#ifdef _DEBUG_HYPERDAGS_IO
+		std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: forwarding to "
+			<< "execution backend\n";
+#endif
 		const RC ret = set< descr >(
 			internal::getMatrix( C ), internal::getMatrix( M ),
 			internal::getMatrix( A ), phase
 		);
+		// fourth, forward any errors
 		if( ret != SUCCESS ) { return ret; }
 		if( phase != EXECUTE ) { return ret; }
-		if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+		// fifth, record operation
+#ifdef _DEBUG_HYPERDAGS_IO
+		std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: execution had no "
+			<< "error; recording operation\n";
+#endif
 		std::array< const void *, 0 > sourcesP{};
-		std::array< uintptr_t, 3 > sourcesC{
-			getID( internal::getMatrix(A) ),
-			getID( internal::getMatrix(M) ),
-			getID( internal::getMatrix(C) )
-		};
 		std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
-		internal::hyperdags::generator.addOperation(
-			internal::hyperdags::SET_MATRIX_MATRIX_MASKED,
-			sourcesP.begin(), sourcesP.end(),
-			sourcesC.begin(), sourcesC.end(),
-			destinations.begin(), destinations.end()
-		);
+		if( nrows( M ) == 0 || ncols( M ) == 0 ) {
+#ifdef _DEBUG_HYPERDAGS_IO
+			std::cerr << "\t WARNING set( matrix, matrix, matrix ), hyperdags: "
+				<< "empty mask detected; hyperDAG will not (cannot) record it\n";
+#endif
+			std::array< uintptr_t, 2 > sourcesC{
+				getID( internal::getMatrix(A) ),
+				getID( internal::getMatrix(C) )
+			};
+			internal::hyperdags::generator.addOperation(
+				internal::hyperdags::SET_MATRIX_MATRIX_MASKED,
+				sourcesP.begin(), sourcesP.end(),
+				sourcesC.begin(), sourcesC.end(),
+				destinations.begin(), destinations.end()
+			);
+		} else {
+			std::array< uintptr_t, 3 > sourcesC{
+				getID( internal::getMatrix(A) ),
+				getID( internal::getMatrix(M) ),
+				getID( internal::getMatrix(C) )
+			};
+			internal::hyperdags::generator.addOperation(
+				internal::hyperdags::SET_MATRIX_MATRIX_MASKED,
+				sourcesP.begin(), sourcesP.end(),
+				sourcesC.begin(), sourcesC.end(),
+				destinations.begin(), destinations.end()
+			);
+		}
+
+		// done
+#ifdef _DEBUG_HYPERDAGS_IO
+		std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: exiting\n";
+#endif
 		return ret;
 	}
 

From bc63b593985e3d5d361f0695ccc88c3cd8e72b13 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sun, 24 Aug 2025 10:54:40 +0200
Subject: [PATCH 27/39] Also execute matrixSet unit test for larger matrices

---
 tests/unit/unittests.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 0b49d2421..6abb78e21 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -484,6 +484,12 @@ for MODE in ${MODES}; do
 				grep "Test OK" ${TEST_OUT_DIR}/matrixSet_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 				echo " "
 
+				echo ">>>      [x]           [ ]       Testing grb::set (matrices), size 10 000"
+				$runner ${TEST_BIN_DIR}/matrixSet_${MODE}_${BACKEND} 10000 2> ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log
+				head -1 ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log
+				grep "Test OK" ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
+				echo " "
+
 				echo ">>>      [x]           [ ]       Testing grb::set (matrix, value)"
 				$runner ${TEST_BIN_DIR}/setMatrixValue_${MODE}_${BACKEND} 2> ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.log
 				head -1 ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.log

From c5b11a11f9ef1aa0610263aa35a30e7e221fb0a6 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sun, 24 Aug 2025 13:14:19 +0200
Subject: [PATCH 28/39] BSP1D I/O implementation of
 grb::set(matrix,matrix,matrix). All unit tests pass.

---
 include/graphblas/bsp1d/io.hpp | 160 +++++++++++++++++++++++++++++++--
 1 file changed, 154 insertions(+), 6 deletions(-)

diff --git a/include/graphblas/bsp1d/io.hpp b/include/graphblas/bsp1d/io.hpp
index edada4191..cfe2844c6 100644
--- a/include/graphblas/bsp1d/io.hpp
+++ b/include/graphblas/bsp1d/io.hpp
@@ -32,7 +32,6 @@
  #include <omp.h>
 #endif
 
-#include "graphblas/blas1.hpp"                 // for grb::size
 #include "graphblas/nonzeroStorage.hpp"
 
 // the below transforms an std::vector iterator into an ALP/GraphBLAS-compatible
@@ -976,20 +975,166 @@ namespace grb {
 #ifdef _BSP1D_IO_DEBUG
 		std::cout << "Called grb::set( matrix, mask, value ) (BSP1D)\n";
 #endif
+		// catch trivial cases
 		const size_t m = nrows( C );
 		const size_t n = ncols( C );
+		if( m == 0 || n == 0 ) { return SUCCESS; }
 
-		// dynamic checks (I)
+		// dynamic checks
+		if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
+			return ILLEGAL;
+		}
 		if( m != nrows( mask ) || n != ncols( mask ) ) {
 			return MISMATCH;
 		}
 
-		// catch trivial case
+#ifdef _BSP1D_IO_DEBUG
+		std::cout << "\t delegating to final backend\n";
+#endif
+		RC ret = SUCCESS;
+		// Take care that local matrices may be empty, even if the global matrix is
+		// not. Processes with empty local matrices will not delegate (no-op).
+		{
+			auto &local_C = internal::getLocal( C );
+			const auto &local_mask = internal::getLocal( mask );
+			const size_t local_m = nrows( local_C );
+			const size_t local_n = ncols( local_C );
+			assert( local_m == nrows( local_mask ) );
+			assert( local_n == ncols( local_mask ) );
+			if( local_m > 0 && local_n > 0 ) {
+				ret = set< descr >( local_C, local_mask, val, phase );
+			}
+		}
+
+		// in the self-masked case, there is no way an error could occur
+		if( (descr & descriptors::structural) && getID( C ) == getID( mask ) ) {
+#ifdef _BSP1D_IO_DEBUG
+			std::cout << "\t structural self-masking detected, which allows trivial "
+				"exit\n"; // since the nnz nor capacity would never change
+#endif
+			assert( ret == SUCCESS );
+			return ret;
+		}
+
+		// in all other cases, in either mode (resize or execute), we must check for
+		// errors
+#ifdef _BSP1D_IO_DEBUG
+		std::cout << "\t all-reducing error code\n";
+#endif
+		if( collectives< BSP1D >::allreduce( ret, operators::any_or< RC >() )
+			!= SUCCESS
+		) {
+			return PANIC;
+		}
+
+#ifdef _BSP1D_IO_DEBUG
+		std::cout << "\t all-reduced error code is " << toString( ret ) << "\n";
+#endif
+		if( phase == RESIZE ) {
+			if( ret == SUCCESS ) {
+#ifdef _BSP1D_IO_DEBUG
+				std::cout << "\t resize phase detected -- synchronising capacity\n";
+#endif
+				ret = internal::updateCap( C );
+				if( ret != SUCCESS ) {
+					std::cerr << "Error updating capacity: " << toString( ret ) << "\n";
+				}
+			}
+		} else {
+			assert( phase == EXECUTE );
+			if( ret == SUCCESS ) {
+#ifdef _BSP1D_IO_DEBUG
+				std::cout << "\t execute phase detected -- synchronising nnz count\n";
+#endif
+				ret = internal::updateNnz( C );
+				if( ret != SUCCESS ) {
+					std::cerr << "Error updating output number of nonzeroes: "
+						<< toString( ret ) << "\n";
+				}
+			} else if( ret == ILLEGAL ) {
+#ifdef _BSP1D_IO_DEBUG
+				std::cout << "\t delegate returns ILLEGAL, clearing output\n";
+#endif
+				const RC clear_rc = clear( C );
+				if( clear_rc != SUCCESS ) {
+					ret = PANIC;
+				}
+			} else {
+				if( ret != PANIC ) {
+					std::cerr << "Warning: unexpected error code in grb::set( matrix, mask, "
+						<< "value ) (BSP1D). Please submit a bug report.\n";
+				}
+				assert( ret == PANIC );
+			}
+		}
+
+#ifdef _BSP1D_IO_DEBUG
+		std::cout << "\t done; returning " << toString( ret ) << "\n";
+#endif
+
+		// done
+		return ret;
+	}
+
+	/**
+	 * The implementation can trivially rely on the final backend, however, the
+	 * capacity or nonzero count of the output can in some cases differ. The below
+	 * implementation mostly deals with that logic.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		typename DataType, typename RIT1, typename CIT1, typename NIT1,
+		typename MaskType, typename RIT2, typename CIT2, typename NIT2,
+		typename ValueType = DataType, typename RIT3, typename CIT3, typename NIT3
+	>
+	RC set(
+		Matrix< DataType, BSP1D, RIT1, CIT1, NIT1 > &C,
+		const Matrix< MaskType, BSP1D, RIT2, CIT2, NIT2 > &mask,
+		const Matrix< ValueType, BSP1D, RIT3, CIT3, NIT3 > &A,
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			!grb::is_object< DataType >::value &&
+			!grb::is_object< ValueType >::value &&
+			!grb::is_object< MaskType >::value
+		>::type * const = nullptr
+	) noexcept {
+		// static checks
+		NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+				std::is_same< ValueType, DataType >::value
+			), "grb::set( matrix, mask, matrix ) (BSP1D)",
+			"called with non-matching value types"
+		);
+		NO_CAST_ASSERT(
+			( !(descr & descriptors::no_casting) ||
+				std::is_same< MaskType, bool >::value ),
+			"grb::set( matrix, mask, matrix ) (BSP1D)",
+			"called with non-Boolean mask value type"
+		);
+		static_assert( !( (descr & descriptors::structural) &&
+				(descr & descriptors::invert_mask)
+			), "grb::set( matrix, mask, matrix ) (BSP1D): Primitives with matrix "
+			"outputs may not employ structurally inverted masking"
+		);
+#ifdef _BSP1D_IO_DEBUG
+		std::cout << "Called grb::set( matrix, mask, matrix ) (BSP1D)\n";
+#endif
+		const size_t m = nrows( C );
+		const size_t n = ncols( C );
+
+		// dynamic checks (I)
+		if( m != nrows( A ) || n != ncols( A ) ) {
+			return MISMATCH;
+		}
+
+		// catch trivial cases
 		if( m == 0 || n == 0 ) { return SUCCESS; }
+		if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
+			return set< descr >( C, A, phase );
+		}
 
 		// dynamic checks (II)
-		if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
-			return ILLEGAL;
+		if( m != nrows( mask ) || n != ncols( mask ) ) {
+			return MISMATCH;
 		}
 
 #ifdef _BSP1D_IO_DEBUG
@@ -1000,13 +1145,16 @@ namespace grb {
 		// not. Processes with empty local matrices will not delegate (no-op).
 		{
 			auto &local_C = internal::getLocal( C );
+			const auto &local_A = internal::getLocal( A );
 			const auto &local_mask = internal::getLocal( mask );
 			const size_t local_m = nrows( local_C );
 			const size_t local_n = ncols( local_C );
 			assert( local_m == nrows( local_mask ) );
 			assert( local_n == ncols( local_mask ) );
+			assert( local_m == nrows( local_A ) );
+			assert( local_n == ncols( local_A ) );
 			if( local_m > 0 && local_n > 0 ) {
-				ret = set< descr >( local_C, local_mask, val, phase );
+				ret = set< descr >( local_C, local_mask, local_A, phase );
 			}
 		}
 

From 0955f9f34f9a2255baf390929c4125bc16b9239d Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Mon, 22 Sep 2025 03:25:48 +0200
Subject: [PATCH 29/39] Remove redundant template info from constructor

---
 include/graphblas/reference/compressed_storage.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/compressed_storage.hpp b/include/graphblas/reference/compressed_storage.hpp
index 95b67a903..2d40d18cc 100644
--- a/include/graphblas/reference/compressed_storage.hpp
+++ b/include/graphblas/reference/compressed_storage.hpp
@@ -1156,7 +1156,7 @@ namespace grb {
 				{}
 
 				/** Move constructor. */
-				Compressed_Storage< void, IND, SIZE >( SelfType &&other ) {
+				Compressed_Storage( SelfType &&other ) {
 					moveFromOther( other );
 				}
 

From c69848914bcd7bbcabd8226e82034d201f597c14 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Mon, 22 Sep 2025 03:37:03 +0200
Subject: [PATCH 30/39] Hotfix missing include

---
 include/graphblas/bsp1d/init.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/graphblas/bsp1d/init.hpp b/include/graphblas/bsp1d/init.hpp
index 582479bf1..d5798d894 100644
--- a/include/graphblas/bsp1d/init.hpp
+++ b/include/graphblas/bsp1d/init.hpp
@@ -34,6 +34,7 @@
 #include <graphblas/utils/DMapper.hpp>
 #include <graphblas/utils/threadLocalStorage.hpp>
 
+#include <cstdint>  //uintptr_t
 #include <assert.h> //assertions
 
 #include <lpf/core.h>

From 182c79e618c14869c7cce72fd28dd6fec56bb583 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Mon, 22 Sep 2025 03:41:34 +0200
Subject: [PATCH 31/39] Hotfix bug in matrixFileReader

---
 include/graphblas/utils/parser/matrixFileReaderBase.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/utils/parser/matrixFileReaderBase.hpp b/include/graphblas/utils/parser/matrixFileReaderBase.hpp
index c5cdf8b7a..15aa3e4f8 100644
--- a/include/graphblas/utils/parser/matrixFileReaderBase.hpp
+++ b/include/graphblas/utils/parser/matrixFileReaderBase.hpp
@@ -280,7 +280,7 @@ namespace grb {
 						properties._nz = nz;
 						properties._entries = entries;
 						properties._pattern = pattern;
-						properties._symmetric = symmetric;
+						properties._symmetric = symmetric ? Symmetric : General;
 						properties._direct = direct;
 						properties._symmetricmap = symmetricmap;
 						// check for existance of file

From 972069b707a0c750afe3191c546e91fe978ce79d Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Mon, 22 Sep 2025 03:55:46 +0200
Subject: [PATCH 32/39] Hotfix bug in BSP1D I/O

---
 include/graphblas/bsp1d/io.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/bsp1d/io.hpp b/include/graphblas/bsp1d/io.hpp
index cfe2844c6..b8181041a 100644
--- a/include/graphblas/bsp1d/io.hpp
+++ b/include/graphblas/bsp1d/io.hpp
@@ -1773,8 +1773,9 @@ namespace grb {
 			// a pipeline depth of 2 is sufficient.
 			constexpr size_t iteration_overlaps = 2;
 
-			const std::unique_ptr< size_t > first_nnz_per_thread(
-				new size_t[ num_threads * iteration_overlaps ]()
+			const std::unique_ptr< size_t, void(*)(size_t * const) > first_nnz_per_thread(
+				new size_t[ num_threads * iteration_overlaps ](),
+				[](size_t * const array){ delete [] array; }
 			);
 			size_t * const first_nnz_per_thread_ptr = first_nnz_per_thread.get();
 			outgoing.resize( data.P );

From 66a19743091bf24d2c5599281ebd6d50a584357b Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Mon, 22 Sep 2025 03:56:40 +0200
Subject: [PATCH 33/39] Refactor SPA buffer management from blas3/mxm to
 matrix.hpp. This prepares for its use with grb::set(matrix,matrix,matrix)

---
 include/graphblas/reference/blas3.hpp  | 432 +------------------------
 include/graphblas/reference/matrix.hpp | 425 ++++++++++++++++++++++++
 2 files changed, 428 insertions(+), 429 deletions(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 054d945dd..2a37f51a8 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -579,432 +579,6 @@ namespace grb {
 			return SUCCESS;
 		}
 
-#ifndef _H_GRB_REFERENCE_OMP_BLAS3
-		/**
-		 * Meta-data for global buffer management for use with #grb::mxm.
-		 *
-		 * This meta-data is the same for both the sequential (reference) and shared-
-		 * memory parallel (reference_omp) backends.
-		 *
-		 * This class contains all meta-data necessary to interpret the global buffer
-		 * as an array of sparse accumulators (SPAs). The length of the array is given
-		 * by a call to #threads(), minus one. It is called that since a call to
-		 * #threads() retrieves how many threads can be used to process the call to
-		 * #grb::mxm.
-		 *
-		 * Each SPA has the layout (bitarray, stack, valueArray). These are packed in a
-		 * padded byte array, such that each bit array, stack, and value array is
-		 * aligned on sizeof(int) bytes.
-		 *
-		 * @tparam NIT       The nonzero index type.
-		 * @tparam ValueType The output matrix value type.
-		 */
-		template< typename NIT, typename ValueType >
-		class MXM_BufferMetaData {
-
-			static_assert( sizeof(NIT) % sizeof(int) == 0, "Unsupported type for NIT; "
-				"please submit a bug report!" );
-
-			private:
-
-				/** The size of the offset array */
-				size_t m;
-
-				/** The size of the SPA */
-				size_t n;
-
-				/** The number of threads supported during a call to #grb::mxm */
-				size_t nthreads;
-
-				/** The initial buffer offset */
-				size_t bufferOffset;
-
-				/** The size of a single SPA, including bytes needed for padding */
-				size_t paddedSPASize;
-
-				/** The number of bytes to pad the SPA array with */
-				size_t arrayShift;
-
-				/** The number of bytes to pad the SPA stack with */
-				size_t stackShift;
-
-				/**
-				 * Given a number of used bytes of the buffer, calculate the available
-				 * remainder buffer and return it.
-				 *
-				 * @param[in]  osize     The size of the buffer (in bytes) that is already
-				 *                       in use.
-				 * @param[out] remainder Pointer to any remainder buffer.
-				 * @param[out] rsize     The size of the remainder buffer.
-				 *
-				 * If no buffer space is left, \a remainder will be set to <tt>nullptr</tt>
-				 * and \a size to <tt>0</tt>.
-				 */
-				void retrieveRemainderBuffer(
-					const size_t osize,
-					void * &remainder, size_t &rsize
-				) const noexcept {
-					const size_t size = internal::template getCurrentBufferSize< char >();
-					char * rem = internal::template getReferenceBuffer< char >( size );
-					size_t rsize_calc = size - osize;
-					rem += osize;
-					const size_t mod = reinterpret_cast< uintptr_t >(rem) % sizeof(int);
-					if( mod ) {
-						const size_t shift = sizeof(int) - mod;
-						if( rsize_calc >= shift ) {
-							rsize_calc -= shift;
-							rem += rsize;
-						} else {
-							rsize_calc = 0;
-							rem = nullptr;
-						}
-					}
-					assert( !(reinterpret_cast< uintptr_t >(rem) % sizeof(int)) );
-					// write out
-					remainder = rem;
-					rsize = rsize_calc;
-				}
-
-
-			public:
-
-				/**
-				 * Base constructor.
-				 *
-				 * @param[in] _m          The length of the offset array.
-				 * @param[in] _n          The length of the SPA.
-				 * @param[in] max_threads The maximum number of threads.
-				 *
-				 * \note \a max_threads is a separate input since there might be a need to
-				 *       cap the maximum number of threads used based on some analytic
-				 *       performance model. Rather than putting such a performance model
-				 *       within this class, we make it an obligatory input parameter
-				 *       instead.
-				 *
-				 * \note It is always valid to pass <tt>config::OMP::threads()</tt>.
-				 *
-				 * \note This class \em will, however, cap the number of threads returned
-				 *       to \a _n.
-				 */
-				MXM_BufferMetaData(
-					const size_t _m, const size_t _n,
-					const size_t max_threads
-				) : m( _m ), n( _n ), arrayShift( 0 ), stackShift( 0 ) {
- #ifdef _DEBUG_REFERENCE_BLAS3
-					#pragma omp critical
-					std::cout << "\t\t\t computing padded buffer size for a SPA of length "
-						<< n << " while leaving space for an additional offset buffer of length "
-						<< std::max( m, n ) << "...\n";
- #endif
-					// compute bufferOffset
-					bufferOffset = (std::max( m, n ) + 1) * sizeof( NIT );
-
-					// compute value buffer size
-					const size_t valBufSize = n * sizeof( ValueType );
-
- #ifdef _DEBUG_REFERENCE_BLAS3
-					std::cout << "\t\t\t\t bit-array size has byte-size " <<
-						internal::Coordinates< reference >::arraySize( n ) << "\n";
-					std::cout << "\t\t\t\t stack has byte-size " <<
-						internal::Coordinates< reference >::stackSize( n ) << "\n";
-					std::cout << "\t\t\t\t value buffer has byte-size " << valBufSize << "\n";
- #endif
-
-					// compute paddedSPASize
-					paddedSPASize =
-						internal::Coordinates< reference >::arraySize( n ) +
-						internal::Coordinates< reference >::stackSize( n ) +
-						valBufSize;
-					size_t shift =
-						internal::Coordinates< reference >::arraySize( n ) % sizeof(int);
-					if( shift != 0 ) {
-						arrayShift = sizeof(int) - shift;
-						paddedSPASize += arrayShift;
-					}
-					shift = internal::Coordinates< reference >::stackSize( n ) % sizeof(int);
-					if( shift != 0 ) {
-						stackShift = sizeof(int) - shift;
-						paddedSPASize += stackShift;
-					}
-					shift = valBufSize % sizeof(int);
-					if( shift != 0 ) {
-						paddedSPASize += (sizeof(int) - shift);
-					}
-
-					// pad bufferOffset
-					shift = bufferOffset % sizeof(int);
-					if( shift != 0 ) {
-						bufferOffset += (sizeof(int) - shift);
-					}
-
-					// compute free buffer size
-					const size_t freeBufferSize = internal::getCurrentBufferSize< char >() -
-						bufferOffset;
-
-					// compute max number of threads
-					nthreads = 1 + freeBufferSize / paddedSPASize;
- #ifdef _DEBUG_REFERENCE_BLAS3
-					#pragma omp critical
-					std::cout << "\t\t\t free buffer size: " << freeBufferSize
-						<< ", (padded) SPA size: " << paddedSPASize
-						<< " -> supported #threads: " << nthreads << ". "
-						<< " The shifts for the bit-array and the stack are " << arrayShift
-						<< ", respectively, " << stackShift << "."
-						<< "\n";
- #endif
-					// cap the final number of selected threads
-					if( nthreads > max_threads ) {
-						nthreads = max_threads;
-					}
-					if( nthreads > n ) {
-						nthreads = n;
-					}
-				}
-
-				/** @returns The maximum number of supported threads during #grb::mxm */
-				size_t threads() const noexcept {
-					return nthreads;
-				}
-
-				/**
-				 * Requests and returns a global buffer required for a thread-local SPA.
-				 *
-				 * @param[in] t The thread ID. Must be larger than 0.
-				 *
-				 * \note Thread 0 employs the SPA allocated with the output matrix.
-				 *
-				 * @returns Pointer into the global buffer starting at the area reserved for
-				 *          the SPA of thread \a t.
-				 */
-				char * getSPABuffers( size_t t ) const noexcept {
-					assert( t > 0 );
-					(void) --t;
-					char * raw = internal::template getReferenceBuffer< char >(
-						bufferOffset + nthreads * paddedSPASize );
-					assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
-					raw += bufferOffset;
-					assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
-					raw += t * paddedSPASize;
-					return raw;
-				}
-
-				/**
-				 * Retrieves the column offset buffer.
-				 *
-				 * @param[out] remainder Returns any remainder buffer beyond that of the row
-				 *                       offset buffer.
-				 * @param[out] rsize     The remainder buffer size \a remainder points to.
-				 *
-				 * If \a remainder is not a <tt>nullptr</tt> then neither should \a rsize,
-				 * and vice versa.
-				 *
-				 * Retrieving any remainder buffer is optional. The default is to not ask
-				 * for them.
-				 *
-				 * \warning If all buffer memory is used for the column offsets, it may be
-				 *          that \a remainder equals <tt>nullptr</tt> and <tt>rsize</tt>
-				 *          zero.
-				 *
-				 * \warning This buffer is only guaranteed exclusive if only the retrieved
-				 *          column buffer is used. In particular, if also requesting (and
-				 *          using) SPA buffers, the remainder buffer area is shared with
-				 *          those SPA buffers, and data races are likely to occur. In other
-				 *          words: be very careful with any use of these remainder buffers.
-				 *
-				 * @returns The column offset buffer.
-				 *
-				 * \warning This buffer overlaps with the CRS offset buffer. The caller
-				 *          must ensure to only ever use one at a time.
-				 */
-				NIT * getColOffsetBuffer(
-					void * * const remainder = nullptr,
-					size_t * const rsize = nullptr
-				) const noexcept {
-					NIT * const ret = internal::template getReferenceBuffer< NIT >( n + 1 );
-					if( remainder != nullptr || rsize != nullptr ) {
-						assert( remainder != nullptr && rsize != nullptr );
-						retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize );
-					}
-					return ret;
-				}
-
-				/**
-				 * Retrieves the row offset buffer.
-				 *
-				 * @param[out] remainder Returns any remainder buffer beyond that of the row
-				 *                       offset buffer.
-				 * @param[out] rsize     The remainder buffer size \a remainder points to.
-				 *
-				 * If \a remainder is not a <tt>nullptr</tt> then neither should \a rsize,
-				 * and vice versa.
-				 *
-				 * Retrieving any remainder buffer is optional. The default is to not ask
-				 * for them.
-				 *
-				 * \warning If all buffer memory is used for the row offsets, it may be that
-				 *          \a remainder equals <tt>nullptr</tt> and <tt>rsize</tt> zero.
-				 *
-				 * \warning This buffer is only guaranteed exclusive if only the retrieved
-				 *          row buffer is used. In particular, if also requesting (and
-				 *          using) SPA buffers, the remainder buffer area is shared with
-				 *          those SPA buffers, and data races are likely to occur. In other
-				 *          words: be very careful with any use of these remainder buffers.
-				 *
-				 * @returns The row offset buffer.
-				 *
-				 * \warning This buffer overlaps with the CCS offset buffer. The caller
-				 *          must ensure to only ever use one at a time.
-				 */
-				NIT * getRowOffsetBuffer(
-					void * * const remainder = nullptr,
-					size_t * const rsize = nullptr
-				) const noexcept {
-					NIT * const ret = internal::template getReferenceBuffer< NIT >( m + 1 );
-					if( remainder != nullptr || rsize != nullptr ) {
-						assert( remainder != nullptr && rsize != nullptr );
-						retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize );
-					}
-					return ret;
-				}
-
-				/**
-				 * Shifts a pointer into the global buffer by the bit-array size and its
-				 * padding.
-				 *
-				 * @param[in,out] raw On input: an aligned pointer into the global buffer.
-				 *                    On output: an aligned pointer past the bit-array
-				 *                    position.
-				 */
-				void applyArrayShift( char * &raw ) const noexcept {
-					const size_t totalShift =
-						internal::Coordinates< reference >::arraySize( n ) +
-						arrayShift;
- #ifdef _DEBUG_REFERENCE_BLAS3
-					std::cout << "\t\t\t shifting input pointer with "
-						<< internal::Coordinates< reference >::arraySize( n ) << " + "
-						<< arrayShift << " = " << totalShift << "bytes \n";
- #endif
-					raw += totalShift;
-				}
-
-				/**
-				 * Shifts a pointer into the global buffer by the stack size and its
-				 * padding.
-				 *
-				 * @param[in,out] raw On input: an aligned pointer into the global buffer.
-				 *                    On output: an aligned pointer past the stack position.
-				 */
-				void applyStackShift( char * &raw ) const noexcept {
-					const size_t totalShift =
-						internal::Coordinates< reference >::stackSize( n ) +
-						stackShift;
- #ifdef _DEBUG_REFERENCE_BLAS3
-					std::cout << "\t\t\t shifting input pointer with "
-						<< internal::Coordinates< reference >::arraySize( n ) << " + "
-						<< stackShift << " = " << totalShift << "bytes \n";
- #endif
-					raw += totalShift;
-				}
-
-		};
-#endif
-
-		/**
-		 * Retrieves the SPA buffers for the calling thread.
-		 *
-		 * \warning This function must be called from within an OpenMP parallel
-		 *          section.
-		 *
-		 * @param[out]    arr Where the bit-array may be located.
-		 * @param[out]    buf Where the stack may be located.
-		 * @param[out] valbuf Where the value buffer may be located.
-		 *
-		 * All above pointers are aligned on sizeof(int) bytes.
-		 *
-		 * @param[in] md Meta-data for global buffer management.
-		 * @param[in]  C The output matrix.
-		 *
-		 * One thread uses the buffers pre-allocated with the matrix \a C, thus
-		 * ensuring at least one thread may perform the #grb::mxm. Any remainder
-		 * threads can only help process the #grb::mxm if there is enough global
-		 * buffer memory available.
-		 *
-		 *
-		 * \note The global memory has size \f$ \Omega( \mathit{nz} ) \f$, which may
-		 *       be several factors (or even asymptotically greater than)
-		 *       \f$ \max\{ m, n \} \f$.
-		 *
-		 * \note In case the application stores multiple matrices, the global buffer
-		 *       may additionally be greater than the above note indicates if at least
-		 *       one of the other matrices is significantly (or asymptotically) larger
-		 *       than the one involved with the #grb::mxm.
-		 */
-		template<
-			typename OutputType,
-			typename RIT, typename CIT, typename NIT
-		>
-		void mxm_ompPar_getSPABuffers(
-			char * &arr, char * &buf, OutputType * &valbuf,
-			const struct MXM_BufferMetaData< NIT, OutputType > &md,
-			Matrix< OutputType, reference, RIT, CIT, NIT > &C
-		) {
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-			// other threads use the global buffer to create additional SPAs
-			{
-				const size_t t = config::OMP::current_thread_ID();
- #ifndef NDEBUG
-				const size_t T = config::OMP::current_threads();
-				assert( t < T );
- #endif
-				if( t > 0 ) {
- #ifdef _DEBUG_REFERENCE_BLAS3
-					#pragma omp critical
-					std::cout << "\t Thread " << t << " gets buffers from global buffer\n";
- #endif
-					char * rawBuffer = md.getSPABuffers( t );
-					assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
-					arr = rawBuffer;
- #ifdef _DEBUG_REFERENCE_BLAS3
-					#pragma omp critical
- #endif
-					md.applyArrayShift( rawBuffer );
-					assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
-					buf = rawBuffer;
- #ifdef _DEBUG_REFERENCE_BLAS3
-					#pragma omp critical
- #endif
-					md.applyStackShift( rawBuffer );
-					assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
-					assert( buf != arr );
-					valbuf = reinterpret_cast< OutputType * >(rawBuffer);
-					assert( static_cast< void * >(valbuf) != static_cast< void * >(buf) );
-				} else {
- #ifdef _DEBUG_REFERENCE_BLAS3
-					#pragma omp critical
-					std::cout << "\t Thread " << t << " gets buffers from matrix storage\n";
- #endif
-					// one thread uses the standard matrix buffer
-					internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
-				}
- #ifdef _DEBUG_REFERENCE_BLAS3
-				#pragma omp critical
-				{
-					std::cout << "\t Thread " << t << " has SPA array @ "
-						<< static_cast< void * >( arr ) << " and SPA stack @ "
-						<< static_cast< void * >( buf ) << " and SPA values @ "
-						<< static_cast< void * >( valbuf ) << "\n";
-				}
- #endif
-			}
-#else
- #ifdef _DEBUG_REFERENCE_BLAS3
-			std::cout << "\t Reference backend gets buffers from global buffer\n";
- #endif
-			internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
-			(void) md;
-#endif
-		}
-
 		/**
 		 * Given a computed new row_start array, moves the old index and value arrays
 		 * to new offsets. This leaves precisely enough space for the mxm algorithm to
@@ -1234,7 +808,7 @@ namespace grb {
 #endif
 			}
 
-			MXM_BufferMetaData< NIT, OutputType > bufferMD( m, n, max_threads );
+			SPA_BufferMetaData< NIT, OutputType > bufferMD( m, n, max_threads );
 
 #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 			// derive number of threads
@@ -1255,7 +829,7 @@ namespace grb {
 					char * arr = nullptr;
 					char * buf = nullptr;
 					OutputType * valbuf = nullptr;
-					mxm_ompPar_getSPABuffers( arr, buf, valbuf, bufferMD, C );
+					spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C );
 
 					// do count
 					size_t local_nzc;
@@ -1296,7 +870,7 @@ namespace grb {
 				char * arr = nullptr;
 				char * buf = nullptr;
 				OutputType * valbuf = nullptr;
-				mxm_ompPar_getSPABuffers( arr, buf, valbuf, bufferMD, C );
+				spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C );
 #ifdef _DEBUG_REFERENCE_BLAS3
  #ifdef _H_GRB_REFERENCE_OMP_BLAS3
 				#pragma omp critical
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index c1b3a365d..9f7fa96c7 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -2364,7 +2364,432 @@ namespace grb {
 			);
 			return ret;
 		}
+
+		/**
+		 * Meta-data for global buffer management for use with #grb::mxm and the
+		 * matrix-matrix-matrix variant of #grb::set.
+		 *
+		 * This class contains all meta-data necessary to interpret the global buffer
+		 * as an array of sparse accumulators (SPAs). The length of the array is given
+		 * by a call to #threads(), minus one. It is called that since a call to
+		 * #threads() retrieves how many threads can be used to process the call to
+		 * #grb::mxm.
+		 *
+		 * Each SPA has the layout (bitarray, stack, valueArray). These are packed in a
+		 * padded byte array, such that each bit array, stack, and value array is
+		 * aligned on sizeof(int) bytes.
+		 *
+		 * @tparam NIT       The nonzero index type.
+		 * @tparam ValueType The output matrix value type.
+		 *
+		 * This meta-data class applies to both the sequential (reference) and shared-
+		 * memory parallel (reference_omp) backends.
+		 */
+		template< typename NIT, typename ValueType >
+		class SPA_BufferMetaData {
+
+			static_assert( sizeof(NIT) % sizeof(int) == 0, "Unsupported type for NIT; "
+				"please submit a bug report!" );
+
+			private:
+
+				/** The size of the offset array */
+				size_t m;
+
+				/** The size of the SPA */
+				size_t n;
+
+				/** The number of threads supported during a call to #grb::mxm */
+				size_t nthreads;
+
+				/** The initial buffer offset */
+				size_t bufferOffset;
+
+				/** The size of a single SPA, including bytes needed for padding */
+				size_t paddedSPASize;
+
+				/** The number of bytes to pad the SPA array with */
+				size_t arrayShift;
+
+				/** The number of bytes to pad the SPA stack with */
+				size_t stackShift;
+
+				/**
+				 * Given a number of used bytes of the buffer, calculate the available
+				 * remainder buffer and return it.
+				 *
+				 * @param[in]  osize     The size of the buffer (in bytes) that is already
+				 *                       in use.
+				 * @param[out] remainder Pointer to any remainder buffer.
+				 * @param[out] rsize     The size of the remainder buffer.
+				 *
+				 * If no buffer space is left, \a remainder will be set to <tt>nullptr</tt>
+				 * and \a size to <tt>0</tt>.
+				 */
+				void retrieveRemainderBuffer(
+					const size_t osize,
+					void * &remainder, size_t &rsize
+				) const noexcept {
+					const size_t size = internal::template getCurrentBufferSize< char >();
+					char * rem = internal::template getReferenceBuffer< char >( size );
+					size_t rsize_calc = size - osize;
+					rem += osize;
+					const size_t mod = reinterpret_cast< uintptr_t >(rem) % sizeof(int);
+					if( mod ) {
+						const size_t shift = sizeof(int) - mod;
+						if( rsize_calc >= shift ) {
+							rsize_calc -= shift;
+							rem += rsize;
+						} else {
+							rsize_calc = 0;
+							rem = nullptr;
+						}
+					}
+					assert( !(reinterpret_cast< uintptr_t >(rem) % sizeof(int)) );
+					// write out
+					remainder = rem;
+					rsize = rsize_calc;
+				}
+
+
+			public:
+
+				/**
+				 * Base constructor.
+				 *
+				 * @param[in] _m          The length of the offset array.
+				 * @param[in] _n          The length of the SPA.
+				 * @param[in] max_threads The maximum number of threads.
+				 *
+				 * \note \a max_threads is a separate input since there might be a need to
+				 *       cap the maximum number of threads used based on some analytic
+				 *       performance model. Rather than putting such a performance model
+				 *       within this class, we make it an obligatory input parameter
+				 *       instead.
+				 *
+				 * \note It is always valid to pass <tt>config::OMP::threads()</tt>.
+				 *
+				 * \note This class \em will, however, cap the number of threads returned
+				 *       to \a _n.
+				 */
+				SPA_BufferMetaData(
+					const size_t _m, const size_t _n,
+					const size_t max_threads
+				) : m( _m ), n( _n ), arrayShift( 0 ), stackShift( 0 ) {
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					#pragma omp critical
+					std::cout << "\t\t\t computing padded buffer size for a SPA of length "
+						<< n << " while leaving space for an additional offset buffer of length "
+						<< std::max( m, n ) << "...\n";
+ #endif
+					// compute bufferOffset
+					bufferOffset = (std::max( m, n ) + 1) * sizeof( NIT );
+
+					// compute value buffer size
+					const size_t valBufSize = n * sizeof( ValueType );
+
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					std::cout << "\t\t\t\t bit-array size has byte-size " <<
+						internal::Coordinates< reference >::arraySize( n ) << "\n";
+					std::cout << "\t\t\t\t stack has byte-size " <<
+						internal::Coordinates< reference >::stackSize( n ) << "\n";
+					std::cout << "\t\t\t\t value buffer has byte-size " << valBufSize << "\n";
+ #endif
+
+					// compute paddedSPASize
+					paddedSPASize =
+						internal::Coordinates< reference >::arraySize( n ) +
+						internal::Coordinates< reference >::stackSize( n ) +
+						valBufSize;
+					size_t shift =
+						internal::Coordinates< reference >::arraySize( n ) % sizeof(int);
+					if( shift != 0 ) {
+						arrayShift = sizeof(int) - shift;
+						paddedSPASize += arrayShift;
+					}
+					shift = internal::Coordinates< reference >::stackSize( n ) % sizeof(int);
+					if( shift != 0 ) {
+						stackShift = sizeof(int) - shift;
+						paddedSPASize += stackShift;
+					}
+					shift = valBufSize % sizeof(int);
+					if( shift != 0 ) {
+						paddedSPASize += (sizeof(int) - shift);
+					}
+
+					// pad bufferOffset
+					shift = bufferOffset % sizeof(int);
+					if( shift != 0 ) {
+						bufferOffset += (sizeof(int) - shift);
+					}
+
+					// compute free buffer size
+					const size_t freeBufferSize = internal::getCurrentBufferSize< char >() -
+						bufferOffset;
+
+					// compute max number of threads
+					nthreads = 1 + freeBufferSize / paddedSPASize;
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					#pragma omp critical
+					std::cout << "\t\t\t free buffer size: " << freeBufferSize
+						<< ", (padded) SPA size: " << paddedSPASize
+						<< " -> supported #threads: " << nthreads << ". "
+						<< " The shifts for the bit-array and the stack are " << arrayShift
+						<< ", respectively, " << stackShift << "."
+						<< "\n";
+ #endif
+					// cap the final number of selected threads
+					if( nthreads > max_threads ) {
+						nthreads = max_threads;
+					}
+					if( nthreads > n ) {
+						nthreads = n;
+					}
+				}
+
+				/** @returns The maximum number of supported threads during #grb::mxm */
+				size_t threads() const noexcept {
+					return nthreads;
+				}
+
+				/**
+				 * Requests and returns a global buffer required for a thread-local SPA.
+				 *
+				 * @param[in] t The thread ID. Must be larger than 0.
+				 *
+				 * \note Thread 0 employs the SPA allocated with the output matrix.
+				 *
+				 * @returns Pointer into the global buffer starting at the area reserved for
+				 *          the SPA of thread \a t.
+				 */
+				char * getSPABuffers( size_t t ) const noexcept {
+					assert( t > 0 );
+					(void) --t;
+					char * raw = internal::template getReferenceBuffer< char >(
+						bufferOffset + nthreads * paddedSPASize );
+					assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
+					raw += bufferOffset;
+					assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
+					raw += t * paddedSPASize;
+					return raw;
+				}
+
+				/**
+				 * Retrieves the column offset buffer.
+				 *
+				 * @param[out] remainder Returns any remainder buffer beyond that of the row
+				 *                       offset buffer.
+				 * @param[out] rsize     The remainder buffer size \a remainder points to.
+				 *
+				 * If \a remainder is not a <tt>nullptr</tt> then neither should \a rsize,
+				 * and vice versa.
+				 *
+				 * Retrieving any remainder buffer is optional. The default is to not ask
+				 * for them.
+				 *
+				 * \warning If all buffer memory is used for the column offsets, it may be
+				 *          that \a remainder equals <tt>nullptr</tt> and <tt>rsize</tt>
+				 *          zero.
+				 *
+				 * \warning This buffer is only guaranteed exclusive if only the retrieved
+				 *          column buffer is used. In particular, if also requesting (and
+				 *          using) SPA buffers, the remainder buffer area is shared with
+				 *          those SPA buffers, and data races are likely to occur. In other
+				 *          words: be very careful with any use of these remainder buffers.
+				 *
+				 * @returns The column offset buffer.
+				 *
+				 * \warning This buffer overlaps with the CRS offset buffer. The caller
+				 *          must ensure to only ever use one at a time.
+				 */
+				NIT * getColOffsetBuffer(
+					void * * const remainder = nullptr,
+					size_t * const rsize = nullptr
+				) const noexcept {
+					NIT * const ret = internal::template getReferenceBuffer< NIT >( n + 1 );
+					if( remainder != nullptr || rsize != nullptr ) {
+						assert( remainder != nullptr && rsize != nullptr );
+						retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize );
+					}
+					return ret;
+				}
+
+				/**
+				 * Retrieves the row offset buffer.
+				 *
+				 * @param[out] remainder Returns any remainder buffer beyond that of the row
+				 *                       offset buffer.
+				 * @param[out] rsize     The remainder buffer size \a remainder points to.
+				 *
+				 * If \a remainder is not a <tt>nullptr</tt> then neither should \a rsize,
+				 * and vice versa.
+				 *
+				 * Retrieving any remainder buffer is optional. The default is to not ask
+				 * for them.
+				 *
+				 * \warning If all buffer memory is used for the row offsets, it may be that
+				 *          \a remainder equals <tt>nullptr</tt> and <tt>rsize</tt> zero.
+				 *
+				 * \warning This buffer is only guaranteed exclusive if only the retrieved
+				 *          row buffer is used. In particular, if also requesting (and
+				 *          using) SPA buffers, the remainder buffer area is shared with
+				 *          those SPA buffers, and data races are likely to occur. In other
+				 *          words: be very careful with any use of these remainder buffers.
+				 *
+				 * @returns The row offset buffer.
+				 *
+				 * \warning This buffer overlaps with the CCS offset buffer. The caller
+				 *          must ensure to only ever use one at a time.
+				 */
+				NIT * getRowOffsetBuffer(
+					void * * const remainder = nullptr,
+					size_t * const rsize = nullptr
+				) const noexcept {
+					NIT * const ret = internal::template getReferenceBuffer< NIT >( m + 1 );
+					if( remainder != nullptr || rsize != nullptr ) {
+						assert( remainder != nullptr && rsize != nullptr );
+						retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize );
+					}
+					return ret;
+				}
+
+				/**
+				 * Shifts a pointer into the global buffer by the bit-array size and its
+				 * padding.
+				 *
+				 * @param[in,out] raw On input: an aligned pointer into the global buffer.
+				 *                    On output: an aligned pointer past the bit-array
+				 *                    position.
+				 */
+				void applyArrayShift( char * &raw ) const noexcept {
+					const size_t totalShift =
+						internal::Coordinates< reference >::arraySize( n ) +
+						arrayShift;
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					std::cout << "\t\t\t shifting input pointer with "
+						<< internal::Coordinates< reference >::arraySize( n ) << " + "
+						<< arrayShift << " = " << totalShift << "bytes \n";
+ #endif
+					raw += totalShift;
+				}
+
+				/**
+				 * Shifts a pointer into the global buffer by the stack size and its
+				 * padding.
+				 *
+				 * @param[in,out] raw On input: an aligned pointer into the global buffer.
+				 *                    On output: an aligned pointer past the stack position.
+				 */
+				void applyStackShift( char * &raw ) const noexcept {
+					const size_t totalShift =
+						internal::Coordinates< reference >::stackSize( n ) +
+						stackShift;
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					std::cout << "\t\t\t shifting input pointer with "
+						<< internal::Coordinates< reference >::arraySize( n ) << " + "
+						<< stackShift << " = " << totalShift << "bytes \n";
+ #endif
+					raw += totalShift;
+				}
+
+		};
+#endif
+
+		/**
+		 * Retrieves the SPA buffers for the calling thread.
+		 *
+		 * \warning This function must be called from within an OpenMP parallel
+		 *          section.
+		 *
+		 * @param[out]    arr Where the bit-array may be located.
+		 * @param[out]    buf Where the stack may be located.
+		 * @param[out] valbuf Where the value buffer may be located.
+		 *
+		 * All above pointers are aligned on sizeof(int) bytes.
+		 *
+		 * @param[in] md Meta-data for global buffer management.
+		 * @param[in]  C The output matrix.
+		 *
+		 * One thread uses the buffers pre-allocated with the matrix \a C, thus
+		 * ensuring at least one thread may perform the #grb::mxm. Any remainder
+		 * threads can only help process the #grb::mxm if there is enough global
+		 * buffer memory available.
+		 *
+		 *
+		 * \note The global memory has size \f$ \Omega( \mathit{nz} ) \f$, which may
+		 *       be several factors (or even asymptotically greater than)
+		 *       \f$ \max\{ m, n \} \f$.
+		 *
+		 * \note In case the application stores multiple matrices, the global buffer
+		 *       may additionally be greater than the above note indicates if at least
+		 *       one of the other matrices is significantly (or asymptotically) larger
+		 *       than the one involved with the #grb::mxm.
+		 */
+		template<
+			typename OutputType,
+			typename RIT, typename CIT, typename NIT
+		>
+		void spa_ompPar_getBuffers(
+			char * &arr, char * &buf, OutputType * &valbuf,
+			const struct SPA_BufferMetaData< NIT, OutputType > &md,
+			Matrix< OutputType, reference, RIT, CIT, NIT > &C
+		) {
+#ifdef _H_GRB_REFERENCE_OMP_MATRIX
+			// other threads use the global buffer to create additional SPAs
+			{
+				const size_t t = config::OMP::current_thread_ID();
+ #ifndef NDEBUG
+				const size_t T = config::OMP::current_threads();
+				assert( t < T );
+ #endif
+				if( t > 0 ) {
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					#pragma omp critical
+					std::cout << "\t Thread " << t << " gets buffers from global buffer\n";
+ #endif
+					char * rawBuffer = md.getSPABuffers( t );
+					assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+					arr = rawBuffer;
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					#pragma omp critical
+ #endif
+					md.applyArrayShift( rawBuffer );
+					assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+					buf = rawBuffer;
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					#pragma omp critical
+ #endif
+					md.applyStackShift( rawBuffer );
+					assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+					assert( buf != arr );
+					valbuf = reinterpret_cast< OutputType * >(rawBuffer);
+					assert( static_cast< void * >(valbuf) != static_cast< void * >(buf) );
+				} else {
+ #ifdef _DEBUG_REFERENCE_MATRIX
+					#pragma omp critical
+					std::cout << "\t Thread " << t << " gets buffers from matrix storage\n";
+ #endif
+					// one thread uses the standard matrix buffer
+					internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+				}
+ #ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				{
+					std::cout << "\t Thread " << t << " has SPA array @ "
+						<< static_cast< void * >( arr ) << " and SPA stack @ "
+						<< static_cast< void * >( buf ) << " and SPA values @ "
+						<< static_cast< void * >( valbuf ) << "\n";
+				}
+ #endif
+			}
+#else
+ #ifdef _DEBUG_REFERENCE_MATRIX
+			std::cout << "\t Reference backend gets buffers from global buffer\n";
+ #endif
+			internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+			(void) md;
 #endif
+		}
 
 	} // end namespace grb::internal
 

From 958aa6823e862a8483eb1c0c0f055dfe5f05fbe3 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 27 Sep 2025 10:33:41 +0200
Subject: [PATCH 34/39] Metabugfix: use heap for dynamic arrays, not the stack

---
 tests/unit/matrixSet.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index 305f6f10d..da3d48f64 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -425,9 +425,16 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	//  - input will be an n by n matrix with each element on its diagonal and
 	//    superdiagonal set to its row index (meaning the entries on row 0 have
 	//    value 0).
-	size_t I_mask[ 2 * n - 1 ], J_mask[ 2 * n - 1 ];
-	int mask_vals [ 2 * n - 1 ];
-	int input_vals [ 2 * n - 1 ];
+	size_t * const I_mask = new size_t[ 2 * n - 1 ];
+	size_t * const J_mask = new size_t[ 2 * n - 1 ];
+	int * const mask_vals = new int[ 2 * n - 1 ];
+	int * const input_vals = new int[ 2 * n - 1 ];
+	if( I_mask == nullptr || J_mask == nullptr || mask_vals == nullptr ||
+		input_vals == nullptr
+	) {
+		std::cerr << "\t initialisation FAILED\n";
+		return;
+	}
 	for( size_t k = 0; k < n; ++k ) {
 		I_mask[ k ] = J_mask[ k ] = k;
 		mask_vals[ k ] = 1;
@@ -478,6 +485,10 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	// postpone materialisation of inputVoid since it relies on unmasked grb::set
 	// (which is itself unit-tested later)
 
+	delete [] I_mask;
+	delete [] J_mask;
+	delete [] mask_vals;
+	delete [] input_vals;
 	std::cout << "\t test initialisation complete\n";
 
 	// check grb::set for non-voids

From df2d41dece01aa467256d19db068834cc76982de Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 27 Sep 2025 10:34:13 +0200
Subject: [PATCH 35/39] make SPA buffer retrieval compatible with void matrices

---
 include/graphblas/reference/matrix.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 9f7fa96c7..08227a673 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -2486,7 +2486,7 @@ namespace grb {
 					bufferOffset = (std::max( m, n ) + 1) * sizeof( NIT );
 
 					// compute value buffer size
-					const size_t valBufSize = n * sizeof( ValueType );
+					const size_t valBufSize = n * utils::SizeOf< ValueType >::value;
 
  #ifdef _DEBUG_REFERENCE_MATRIX
 					std::cout << "\t\t\t\t bit-array size has byte-size " <<

From efe6f6a5f0a109927f36ab51bdffa8d7a8b14a52 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 27 Sep 2025 10:34:58 +0200
Subject: [PATCH 36/39] make SPA buffer retrieval compatible with void matrices

---
 include/graphblas/reference/io.hpp | 116 ++++++++++++++++++++++-------
 1 file changed, 88 insertions(+), 28 deletions(-)

diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp
index 6ca510598..b4af46eea 100644
--- a/include/graphblas/reference/io.hpp
+++ b/include/graphblas/reference/io.hpp
@@ -2097,37 +2097,89 @@ namespace grb {
 
 		// go for implementation, preliminaries:
 		size_t nzc = 0;
-		char * mask_arr = nullptr;
-		char * mask_buf = nullptr;
-		MaskType * mask_valbuf = nullptr;
 		const auto &A_raw = internal::getCRS( A );
 		const auto &mask_raw = internal::getCRS( M );
-		internal::Coordinates< reference > mask_coors;
-		internal::getMatrixBuffers( mask_arr, mask_buf, mask_valbuf, 1, M );
-		mask_coors.set( mask_arr, false, mask_buf, ncols );
-
 		// we now have one (guaranteed) SPA, which is mask_coors. We now are going to
 		// check how many more SPAs ideally we would like (for reference_omp), and
 		// then go about trying to get those. If, finally, we get just this one SPA,
 		// we will go into this mostly-sequential code (essentially, big-Omega nrows):
-		for( size_t i = 0; i < nrows; ++i ) {
-			mask_coors.clear();
-			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
-				const auto k_col = mask_raw.row_index[ k ];
-				if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
-					mask_coors.assign( k_col );
+#ifdef _H_GRB_REFERENCE_OMP_IO
+		const size_t nnz_based_nthreads = std::max( config::OMP::threads(),
+			grb::nnz( A ) / config::CACHE_LINE_SIZE::value() );
+		grb::internal::SPA_BufferMetaData< NIT1, OutputType > bufferMD( m, n,
+			nnz_based_nthreads );
+		const size_t nthreads = bufferMD.threads();
+		std::cout << "\t set( matrix, matrix, matrix ) will use " << nthreads
+			<< " threads\n";
+#else
+		const size_t nthreads = 1;
+#endif
+		if( nthreads == 1 ) {
+			char * arr = nullptr;
+			char * buf = nullptr;
+			OutputType * valbuf = nullptr;
+			internal::Coordinates< reference > coors;
+			internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+			coors.set( arr, false, buf, ncols );
+			for( size_t i = 0; i < nrows; ++i ) {
+				coors.clear();
+				for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+					const auto k_col = mask_raw.row_index[ k ];
+					if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+						coors.assign( k_col );
+					}
 				}
-			}
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
-			#pragma omp parallel for reduction( +: nzc ) \
-				schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+#ifdef _H_GRB_REFERENCE_OMP_IO
+				#pragma omp parallel for reduction( +: nzc ) \
+					schedule( dynamic, config::CACHE_LINE_SIZE::value() )
 #endif
-			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
-				const auto k_col = A_raw.row_index[ k ];
-				if( mask_coors.assigned( k_col ) ) {
-					(void) nzc++;
+				for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+					const auto k_col = A_raw.row_index[ k ];
+					if( coors.assigned( k_col ) ) {
+						(void) ++nzc;
+					}
+				}
+			}
+		} else {
+#ifdef _H_GRB_REFERENCE_OMP_IO
+			#pragma omp parallel num_threads( nthreads ) reduction( +: nzc )
+			{
+				// get thread-local buffers
+				size_t local_nz = 0;
+				char * arr = nullptr;
+				char * buf = nullptr;
+				OutputType * valbuf = nullptr;
+				internal::spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C );
+				internal::Coordinates< reference > coors;
+				coors.set_seq( arr, false, buf, n );
+				// follow dynamic schedule since we cannot predict sparsity structure
+				#pragma omp for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+				for( size_t i = 0; i < nrows; ++i ) {
+					coors.clear();
+					for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+						const auto k_col = mask_raw.row_index[ k ];
+						if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+							coors.assign( k_col );
+						}
+					}
+					for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+						const auto k_col = A_raw.row_index[ k ];
+						if( coors.assigned( k_col ) ) {
+							(void) ++local_nz;
+						}
+					}
 				}
+				nzc += local_nz;
 			}
+#else
+			const bool code_path_should_not_be_reached = false;
+			std::cerr << "\t logic error in grb::set( matrix, matrix, matrix ): "
+				<< "code path should not reach here. Please submit a bug report\n";
+			assert( code_path_should_not_be_reached );
+ #ifdef NDEBUG
+			(void) code_path_should_not_be_reached;
+ #endif
+#endif
 		}
 
 		// we now have a count. If we're in the resize phase that means we're done:
@@ -2158,7 +2210,7 @@ namespace grb {
 			getReferenceBuffer< typename config::NonzeroIndexType >( ncols + 1 );
 		CRS_raw.col_start[ 0 ] = 0;
 
-#ifdef _H_GRB_REFERENCE_OMP_BLAS3
+#ifdef _H_GRB_REFERENCE_OMP_IO
 		// TODO ALPify the below
 		#pragma omp parallel for simd
 #endif
@@ -2167,20 +2219,28 @@ namespace grb {
 			C_col_index[ j ] = 0;
 		}
 
+		//TODO: revise the below, WIP
+		char * arr = nullptr;
+		char * buf = nullptr;
+		MaskType * valbuf = nullptr;
+		internal::Coordinates< reference > coors;
+		internal::getMatrixBuffers( arr, buf, valbuf, 1, M );
+		coors.set( arr, false, buf, ncols );
+
 		// do counting sort, phase 1 -- also this loop should employ the same
 		// parallelisation strategy during counting
 		nzc = 0;
 		for( size_t i = 0; i < nrows; ++i ) {
-			mask_coors.clear();
+			coors.clear();
 			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = mask_raw.row_index[ k ];
 				if( utils::interpretMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
-					mask_coors.assign( k_col );
+					coors.assign( k_col );
 				}
 			}
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = A_raw.row_index[ k ];
-				if( mask_coors.assigned( k_col ) ) {
+				if( coors.assigned( k_col ) ) {
 					(void) nzc++;
 					(void) (CCS_raw.col_start[ k_col + 1 ])++;
 				}
@@ -2199,18 +2259,18 @@ namespace grb {
 		// the same (multiple-SPA) parallelisation strategy as above
 		nzc = 0;
 		for( size_t i = 0; i < nrows; ++i ) {
-			mask_coors.clear();
+			coors.clear();
 			for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = mask_raw.row_index[ k ];
 				if( utils::interpretMatrixMask< descr, MaskType >(
 					true, mask_raw.getValues(), k )
 				) {
-					mask_coors.assign( k_col );
+					coors.assign( k_col );
 				}
 			}
 			for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
 				const auto k_col = A_raw.row_index[ k ];
-				if( mask_coors.assigned( k_col ) ) {
+				if( coors.assigned( k_col ) ) {
 					constexpr int zero = 0;
 					CRS_raw.row_index[ nzc ] = k_col;
 					CRS_raw.setValue( nzc, A_raw.getValue( k, zero ) );

From 7b1692586ab38482915c7d2586b548215125a150 Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 27 Sep 2025 15:18:52 +0200
Subject: [PATCH 37/39] Always select at least one thread

---
 include/graphblas/reference/blas3.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp
index 2a37f51a8..3cffb588a 100644
--- a/include/graphblas/reference/blas3.hpp
+++ b/include/graphblas/reference/blas3.hpp
@@ -789,6 +789,7 @@ namespace grb {
 
 			// a basic analytic model based on the number of nonzeroes
 			size_t max_threads = config::OMP::threads();
+			assert( max_threads > 0 );
 			{
 				size_t target_nnz = 0;
 				if( phase == EXECUTE ) {
@@ -800,7 +801,7 @@ namespace grb {
 				const size_t nnz_based_nthreads =
 					target_nnz / config::CACHE_LINE_SIZE::value();
 				if( nnz_based_nthreads < max_threads ) {
-					max_threads = nnz_based_nthreads;
+					max_threads = nnz_based_nthreads > 0 ? nnz_based_nthreads : 1;
 				}
 #ifdef _DEBUG_REFERENCE_BLAS3
 				std::cout << "\t simple analytic model selects max threads of "

From 6e2e38186efeb3da8acf410e75c2e50e33da40fc Mon Sep 17 00:00:00 2001
From: Albert-Jan Yzelman <albertjan.yzelman@huawei.com>
Date: Sat, 27 Sep 2025 15:19:27 +0200
Subject: [PATCH 38/39] Bugfix error in expected capacity calculation (metabug
 - did not trigger with NDEBUG)

---
 include/graphblas/reference/matrix.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index 08227a673..84873a22c 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1283,7 +1283,9 @@ namespace grb {
 					"We hit here a configuration border case which the implementation does not "
 					"handle at present. Please submit a bug report."
 				);
-				// compute and return
+				// compute and return the maximum of
+				//  - row- and column-wise buffers. The added factor two is for padding
+				//  - minimal buffer requirement for parallel buildMatrixUnique
 				return std::max( (m + n + 2) * globalBufferUnitSize,
 #ifdef _H_GRB_REFERENCE_OMP_MATRIX
 					config::OMP::threads() * config::CACHE_LINE_SIZE::value() *
@@ -2533,6 +2535,7 @@ namespace grb {
 					#pragma omp critical
 					std::cout << "\t\t\t free buffer size: " << freeBufferSize
 						<< ", (padded) SPA size: " << paddedSPASize
+						<< ", bufferOffset: " << bufferOffset
 						<< " -> supported #threads: " << nthreads << ". "
 						<< " The shifts for the bit-array and the stack are " << arrayShift
 						<< ", respectively, " << stackShift << "."
@@ -2564,9 +2567,10 @@ namespace grb {
 				 */
 				char * getSPABuffers( size_t t ) const noexcept {
 					assert( t > 0 );
+					assert( nthreads > 1 );
 					(void) --t;
 					char * raw = internal::template getReferenceBuffer< char >(
-						bufferOffset + nthreads * paddedSPASize );
+						bufferOffset + (nthreads - 1) * paddedSPASize );
 					assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
 					raw += bufferOffset;
 					assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );

From a6ae18b896f447836584c0051ba01f0261be76cb Mon Sep 17 00:00:00 2001
From: "Albert-Jan N. Yzelman" <albertjan.yzelman@huawei.com>
Date: Tue, 14 Oct 2025 14:55:55 +0200
Subject: [PATCH 39/39] fix typo

---
 tests/unit/id.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/id.cpp b/tests/unit/id.cpp
index a7a7cdb0b..8711369ad 100644
--- a/tests/unit/id.cpp
+++ b/tests/unit/id.cpp
@@ -216,7 +216,7 @@ void grb_program2( const struct input &in, struct output &out ) {
 	}
 
 	/**
-	 * Test for move assignement id cleanup.
+	 * Test for move assignment id cleanup.
 	 *
 	 * Creating and performing move assignment on multiple new objects and check
 	 * for collisions.